Re: [Qemu-block] [Qemu-devel] [PATCH v9 0/8] Optimize VMDK I/O by allocating multiple clusters
On Fri, Oct 20, 2017 at 11:58 Fam Zheng wrote: > On Mon, 10/09 22:12, Fam Zheng wrote: > > On Mon, 10/09 18:29, Ashijeet Acharya wrote: > > > Optimization test results: > > > > > > This patch series improves 128 KB sequential write performance to an > > > empty VMDK file by 54% > > > > > > Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f > > > vmdk test.vmdk > > > > > > Changes in v9: > > > - rebase the series > > > > Thanks, looks good to me, applied: > > > > https://github.com/famz/qemu/tree/staging > > Ashijeet: I've been testing my branch and it seems installing > Fedora/CentOS to a > VMDK image is broken with your patches applied. Both guest and QEMU are > responsive, but the installing of packages stops making any progress at > some > point: > > Installing rootfiles.noarch (317/318) > Installing langpacks-en.noarch (318/318) > Performing post-installation setup tasks > Configuring fedora-release.noarch > Configuring filesystem.x86_64 > Configuring GeoIP-GeoLite-data.noarch > Configuring python3.x86_64 > Configuring fedora-logos.x86_64 > Configuring kernel-core.x86_64 > > # hang here > > Can you reproduce this on your machine? > > My command line is something like this: > > qemu-system-x86_64 -enable-kvm -cpu host -m 1G -qmp > unix:/home/fam/.q/qemu-8DOC9EF4/qmp,server,nowait -name 8DOC9EF4 -netdev > user,id=vnet,hostfwd=:0.0.0.0:10022-:22 -device > virtio-net-pci,netdev=vnet -drive > file=/var/tmp/test2.vmdk,if=none,id=drive-1,cache=none,aio=native -device > virtio-blk-pci,drive=drive-1 -cdrom /stor/iso/CentOS-6.9-x86_64-minimal.iso > -pidfile /home/fam/.q/qemu-8DOC9EF4/pid > > qemu.git master doesn't have this problem. So I'll drop this series from > the > pull request until it is resolved. Fam: Alright, I will look into this but I cannot give you a deadline unfortunately. I will try my best to resolve this as soon as possible, though. Ashijeet > > > Fam >
[Qemu-block] [PATCH v9 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 201 --- 1 file changed, 191 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 11bc0f09c7..d5dfd21abe 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1259,6 +1260,183 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *alloc_clusters_counter) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*alloc_clusters_counter += nb_clusters; + +/* we need to use MIN() for basically 3 cases that arise : + * 1. alloc very first cluster : here skip_start_bytes >= 0 and + **bytes <= cluster_size. + * 2. alloc middle clusters : here *bytes is a perfect multiple of + *cluster_size and skip_start_bytes is 0. + * 3. alloc very last cluster : here *bytes <= cluster_size and + *skip_start_bytes is 0 + */ +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on
[Qemu-block] [PATCH v9 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index d5dfd21abe..cbeffb1552 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1133,34 +1135,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += extent->cluster_sectors; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if 
(bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += extent->cluster_sectors; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1277,9 +1332,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *alloc_clusters_counter) { int l1_index, l2_offset, l2_index; @@ -1288,6 +1344,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1348,13 +1405,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data-&g
[Qemu-block] [PATCH v9 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index d4ea92bdcf..07707779a0 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1160,6 +1160,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* 
+ * get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1189,66 +1288,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -
[Qemu-block] [PATCH v9 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 07707779a0..11bc0f09c7 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1641,6 +1641,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1654,9 +1655,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.13.5
[Qemu-block] [PATCH v9 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index cbeffb1552..497e30f6df 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1511,25 +1511,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1554,31 +1545,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1621,9 +1590,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1814,13 +1781,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2565,9 +2533,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, ret = -EINVAL; break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&clu
[Qemu-block] [PATCH v9 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 109c589b43..d4ea92bdcf 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1161,7 +1161,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1180,14 +1180,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1321,9 +1321,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1514,8 +1514,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, 
&cluster_offset, 0, 0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1601,10 +1601,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1613,8 +1613,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2244,9 +2244,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, ret = -EINVAL; break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v9 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index e86ca39ff2..109c589b43 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1045,8 +1045,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1054,13 +1054,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1261,7 +1266,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.13.5
[Qemu-block] [PATCH v9 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html v6 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00947.html v7 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg06600.html v8 - http://lists.nongnu.org/archive/html/qemu-devel/2017-07/msg08623.html This series helps to optimize the I/O performance of VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patch 2 & 3 perform a simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v9: - rebase the series Changes in v8: - fix minor variable naming issue in patch 6 Changes in v7: - comment the use of MIN() in calculating skip_end_bytes - use extent->cluster_sectors instead of 128 - place check for m_data != NULL - use g_new0(VmdkMetaData, 1) instead of g_malloc0(sizeof(*m_data)) Changes in v6: - rename total_alloc_clusters as alloc_clusters_counter (fam) Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of atmost 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 538 +-- 1 file changed, 416 insertions(+), 122 deletions(-) -- 2.13.5
[Qemu-block] [PATCH v9 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index c665bcc977..e86ca39ff2 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + /* Return -ve errno, or 0 on success and write CID into *pcid. */ static int vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid) { @@ -1283,18 +1295,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.13.5
Re: [Qemu-block] [Qemu-devel] [PATCH v8 0/8] Optimize VMDK I/O by allocating multiple clusters
On Wed, Oct 4, 2017 at 1:58 PM, Fam Zheng wrote: > On Wed, 10/04 13:47, Ashijeet Acharya wrote: > > Fam: Ping? > > Hi Ashijeet, looks like this patch doesn't apply to current master, could > you > rebase and post another version? > Hello Fam, I will try to do it over the weekend then and you can merge it next week hopefully. Ashijeet > > Fam >
Re: [Qemu-block] [PATCH v8 0/8] Optimize VMDK I/O by allocating multiple clusters
On Thu, Aug 10, 2017 at 11:13 PM, Stefan Hajnoczi wrote: > On Thu, Aug 10, 2017 at 9:18 AM, Ashijeet Acharya > wrote: > > On Thu, Aug 10, 2017 at 1:41 PM, Stefan Hajnoczi > wrote: > >> > >> On Thu, Jul 27, 2017 at 3:33 PM, Ashijeet Acharya > >> wrote: > >> > Previously posted series patches: > >> > v1 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html > >> > v2 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html > >> > v3 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html > >> > v4 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html > >> > v5 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html > >> > v6 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00947.html > >> > v7 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg06600.html > >> > > >> > This series helps to optimize the I/O performance of VMDK driver. > >> > > >> > Patch 1 helps us to move vmdk_find_offset_in_cluster. > >> > > >> > Patch 2 & 3 perform a simple function re-naming tasks. > >> > > >> > Patch 4 is used to factor out metadata loading code and implement it > in > >> > separate > >> > functions. This will help us to avoid code duplication in future > patches > >> > of this > >> > series. > >> > > >> > Patch 5 helps to set the upper limit of the bytes handled in one > cycle. > >> > > >> > Patch 6 adds new functions to help us allocate multiple clusters > >> > according to > >> > the size requested, perform COW if required and return the offset of > the > >> > first > >> > newly allocated cluster. > >> > > >> > Patch 7 changes the metadata update code to update the L2 tables for > >> > multiple > >> > clusters at once. 
> >> > > >> > Patch 8 helps us to finally change vmdk_get_cluster_offset() to find > >> > cluster > >> > offset only as cluster allocation task is now handled by > >> > vmdk_alloc_clusters() > >> > > >> > Optimization test results: > >> > > >> > This patch series improves 128 KB sequential write performance to an > >> > empty VMDK file by 54% > >> > > >> > Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f > >> > vmdk test.vmdk > >> > > >> > Changes in v8: > >> > - fix minor variable naming issue in patch 6 > >> > >> Fam: Ping? > >> > >> Ashijeet: Feel free to send a ping reply if no one reviews your > >> patches within a few days. > > > > > > Hi Stefan, > > > > I had a chat with Fam on #qemu-block before submitting this series and he > > said he will be merging it soon when the freeze is over (I am not sure > if it > > is yet) since all the patches are already reviewed :-) > > Good to hear :). > > QEMU 2.10 is scheduled to be released on 22nd or 29th of August. > > Stefan > Fam: Ping? Ashijeet
Re: [Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
On Tue, Sep 5, 2017 at 4:28 PM, Stefan Hajnoczi wrote: > On Wed, Aug 30, 2017 at 06:32:52PM +0530, Ashijeet Acharya wrote: > > On Tue, Aug 29, 2017 at 8:55 PM, Stefan Hajnoczi > wrote: > > > > > On Sun, Aug 20, 2017 at 1:47 PM, Ashijeet Acharya > > > wrote: > > > > On Fri, May 5, 2017 at 7:29 PM, Stefan Hajnoczi > > > wrote: > > > >> > > > >> On Thu, Apr 27, 2017 at 01:36:30PM +0530, Ashijeet Acharya wrote: > > > >> > This series helps to provide chunk size independence for DMG > driver to > > > >> > prevent > > > >> > denial-of-service in cases where untrusted files are being > accessed by > > > >> > the user. > > > >> > > > >> The core of the chunk size dependence problem are these lines: > > > >> > > > >> s->compressed_chunk = qemu_try_blockalign(bs->file->bs, > > > >> > ds.max_compressed_size + > > > 1); > > > >> s->uncompressed_chunk = qemu_try_blockalign(bs->file->bs, > > > >> 512 * > > > >> ds.max_sectors_per_chunk); > > > >> > > > >> The refactoring needs to eliminate these buffers because their size > is > > > >> controlled by the untrusted input file. > > > > > > > > > > > > Oh okay, I understand now. But wouldn't I still need to allocate some > > > memory > > > > for these buffers to be able to use them for the compressed chunks > case > > > you > > > > mentioned below. Instead of letting the DMG images control the size > of > > > these > > > > buffers, maybe I can hard-code the size of these buffers instead? > > > > > > > >> > > > >> > > > >> After applying your patches these lines remain unchanged and we > still > > > >> cannot use input files that have a 250 MB chunk size, for example. > So > > > >> I'm not sure how this series is supposed to work. > > > >> > > > >> Here is the approach I would take: > > > >> > > > >> In order to achieve this dmg_read_chunk() needs to be scrapped. It > is > > > >> designed to read a full chunk. The new model does not read full > chunks > > > >> anymore. 
> > > >> > > > >> Uncompressed reads or zeroes should operate directly on qiov, not > > > >> s->uncompressed_chunk. This code will be dropped: > > > >> > > > >> data = s->uncompressed_chunk + sector_offset_in_chunk * 512; > > > >> qemu_iovec_from_buf(qiov, i * 512, data, 512); > > > > > > > > > > > > I have never worked with qiov before, are there any places where I > can > > > refer > > > > to inside other drivers to get the idea of how to use it directly (I > am > > > > searching by myself in the meantime...)? > > > > > > A QEMUIOVector is a utility type for struct iovec iov[] processing. > > > See util/iov.c. This is called "vectored" or "scatter-gather" I/O. > > > > > > Instead of transferring data to/from a single tuple, > > > they take an array []. For example, the buffer "Hello > > > world" could be split into two elements: > > > [{"Hello ", strlen("Hello ")}, > > > {"world", strlen("world")}] > > > > > > Vectored I/O is often used because it eliminates memory copies. Say > > > you have a network packet header struct and also a data payload array. > > > Traditionally you would have to allocate a new buffer large enough for > > > both header and payload, copy the header and payload into the buffer, > > > and finally give this temporary buffer to the I/O function. This is > > > inefficient. With vectored I/O you can create a vector with two > > > elements, the header and the payload, and the I/O function can process > > > them without needing a temporary buffer copy. > > > > > > > Thanks for the detailed explanation, I think I understood the concept now > > and how to use qiov efficiently. > > Correct me if I am wrong here. In order to use qiov directly for > > uncompressed chunks: > > > > 1. Declare a new local_qiov inside dmg_co_preadv (let's say) > > No, the operation should use qiov directly if the chunk
Re: [Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
On Tue, Aug 29, 2017 at 8:55 PM, Stefan Hajnoczi wrote: > On Sun, Aug 20, 2017 at 1:47 PM, Ashijeet Acharya > wrote: > > On Fri, May 5, 2017 at 7:29 PM, Stefan Hajnoczi > wrote: > >> > >> On Thu, Apr 27, 2017 at 01:36:30PM +0530, Ashijeet Acharya wrote: > >> > This series helps to provide chunk size independence for DMG driver to > >> > prevent > >> > denial-of-service in cases where untrusted files are being accessed by > >> > the user. > >> > >> The core of the chunk size dependence problem are these lines: > >> > >> s->compressed_chunk = qemu_try_blockalign(bs->file->bs, > >> ds.max_compressed_size + > 1); > >> s->uncompressed_chunk = qemu_try_blockalign(bs->file->bs, > >> 512 * > >> ds.max_sectors_per_chunk); > >> > >> The refactoring needs to eliminate these buffers because their size is > >> controlled by the untrusted input file. > > > > > > Oh okay, I understand now. But wouldn't I still need to allocate some > memory > > for these buffers to be able to use them for the compressed chunks case > you > > mentioned below. Instead of letting the DMG images control the size of > these > > buffers, maybe I can hard-code the size of these buffers instead? > > > >> > >> > >> After applying your patches these lines remain unchanged and we still > >> cannot use input files that have a 250 MB chunk size, for example. So > >> I'm not sure how this series is supposed to work. > >> > >> Here is the approach I would take: > >> > >> In order to achieve this dmg_read_chunk() needs to be scrapped. It is > >> designed to read a full chunk. The new model does not read full chunks > >> anymore. > >> > >> Uncompressed reads or zeroes should operate directly on qiov, not > >> s->uncompressed_chunk. 
This code will be dropped: > >> > >> data = s->uncompressed_chunk + sector_offset_in_chunk * 512; > >> qemu_iovec_from_buf(qiov, i * 512, data, 512); > > > > > > I have never worked with qiov before, are there any places where I can > refer > > to inside other drivers to get the idea of how to use it directly (I am > > searching by myself in the meantime...)? > > A QEMUIOVector is a utility type for struct iovec iov[] processing. > See util/iov.c. This is called "vectored" or "scatter-gather" I/O. > > Instead of transferring data to/from a single tuple, > they take an array []. For example, the buffer "Hello > world" could be split into two elements: > [{"Hello ", strlen("Hello ")}, > {"world", strlen("world")}] > > Vectored I/O is often used because it eliminates memory copies. Say > you have a network packet header struct and also a data payload array. > Traditionally you would have to allocate a new buffer large enough for > both header and payload, copy the header and payload into the buffer, > and finally give this temporary buffer to the I/O function. This is > inefficient. With vectored I/O you can create a vector with two > elements, the header and the payload, and the I/O function can process > them without needing a temporary buffer copy. > Thanks for the detailed explanation, I think I understood the concept now and how to use qiov efficiently. Correct me if I am wrong here. In order to use qiov directly for uncompressed chunks: 1. Declare a new local_qiov inside dmg_co_preadv (let's say) 2. Initialize it with qemu_iovec_init() 3. Reset it with qemu_iovec_reset() (this is because we will perform this action in a loop and thus need to reset it before every loop?) 4. Declare a buffer "uncompressed_buf" and allocate it with qemu_try_blockalign() 5. Add this buffer to our local_qiov using qemu_iovec_add() 6. Read data from file directly into local_qiov using bdrv_co_preadv() 7. 
On success concatenate it with the qiov passed into the main dmg_co_preadv() function. I think this method only works for uncompressed chunks. For the compressed ones, I believe we will still need to do it in the existing way, i.e. read chunk from file -> decompress into output buffer -> use qemu_iovec_from_buf() because we cannot read directly since data is in compressed form. Makes sense? > > I got clearly what you are trying > > to say, but don't know how to implement it. I think, don't we already do > > that for the zeroed chunks in DMG in dmg_co_preadv()? > > Yes, dmg_co_preadv() directl
Re: [Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
On Fri, May 5, 2017 at 7:29 PM, Stefan Hajnoczi wrote: > On Thu, Apr 27, 2017 at 01:36:30PM +0530, Ashijeet Acharya wrote: > > This series helps to provide chunk size independence for DMG driver to > prevent > > denial-of-service in cases where untrusted files are being accessed by > the user. > > The core of the chunk size dependence problem are these lines: > > s->compressed_chunk = qemu_try_blockalign(bs->file->bs, > ds.max_compressed_size + 1); > s->uncompressed_chunk = qemu_try_blockalign(bs->file->bs, > 512 * > ds.max_sectors_per_chunk); > > The refactoring needs to eliminate these buffers because their size is > controlled by the untrusted input file. > Oh okay, I understand now. But wouldn't I still need to allocate some memory for these buffers to be able to use them for the compressed chunks case you mentioned below. Instead of letting the DMG images control the size of these buffers, maybe I can hard-code the size of these buffers instead? > > After applying your patches these lines remain unchanged and we still > cannot use input files that have a 250 MB chunk size, for example. So > I'm not sure how this series is supposed to work. > > Here is the approach I would take: > > In order to achieve this dmg_read_chunk() needs to be scrapped. It is > designed to read a full chunk. The new model does not read full chunks > anymore. > > Uncompressed reads or zeroes should operate directly on qiov, not > s->uncompressed_chunk. This code will be dropped: > > data = s->uncompressed_chunk + sector_offset_in_chunk * 512; > qemu_iovec_from_buf(qiov, i * 512, data, 512); > I have never worked with qiov before, are there any places where I can refer to inside other drivers to get the idea of how to use it directly (I am searching by myself in the meantime...)? I got clearly what you are trying to say, but don't know how to implement it. I think, don't we already do that for the zeroed chunks in DMG in dmg_co_preadv()? > > Compressed reads still need buffers. 
I suggest the following buffers: > > 1. compressed_buf - compressed data is read into this buffer from file > 2. uncompressed_buf - a place to discard decompressed data while > simulating a seek operation > Yes, these are the buffers whose size I can hard-code as discussed above? You can suggest the preferred size to me. > Data is read from compressed chunks by reading a reasonable amount > (64k?) into compressed_buf. If the user wishes to read at an offset > into this chunk then a loop decompresses data we are seeking over into > uncompressed_buf (and refills compressed_buf if it becomes empty) until > the desired offset is reached. Then decompression can continue > directly into the user's qiov and uncompressed_buf isn't used to > decompress the data requested by the user. > Yes, this series does exactly that but keeps using the "uncompressed" buffer once we reach the desired offset. Once I understand how to use qiov directly, we can do this. Also, Kevin did suggest to me (as I remember vaguely) that in reality we never actually get the read request at a particular offset because DMG driver is generally used with "qemu-img convert", which means all read requests are from the top. > > Sequential compressed reads can be optimized by keeping the compression > state across read calls. That means the zlib/bz2 state plus > compressed_buf and the current offset. That way we don't need to > re-seek into the current compressed chunk to handle sequential reads. > I guess that's what I implemented with this series so now I can reuse the "caching access point" part in the next series to implement this optimization. Thanks Ashijeet
Re: [Qemu-block] [PATCH v8 0/8] Optimize VMDK I/O by allocating multiple clusters
On Thu, Aug 10, 2017 at 1:41 PM, Stefan Hajnoczi wrote: > On Thu, Jul 27, 2017 at 3:33 PM, Ashijeet Acharya > wrote: > > Previously posted series patches: > > v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 03/msg02044.html > > v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 03/msg05080.html > > v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 04/msg00074.html > > v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 04/msg03851.html > > v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 06/msg00929.html > > v6 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 06/msg00947.html > > v7 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 06/msg06600.html > > > > This series helps to optimize the I/O performance of VMDK driver. > > > > Patch 1 helps us to move vmdk_find_offset_in_cluster. > > > > Patch 2 & 3 perform a simple function re-naming tasks. > > > > Patch 4 is used to factor out metadata loading code and implement it in > separate > > functions. This will help us to avoid code duplication in future patches > of this > > series. > > > > Patch 5 helps to set the upper limit of the bytes handled in one cycle. > > > > Patch 6 adds new functions to help us allocate multiple clusters > according to > > the size requested, perform COW if required and return the offset of the > first > > newly allocated cluster. > > > > Patch 7 changes the metadata update code to update the L2 tables for > multiple > > clusters at once. 
> > > > Patch 8 helps us to finally change vmdk_get_cluster_offset() to find > cluster > > offset only as cluster allocation task is now handled by > vmdk_alloc_clusters() > > > > Optimization test results: > > > > This patch series improves 128 KB sequential write performance to an > > empty VMDK file by 54% > > > > Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f > > vmdk test.vmdk > > > > Changes in v8: > > - fix minor variable naming issue in patch 6 > > Fam: Ping? > > Ashijeet: Feel free to send a ping reply if no one reviews your > patches within a few days. > Hi Stefan, I had a chat with Fam on #qemu-block before submitting this series and he said he will be merging it soon when the freeze is over (I am not sure if it is yet) since all the patches are already reviewed :-) Ashijeet
[Qemu-block] [PATCH v8 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5f27dbb..4a59ca4 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += extent->cluster_sectors; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if 
(bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += extent->cluster_sectors; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1260,9 +1315,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *alloc_clusters_counter) { int l1_index, l2_offset, l2_index; @@ -1271,6 +1327,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1331,13 +1388,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1;
[Qemu-block] [PATCH v8 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 201 --- 1 file changed, 191 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index fe2046b..5f27dbb 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,183 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *alloc_clusters_counter) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*alloc_clusters_counter += nb_clusters; + +/* we need to use MIN() for basically 3 cases that arise : + * 1. alloc very first cluster : here skip_start_bytes >= 0 and + **bytes <= cluster_size. + * 2. alloc middle clusters : here *bytes is a perfect multiple of + *cluster_size and skip_start_bytes is 0. + * 3. alloc very last cluster : here *bytes <= cluster_size and + *skip_start_bytes is 0 + */ +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on
[Qemu-block] [PATCH v8 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 4a59ca4..a84b26c 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1494,25 +1494,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1537,31 +1528,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1604,9 +1573,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1797,13 +1764,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2550,9 +2518,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&clu
[Qemu-block] [PATCH v8 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5647f53..fe2046b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v8 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 
0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v8 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..5647f53 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * 
get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0xffffffff) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -
[Qemu-block] [PATCH v8 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v8 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
[Qemu-block] [PATCH v8 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html v6 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00947.html v7 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg06600.html This series helps to optimize the I/O performance of the VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patch 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v8: - fix minor variable naming issue in patch 6 Changes in v7: - comment the use of MIN() in calculating skip_end_bytes - use extent->cluster_sectors instead of 128 - place check for m_data != NULL - use g_new0(VmdkMetaData, 1) instead of g_malloc0(sizeof(*m_data)) Changes in v6: - rename total_alloc_clusters as alloc_clusters_counter (fam) Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of atmost 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 538 +-- 1 file changed, 416 insertions(+), 122 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v7 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 60b8adc..d41fde9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1493,25 +1493,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1536,31 +1527,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1603,9 +1572,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1796,13 +1763,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2549,9 +2517,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&clu
[Qemu-block] [PATCH v7 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 277db16..60b8adc 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += extent->cluster_sectors; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, 
-((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += extent->cluster_sectors; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1260,9 +1315,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *total_alloc_clusters) { int l1_index, l2_offset, l2_index; @@ -1271,6 +1327,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1331,13 +1388,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1; -m_data-
[Qemu-block] [PATCH v7 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 200 --- 1 file changed, 190 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index fe2046b..277db16 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,182 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *total_alloc_clusters) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*total_alloc_clusters += nb_clusters; + +/* we need to use MIN() for basically 3 cases that arise : + * 1. alloc very first cluster : here skip_start_bytes >= 0 and + **bytes <= cluster_size. + * 2. alloc middle clusters : here *bytes is a perfect multiple of + *cluster_size and skip_start_bytes is 0. + * 3. alloc very last cluster : here *bytes <= cluster_size and + *skip_start_bytes is 0 + */ +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on the virtual disk, find the clust
[Qemu-block] [PATCH v7 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5647f53..fe2046b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v7 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 
0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v7 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v7 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
[Qemu-block] [PATCH v7 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..5647f53 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * 
get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -
[Qemu-block] [PATCH v7 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html v6 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00947.html This series helps to optimize the I/O performance of the VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patches 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v7: - comment the use of MIN() in calculating skip_end_bytes - use extent->cluster_sectors instead of 128 - place check for m_data != NULL - use g_new0(VmdkMetaData, 1) instead of g_malloc0(sizeof(*m_data)) Changes in v6: - rename total_alloc_clusters as alloc_clusters_counter (fam) Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of at most 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 537 +-- 1 file changed, 415 insertions(+), 122 deletions(-) -- 2.6.2
Re: [Qemu-block] [PATCH v6 7/8] vmdk: Update metadata for multiple clusters
On Tue, Jun 27, 2017 at 1:34 PM, Fam Zheng wrote: > On Mon, 06/05 13:22, Ashijeet Acharya wrote: >> @@ -1876,6 +1942,13 @@ static int vmdk_pwritev(BlockDriverState *bs, >> uint64_t offset, >> offset += n_bytes; >> bytes_done += n_bytes; >> >> +while (m_data->next != NULL) { > > If you do > >while (m_data) { > >> +VmdkMetaData *next; >> +next = m_data->next; >> +g_free(m_data); >> +m_data = next; >> +} >> + >> /* update CID on the first write every time the virtual disk is >> * opened */ >> if (!s->cid_updated) { >> @@ -1886,6 +1959,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t >> offset, >> s->cid_updated = true; >> } >> } >> +g_free(m_data); > > then you can remove this line. As I explained last time, I can't do this because I am reusing the first allocated m_data. If I am to do it the way you suggest, I will have to move the allocation of first m_data (m_data = g_new0(VmdkMetaData, 1)) inside the outer while loop, otherwise things will segfault. Ashijeet
Re: [Qemu-block] [PATCH v6 6/8] vmdk: New functions to assist allocating multiple clusters
On Tue, Jun 27, 2017 at 1:32 PM, Fam Zheng wrote: > On Mon, 06/05 13:22, Ashijeet Acharya wrote: >> +/* >> + * vmdk_handle_alloc >> + * >> + * Allocate new clusters for an area that either is yet unallocated or >> needs a >> + * copy on write. If *cluster_offset is non_zero, clusters are only >> allocated if >> + * the new allocation can match the specified host offset. > > I don't think this matches the function body, the passed in *cluster_offset > value is ignored. > >> + * >> + * Returns: >> + * VMDK_OK: if new clusters were allocated, *bytes may be decreased >> if >> + * the new allocation doesn't cover all of the requested >> area. >> + * *cluster_offset is updated to contain the offset of the >> + * first newly allocated cluster. >> + * >> + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is >> left >> + * unchanged. >> + * >> + * VMDK_ERROR:in error cases >> + */ >> +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, >> + uint64_t offset, uint64_t *cluster_offset, >> + int64_t *bytes, VmdkMetaData *m_data, >> + bool allocate, uint32_t >> *alloc_clusters_counter) >> +{ >> +int l1_index, l2_offset, l2_index; >> +uint32_t *l2_table; >> +uint32_t cluster_sector; >> +uint32_t nb_clusters; >> +bool zeroed = false; >> +uint64_t skip_start_bytes, skip_end_bytes; >> +int ret; >> + >> +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, >> +&l2_index, &l2_table); >> +if (ret < 0) { >> +return ret; >> +} >> + >> +cluster_sector = le32_to_cpu(l2_table[l2_index]); >> + >> +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); >> +/* Calculate the number of clusters to look for. Here we truncate the >> last >> + * cluster, i.e. 1 less than the actual value calculated as we may need >> to >> + * perform COW for the last one. 
*/ >> +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, >> + extent->cluster_sectors << BDRV_SECTOR_BITS) >> - 1; >> + >> +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); >> +assert(nb_clusters <= INT_MAX); >> + >> +/* update bytes according to final nb_clusters value */ >> +if (nb_clusters != 0) { >> +*bytes = ((nb_clusters * extent->cluster_sectors) << >> BDRV_SECTOR_BITS) >> + - skip_start_bytes; >> +} else { >> +nb_clusters = 1; >> +} >> +*alloc_clusters_counter += nb_clusters; >> +skip_end_bytes = skip_start_bytes + MIN(*bytes, >> + extent->cluster_sectors * BDRV_SECTOR_SIZE >> +- skip_start_bytes); > > I don't understand the MIN part, shouldn't skip_end_bytes simply be > skip_start_bytes + *bytes? > >> + >> +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { >> +zeroed = true; >> +} >> + >> +if (!cluster_sector || zeroed) { >> +if (!allocate) { >> +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; >> +} >> + >> +cluster_sector = extent->next_cluster_sector; >> +extent->next_cluster_sector += extent->cluster_sectors >> +* nb_clusters; >> + >> +ret = vmdk_perform_cow(bs, extent, cluster_sector * >> BDRV_SECTOR_SIZE, >> + offset, skip_start_bytes, >> + skip_end_bytes); >> +if (ret < 0) { >> +return ret; >> +} >> +if (m_data) { >> +m_data->valid = 1; >> +m_data->l1_index = l1_index; >> +m_data->l2_index = l2_index; >> +m_data->l2_offset = l2_offset; >> +m_data->l2_cache_entry = &l2_table[l2_index]; >> +m_data->nb_clusters = nb_clusters; >> +} >> +} >> +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; >> +return VMDK_OK; >> +} >> + >> +/* >> + * vmdk_alloc_clusters >> + * >> + * For a given offs
[Qemu-block] [PATCH v6 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 9fa2414..accf1c3 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1485,25 +1485,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1528,31 +1519,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1595,9 +1564,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1788,13 +1755,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2541,9 +2509,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&cluster_offset, 0, 0);
[Qemu-block] [PATCH v6 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index b671dc9..9fa2414 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += 128; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) 
-+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += 128; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1261,9 +1316,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *alloc_clusters_counter) { int l1_index, l2_offset, l2_index; @@ -1272,6 +1328,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1323,13 +1380,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_dat
[Qemu-block] [PATCH v6 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 192 +++ 1 file changed, 182 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index fe2046b..b671dc9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. If *cluster_offset is non_zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *alloc_clusters_counter) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*alloc_clusters_counter += nb_clusters; +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on the virtual disk, find the cluster offset in vmdk + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster is newly allocated, m_data->nb_clusters is set to the number + * of contiguous clusters that have been allocated. In this case, the other + * fields of m_data are valid and contain in
[Qemu-block] [PATCH v6 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5647f53..fe2046b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v6 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..5647f53 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * 
get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -
[Qemu-block] [PATCH v6 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v6 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 
0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v6 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html This series helps to optimize the I/O performance of VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patch 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v6: - rename total_alloc_clusters as alloc_clusters_counter (fam) Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of at most 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 529 +-- 1 file changed, 407 insertions(+), 122 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v6 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
Re: [Qemu-block] [PATCH v5 6/8] vmdk: New functions to assist allocating multiple clusters
On Mon, Jun 5, 2017 at 11:23 AM, Ashijeet Acharya wrote: > Introduce two new helper functions handle_alloc() and > vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple > clusters at once starting from a given offset on disk and performs COW > if necessary for first and last allocated clusters. > vmdk_alloc_cluster_offset() helps to return the offset of the first of > the many newly allocated clusters. Also, provide proper documentation > for both. > > Signed-off-by: Ashijeet Acharya > --- > block/vmdk.c | 192 > +++ > 1 file changed, 182 insertions(+), 10 deletions(-) > > diff --git a/block/vmdk.c b/block/vmdk.c > index fe2046b..b671dc9 100644 > --- a/block/vmdk.c > +++ b/block/vmdk.c > @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { > unsigned int l2_offset; > int valid; > uint32_t *l2_cache_entry; > +uint32_t nb_clusters; > } VmdkMetaData; > > typedef struct VmdkGrainMarker { > @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, > uint64_t offset, > return VMDK_OK; > } > > +/* > + * vmdk_handle_alloc > + * > + * Allocate new clusters for an area that either is yet unallocated or needs > a > + * copy on write. If *cluster_offset is non_zero, clusters are only > allocated if > + * the new allocation can match the specified host offset. > + * > + * Returns: > + * VMDK_OK: if new clusters were allocated, *bytes may be decreased > if > + * the new allocation doesn't cover all of the requested > area. > + * *cluster_offset is updated to contain the offset of the > + * first newly allocated cluster. > + * > + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is > left > + * unchanged. 
> + * > + * VMDK_ERROR:in error cases > + */ > +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, > + uint64_t offset, uint64_t *cluster_offset, > + int64_t *bytes, VmdkMetaData *m_data, > + bool allocate, uint32_t *total_alloc_clusters) > +{ > +int l1_index, l2_offset, l2_index; > +uint32_t *l2_table; > +uint32_t cluster_sector; > +uint32_t nb_clusters; > +bool zeroed = false; > +uint64_t skip_start_bytes, skip_end_bytes; > +int ret; > + > +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, > +&l2_index, &l2_table); > +if (ret < 0) { > +return ret; > +} > + > +cluster_sector = le32_to_cpu(l2_table[l2_index]); > + > +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); > +/* Calculate the number of clusters to look for. Here we truncate the > last > + * cluster, i.e. 1 less than the actual value calculated as we may need > to > + * perform COW for the last one. */ > +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, > + extent->cluster_sectors << BDRV_SECTOR_BITS) > - 1; > + > +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); > +assert(nb_clusters <= INT_MAX); > + > +/* update bytes according to final nb_clusters value */ > +if (nb_clusters != 0) { > +*bytes = ((nb_clusters * extent->cluster_sectors) << > BDRV_SECTOR_BITS) > + - skip_start_bytes; > +} else { > +nb_clusters = 1; > +} > +*total_alloc_clusters += nb_clusters; I have left it as it is for now as I wasn't sure about what you meant. You can check my query I posted on v4 of this thread for more so that we can solve this soon. :-) Ashijeet > +skip_end_bytes = skip_start_bytes + MIN(*bytes, > + extent->cluster_sectors * BDRV_SECTOR_SIZE > +- skip_start_bytes); > + > +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { > +zeroed = true; > +} > + > +if (!cluster_sector || zeroed) { > +if (!allocate) { > +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; > +} > + > +cluster_sector = extent->next_cluster_sector; > +extent->next_cluster_sector += extent->cluster_sectors > +* nb_clusters; > + > +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, > + offset, skip_start_bytes, > + skip_end_bytes); > +if (ret &l
[Qemu-block] [PATCH v5 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 9fa2414..accf1c3 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1485,25 +1485,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1528,31 +1519,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1595,9 +1564,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1788,13 +1755,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2541,9 +2509,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&cluster_offset, 0, 0);
[Qemu-block] [PATCH v5 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..5647f53 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * 
get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -
[Qemu-block] [PATCH v5 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5647f53..fe2046b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v5 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 
0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v5 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 192 +++ 1 file changed, 182 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index fe2046b..b671dc9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. If *cluster_offset is non_zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *total_alloc_clusters) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*total_alloc_clusters += nb_clusters; +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on the virtual disk, find the cluster offset in vmdk + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster is newly allocated, m_data->nb_clusters is set to the number + * of contiguous clusters that have been allocated. In this case, the other + * fields of m_data are valid and contain inform
[Qemu-block] [PATCH v5 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v5 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index b671dc9..9fa2414 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += 128; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) 
-+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += 128; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1261,9 +1316,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *total_alloc_clusters) { int l1_index, l2_offset, l2_index; @@ -1272,6 +1328,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1323,13 +1380,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_dat
[Qemu-block] [PATCH v5 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
[Qemu-block] [PATCH v5 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html This series helps to optimize the I/O performance of the VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patches 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of at most 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 529 +-- 1 file changed, 407 insertions(+), 122 deletions(-) -- 2.6.2
Re: [Qemu-block] [Qemu-devel] [PATCH v4 6/8] vmdk: New functions to assist allocating multiple clusters
On Thu, Jun 1, 2017 at 7:27 PM, Fam Zheng wrote: > On Sat, 04/22 10:43, Ashijeet Acharya wrote: >> Introduce two new helper functions handle_alloc() and >> vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple >> clusters at once starting from a given offset on disk and performs COW >> if necessary for first and last allocated clusters. >> vmdk_alloc_cluster_offset() helps to return the offset of the first of >> the many newly allocated clusters. Also, provide proper documentation >> for both. >> >> Signed-off-by: Ashijeet Acharya >> --- >> block/vmdk.c | 192 >> +++ >> 1 file changed, 182 insertions(+), 10 deletions(-) >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> index 7862791..8d34cd9 100644 >> --- a/block/vmdk.c >> +++ b/block/vmdk.c >> @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { >> unsigned int l2_offset; >> int valid; >> uint32_t *l2_cache_entry; >> +uint32_t nb_clusters; >> } VmdkMetaData; >> >> typedef struct VmdkGrainMarker { >> @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, >> uint64_t offset, >> return VMDK_OK; >> } >> >> +/* >> + * handle_alloc >> + * >> + * Allocate new clusters for an area that either is yet unallocated or >> needs a >> + * copy on write. If *cluster_offset is non_zero, clusters are only >> allocated if >> + * the new allocation can match the specified host offset. >> + * >> + * Returns: >> + * VMDK_OK: if new clusters were allocated, *bytes may be decreased >> if >> + * the new allocation doesn't cover all of the requested >> area. >> + * *cluster_offset is updated to contain the offset of the >> + * first newly allocated cluster. >> + * >> + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is >> left >> + * unchanged. 
>> + * >> + * VMDK_ERROR:in error cases >> + */ >> +static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, >> +uint64_t offset, uint64_t *cluster_offset, >> +int64_t *bytes, VmdkMetaData *m_data, >> +bool allocate, uint32_t *total_alloc_clusters) > > Not super important but personally I always prefer to stick to a "vmdk_" > prefix > when naming local identifiers, so that ctags and git grep can take it easy. Done. > >> +{ >> +int l1_index, l2_offset, l2_index; >> +uint32_t *l2_table; >> +uint32_t cluster_sector; >> +uint32_t nb_clusters; >> +bool zeroed = false; >> +uint64_t skip_start_bytes, skip_end_bytes; >> +int ret; >> + >> +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, >> +&l2_index, &l2_table); >> +if (ret < 0) { >> +return ret; >> +} >> + >> +cluster_sector = le32_to_cpu(l2_table[l2_index]); >> + >> +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); >> +/* Calculate the number of clusters to look for. Here we truncate the >> last >> + * cluster, i.e. 1 less than the actual value calculated as we may need >> to >> + * perform COW for the last one. */ >> +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, >> +extent->cluster_sectors << BDRV_SECTOR_BITS) - >> 1; > > Alignment could be improved: here ^ Done. > >> + >> +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); >> +assert(nb_clusters <= INT_MAX); >> + >> +/* update bytes according to final nb_clusters value */ >> +if (nb_clusters != 0) { >> +*bytes = ((nb_clusters * extent->cluster_sectors) << 9) > > Better use BDRV_SECTOR_BITS instead of 9. Done. > >> +- skip_start_bytes; >> +} else { >> +nb_clusters = 1; >> +} >> +*total_alloc_clusters += nb_clusters; > > It is weird that you increment *total_alloc_clusters instead of simply > assigning > to it, because it's not clear why before reading the caller code. 
> I am incrementing it because we allocate clusters in every iteration and *total_alloc_clusters contains the number of clusters allocated in total and not just in one iteration. So basically it is a sum of all the m_data->nb_clusters present in every element of the linked list I prepare. > It's better if you just return nb_clusters from this function (either as a > return value, or assign to *total_alloc_clusters), then do the accumulation in > vmdk_pwritev by adding m_data->nb_clusters, which is simpler. If I understood it correctly, you mean to say that I should add all the m_data->nb_clusters present in my linked list inside vmdk_pwritev() using a loop? If yes, I think that's what I have been doing earlier by incrementing *total_alloc_clusters there itself and is better, no? Also, please correct me if I am wrong :-) Ashijeet
Re: [Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
On Thu, Apr 27, 2017 at 1:36 PM, Ashijeet Acharya wrote: > Previously posted series patches: > v1: http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg04641.html > > This series helps to provide chunk size independence for DMG driver to prevent > denial-of-service in cases where untrusted files are being accessed by the > user. > > This task is mentioned on the public block ToDo > Here -> http://wiki.qemu.org/ToDo/Block/DmgChunkSizeIndependence > > Patch 1 introduces a new data structure to aid caching of random access points > within a compressed stream. > > Patch 2 is an extension of patch 1 and introduces a new function to > initialize/update/reset our cached random access point. > > Patch 3 limits the output buffer size to a max of 2MB to avoid QEMU allocate > huge amounts of memory. > > Patch 4 is a simple preparatory patch to aid handling of various types of > chunks. > > Patches 5 & 6 help to handle various types of chunks. > > Patch 7 simply refactors dmg_co_preadv() to read multiple sectors at once. > > Patch 8 finally removes the error messages QEMU used to throw when an image > with > chunk sizes above 64MB were accessed by the user. John, I have squashed patch 3 and 8 (as in v1) actually and that change is represented in patch 7 (as in v2). The cover letter here is quite misleading, as I forgot to update it and simply did a ctrl-c -- ctrl-v carelessly. Ashijeet > Ashijeet Acharya (7): > dmg: Introduce a new struct to cache random access points > dmg: New function to help us cache random access point > dmg: Refactor and prepare dmg_read_chunk() to cache random access > points > dmg: Handle zlib compressed chunks > dmg: Handle bz2 compressed/raw/zeroed chunks > dmg: Refactor dmg_co_preadv() to start reading multiple sectors > dmg: Limit the output buffer size to a max of 2MB > > block/dmg.c | 214 > +++- > block/dmg.h | 10 +++ > 2 files changed, 148 insertions(+), 76 deletions(-) > > -- > 2.6.2 >
[Qemu-block] [PATCH v2 5/7] dmg: Handle bz2 compressed/raw/zeroed chunks
We do not need to cache the access point for these chunks but need to update our various supporting variables like chunk, sectors_read etc. to keep maintaining our code structure. Call cache_access_point() after reading chunks of these types. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 073e864..f9045f9 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -680,20 +680,30 @@ update: (char *)s->uncompressed_chunk, (unsigned int) (512 * s->sectorcounts[chunk])); + if (ret < 0) { return ret; } +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); break; case 1: /* copy */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->uncompressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; +if (drs->sectors_read == -1) { +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->uncompressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { +return -1; +} } +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); break; case 2: /* zero */ /* see dmg_read, it is treated specially. No buffer needs to be * pre-filled, the zeroes can be set directly. */ +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); + break; } s->current_chunk = chunk; -- 2.6.2
[Qemu-block] [PATCH v2 4/7] dmg: Handle zlib compressed chunks
Set the output buffer size to be equal to the size of number of sectors stored in @sectors_read. Start inflating to a max output buffer size of 2MB and cache our access point to aid random access later if required. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 48 ++-- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 32623e2..073e864 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -621,27 +621,47 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num, switch (s->types[chunk]) { /* block entry type */ case 0x8005: { /* zlib compressed */ -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; +/* check for cached random access point */ +if (drs->saved_next_in == NULL) { +/* we need to buffer, because only the chunk as whole can be + * inflated. */ +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { +return -1; +} + +s->zstream.next_in = s->compressed_chunk; +s->zstream.avail_in = s->lengths[chunk]; +} else { +s->zstream.next_in = drs->saved_next_in; +s->zstream.avail_in = drs->saved_avail_in; } -s->zstream.next_in = s->compressed_chunk; -s->zstream.avail_in = s->lengths[chunk]; s->zstream.next_out = s->uncompressed_chunk; -s->zstream.avail_out = 512 * s->sectorcounts[chunk]; -ret = inflateReset(&s->zstream); -if (ret != Z_OK) { -return -1; + +s->zstream.avail_out = sectors_read * BDRV_SECTOR_SIZE; + +if (drs->saved_next_in == NULL) { +ret = inflateReset(&s->zstream); +if (ret != Z_OK) { +return -1; +} +} +/* reset total_out for each successive call */ +s->zstream.total_out = 0; +ret = inflate(&s->zstream, Z_SYNC_FLUSH); +if (ret == Z_OK && +s->zstream.total_out == 512 * sectors_read) { +goto update; } -ret = inflate(&s->zstream, Z_FINISH); if (ret != Z_STREAM_END || 
-s->zstream.total_out != 512 * s->sectorcounts[chunk]) { +s->zstream.total_out != 512 * sectors_read) { return -1; } +update: +cache_access_point(drs, s->zstream.next_in, s->zstream.avail_in, + chunk, sectors_read, sector_offset); break; } case 0x8006: /* bzip2 compressed */ if (!dmg_uncompress_bz2) { -- 2.6.2
[Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
Previously posted series patches: v1: http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg04641.html This series helps to provide chunk size independence for DMG driver to prevent denial-of-service in cases where untrusted files are being accessed by the user. This task is mentioned on the public block ToDo Here -> http://wiki.qemu.org/ToDo/Block/DmgChunkSizeIndependence Patch 1 introduces a new data structure to aid caching of random access points within a compressed stream. Patch 2 is an extension of patch 1 and introduces a new function to initialize/update/reset our cached random access point. Patch 3 limits the output buffer size to a max of 2MB to avoid QEMU allocate huge amounts of memory. Patch 4 is a simple preparatory patch to aid handling of various types of chunks. Patches 5 & 6 help to handle various types of chunks. Patch 7 simply refactors dmg_co_preadv() to read multiple sectors at once. Patch 8 finally removes the error messages QEMU used to throw when an image with chunk sizes above 64MB were accessed by the user. ->Testing procedure: Convert a DMG file to raw format using the "qemu-img convert" tool present in v2.9.0 Next convert the same image again after applying these patches. Compare the two images using "qemu-img compare" tool to check if they are identical. You can pickup any DMG image from the collection present Here -> https://lists.gnu.org/archive/html/qemu-devel/2014-12/msg03606.html ->Important note: These patches assume that the terms "chunk" and "block" are synonyms of each other when we talk about bz2 compressed streams. Thus according to the bz2 docs[1], the max uncompressed size of a chunk/block can reach to 46MB which is less than the previously allowed size of 64MB, so we can continue decompressing the whole chunk/block at once instead of partial decompression just like we do now. 
This limitation was forced by the fact that bz2 compressed streams do not allow random access midway through a chunk/block as the BZ2_bzDecompress() API in bzlib seeks for the magic key "BZh" before starting decompression.[2] This magic key is present at the start of every chunk/block only and since our cached random access points need not necessarily point to the start of a chunk/block, BZ2_bzDecompress() fails with an error value BZ_DATA_ERROR_MAGIC[3] [1] https://en.wikipedia.org/wiki/Bzip2#File_format [2] https://blastedbio.blogspot.in/2011/11/random-access-to-bzip2.html [3] http://linux.math.tifr.res.in/manuals/html/manual_3.html#SEC17 Special thanks to Peter Wu for helping me understand and tackle the bz2 compressed chunks. Changes in v2: - limit the buffer size to 2MB after fixing the buffering problems (john/fam) Ashijeet Acharya (7): dmg: Introduce a new struct to cache random access points dmg: New function to help us cache random access point dmg: Refactor and prepare dmg_read_chunk() to cache random access points dmg: Handle zlib compressed chunks dmg: Handle bz2 compressed/raw/zeroed chunks dmg: Refactor dmg_co_preadv() to start reading multiple sectors dmg: Limit the output buffer size to a max of 2MB block/dmg.c | 214 +++- block/dmg.h | 10 +++ 2 files changed, 148 insertions(+), 76 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v2 1/7] dmg: Introduce a new struct to cache random access points
We need to cache the random access point while performing partial decompression so that we can resume decompression from that point onwards in our next sequential read request. Introduce a new struct DMGReadState which will help us do this. Signed-off-by: Ashijeet Acharya --- block/dmg.h | 10 ++ 1 file changed, 10 insertions(+) diff --git a/block/dmg.h b/block/dmg.h index b592d6f..ee67ae1 100644 --- a/block/dmg.h +++ b/block/dmg.h @@ -31,6 +31,15 @@ #include "block/block_int.h" #include +/* used to cache current position in compressed input stream */ +typedef struct DMGReadState { +uint8_t *saved_next_in; +int64_t saved_avail_in; +int32_t saved_chunk_type; +int64_t sectors_read; /* possible sectors read in each cycle */ +int32_t sector_offset_in_chunk; +} DMGReadState; + typedef struct BDRVDMGState { CoMutex lock; /* each chunk contains a certain number of sectors, @@ -51,6 +60,7 @@ typedef struct BDRVDMGState { uint8_t *compressed_chunk; uint8_t *uncompressed_chunk; z_stream zstream; +DMGReadState *drs; } BDRVDMGState; extern int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in, -- 2.6.2
[Qemu-block] [PATCH v2 6/7] dmg: Refactor dmg_co_preadv() to start reading multiple sectors
At the moment, dmg_co_preadv() reads one sector at a time. Make it read multiple sectors at a time depending on the number of sectors stored in "drs->sectors_read". This does not provide any significant optimization in the I/O process of DMG but is still a nicer way. Adjust the 'data' variable depending on our cached access point situation to align our read requests appropriately. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 23 +++ 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index f9045f9..8b7460c 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -718,7 +718,7 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, BDRVDMGState *s = bs->opaque; uint64_t sector_num = offset >> BDRV_SECTOR_BITS; int nb_sectors = bytes >> BDRV_SECTOR_BITS; -int ret, i; +int ret, i = 0; DMGReadState *drs = s->drs; assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); @@ -726,8 +726,7 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, qemu_co_mutex_lock(&s->lock); -for (i = 0; i < nb_sectors; i++) { -uint32_t sector_offset_in_chunk; +while (i < nb_sectors) { void *data; if (dmg_read_chunk(bs, sector_num + i, drs) != 0) { @@ -738,12 +737,20 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, * s->uncompressed_chunk may be too small to cover the large all-zeroes * section. 
dmg_read_chunk is called to find s->current_chunk */ if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ -qemu_iovec_memset(qiov, i * 512, 0, 512); -continue; +qemu_iovec_memset(qiov, i * 512, 0, +512 * drs->sectors_read); +goto increment; +} + +if (drs->saved_next_in == NULL) { +data = s->uncompressed_chunk + drs->sector_offset_in_chunk * 512; +} else { +data = s->uncompressed_chunk; } -sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; -data = s->uncompressed_chunk + sector_offset_in_chunk * 512; -qemu_iovec_from_buf(qiov, i * 512, data, 512); +qemu_iovec_from_buf(qiov, i * 512, data, drs->sectors_read * 512); + +increment: +i += drs->sectors_read; } ret = 0; -- 2.6.2
[Qemu-block] [PATCH v2 7/7] dmg: Limit the output buffer size to a max of 2MB
The size of the output buffer is limited to a maximum of 2MB so that QEMU doesn't end up allocating huge amounts of memory while decompressing compressed input streams. 2MB is an appropriate size because "qemu-img convert" has the same I/O buffer size and the most important use case for DMG files is to be compatible with qemu-img convert. We have refactored the DMG driver to accept and process images irrespective of their chunk sizes since we now have a limit of 2MB on our output buffer size. Thus QEMU will not allocate huge amounts of memory no matter what the chunk size is. Remove the error messages to prevent denial-of-service in cases where untrusted files are being accessed by the user. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 24 +--- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 8b7460c..ade2578 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -37,7 +37,7 @@ enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used * or truncating when converting to 32-bit types */ -DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ +DMG_LENGTHS_MAX = 2 * 1024 * 1024, /* 2 MB */ DMG_SECTOR_MAX = DMG_LENGTHS_MAX / 512, }; @@ -209,7 +209,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, uint8_t *buffer, uint32_t count) { uint32_t type, i; -int ret; size_t new_size; uint32_t chunk_count; int64_t offset = 0; @@ -258,16 +257,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* sector count */ s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); -/* all-zeroes sector (type 2) does not need to be "uncompressed" and can - * therefore be unbounded. 
*/ -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { -error_report("sector count %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTOR_MAX); -ret = -EINVAL; -goto fail; -} - /* offset in (compressed) data fork */ s->offsets[i] = buff_read_uint64(buffer, offset + 0x18); s->offsets[i] += in_offset; @@ -275,23 +264,12 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* length in (compressed) data fork */ s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); -if (s->lengths[i] > DMG_LENGTHS_MAX) { -error_report("length %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->lengths[i], i, DMG_LENGTHS_MAX); -ret = -EINVAL; -goto fail; -} - update_max_chunk_size(s, i, &ds->max_compressed_size, &ds->max_sectors_per_chunk); offset += 40; } s->n_chunks += chunk_count; return 0; - -fail: -return ret; } static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, -- 2.6.2
[Qemu-block] [PATCH v2 3/7] dmg: Refactor and prepare dmg_read_chunk() to cache random access points
Refactor dmg_read_chunk() to start making use of the new DMGReadState structure and do chunk and sector related calculations based on it. Add a new argument "DMGReadState *drs" to it. Also, rename DMG_SECTORCOUNTS_MAX to DMG_SECTOR_MAX to avoid indentaion problems at some places inside dmg_read_chunk() Signed-off-by: Ashijeet Acharya --- block/dmg.c | 159 +++- 1 file changed, 94 insertions(+), 65 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index c6fe8b0..32623e2 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -38,7 +38,7 @@ enum { * or truncating when converting to 32-bit types */ DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ -DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, +DMG_SECTOR_MAX = DMG_LENGTHS_MAX / 512, }; static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -260,10 +260,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* all-zeroes sector (type 2) does not need to be "uncompressed" and can * therefore be unbounded. */ -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { +if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { error_report("sector count %" PRIu64 " for chunk %" PRIu32 " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + s->sectorcounts[i], i, DMG_SECTOR_MAX); ret = -EINVAL; goto fail; } @@ -578,78 +578,106 @@ static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) return s->n_chunks; /* error */ } -static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) +static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num, + DMGReadState *drs) { BDRVDMGState *s = bs->opaque; +int ret; +uint32_t sector_offset; +uint64_t sectors_read; +uint32_t chunk; + if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { -int ret; -uint32_t chunk = search_chunk(s, sector_num); +chunk = search_chunk(s, sector_num); +} else { +chunk = drs->saved_chunk_type; +} -if (chunk >= s->n_chunks) { +if (chunk >= 
s->n_chunks) { +return -1; +} + +/* reset our access point cache if we had a change in current chunk */ +if (chunk != drs->saved_chunk_type) { +cache_access_point(drs, NULL, -1, -1, -1, -1); +} + +sector_offset = sector_num - s->sectors[chunk]; + +if ((s->sectorcounts[chunk] - sector_offset) > DMG_SECTOR_MAX) { +sectors_read = DMG_SECTOR_MAX; +} else { +sectors_read = s->sectorcounts[chunk] - sector_offset; +} + +/* truncate sectors read if it exceeds the 2MB buffer of qemu-img + * convert */ +if ((sector_num % DMG_SECTOR_MAX) + sectors_read > DMG_SECTOR_MAX) { +sectors_read = DMG_SECTOR_MAX - (sector_num % DMG_SECTOR_MAX); +} + +s->current_chunk = s->n_chunks; + +switch (s->types[chunk]) { /* block entry type */ +case 0x8005: { /* zlib compressed */ +/* we need to buffer, because only the chunk as whole can be + * inflated. */ +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { return -1; } -s->current_chunk = s->n_chunks; -switch (s->types[chunk]) { /* block entry type */ -case 0x8005: { /* zlib compressed */ -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; -} - -s->zstream.next_in = s->compressed_chunk; -s->zstream.avail_in = s->lengths[chunk]; -s->zstream.next_out = s->uncompressed_chunk; -s->zstream.avail_out = 512 * s->sectorcounts[chunk]; -ret = inflateReset(&s->zstream); -if (ret != Z_OK) { -return -1; -} -ret = inflate(&s->zstream, Z_FINISH); -if (ret != Z_STREAM_END || -s->zstream.total_out != 512 * s->sectorcounts[chunk]) { -return -1; -} -break; } -case 0x8006: /* bzip2 compressed */ -if (!dmg_uncompress_bz2) { -break; -} -/* we need to buffer, because only t
[Qemu-block] [PATCH v2 2/7] dmg: New function to help us cache random access point
Introduce a new cache_access_point() function which will help us first cache a random access point inside a compressed stream and then keep updating it according to our requirement at appropriate times. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/block/dmg.c b/block/dmg.c index a7d25fc..c6fe8b0 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -128,6 +128,18 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, } } +static void cache_access_point(DMGReadState *drs, uint8_t *next_in, +int64_t avail_in, int32_t chunk, +int64_t sectors_read, int32_t sector_offset) +{ +drs->saved_next_in = next_in; +drs->saved_avail_in = avail_in; +drs->saved_chunk_type = chunk; +drs->sectors_read = sectors_read; +drs->sector_offset_in_chunk = sector_offset; +return; +} + static int64_t dmg_find_koly_offset(BdrvChild *file, Error **errp) { BlockDriverState *file_bs = file->bs; @@ -507,6 +519,10 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } +s->drs = g_malloc(sizeof(DMGReadState)); +/* initialise our access point cache */ +cache_access_point(s->drs, NULL, -1, -1, -1, -1); + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; @@ -523,6 +539,7 @@ fail: g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); +g_free(s->drs); qemu_vfree(s->compressed_chunk); qemu_vfree(s->uncompressed_chunk); return ret; @@ -685,6 +702,7 @@ static void dmg_close(BlockDriverState *bs) g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); +g_free(s->drs); qemu_vfree(s->compressed_chunk); qemu_vfree(s->uncompressed_chunk); -- 2.6.2
Re: [Qemu-block] [Qemu-devel] [PATCH v1 3/8] dmg: Limit the output buffer size to a max of 2MB
On Thu, Apr 27, 2017 at 12:56 PM, Fam Zheng wrote: > On Wed, 04/26 17:30, John Snow wrote: >> Seems OK otherwise, but I would normally expect you to fix the buffering >> problems first, and then reduce the size of the buffer -- not the other >> way around. This version introduces new limitations that didn't exist >> previously (As of this commit, QEMU can't open DMG files with chunks >> larger than 2MB now, right?) > > Yes, each commit should _not_ introduce issues (compiling failures, functional > degeneration, etc.), and cannot rely on following commits to fix things > screwed > up in this one. > > This is important for bisectability - each commit can be built and tested in > the > whole git history. Yes, understood. That's why I am gonna squash it with the last patch (patch 8) which removes this limitation completely. Ashijeet
Re: [Qemu-block] [Qemu-devel] [PATCH v1 3/8] dmg: Limit the output buffer size to a max of 2MB
On Thu, Apr 27, 2017 at 3:00 AM, John Snow wrote: > > > On 04/25/2017 03:59 PM, Ashijeet Acharya wrote: >> The size of the output buffer is limited to a maximum of 2MB so that >> QEMU doesn't end up allocating huge amounts of memory while >> decompressing compressed input streams. >> >> 2MB is an appropriate size because "qemu-img convert" has the same I/O >> buffer size and the most important use case for DMG files is to be >> compatible with qemu-img convert. >> >> Signed-off-by: Ashijeet Acharya >> --- > > Patch 1 adds a new structure and patch 2 starts using it, but in a > store-only manner and only with placeholder variables that are difficult > to authenticate, so there's still "insufficient data" to review either > patch meaningfully. > > This patch seems unrelated to either of those, so the ordering is strange. Actually, I have tried to keep these patches very short so that it is easier to review them (mainly because of the time limitation I have). But seems like I over tried. If you have any suggestions for the first 2 patches, I am happy to change them in you preferred way. > >> block/dmg.c | 12 ++-- >> 1 file changed, 6 insertions(+), 6 deletions(-) >> >> diff --git a/block/dmg.c b/block/dmg.c >> index c6fe8b0..7ae30e3 100644 >> --- a/block/dmg.c >> +++ b/block/dmg.c >> @@ -37,8 +37,8 @@ enum { >> /* Limit chunk sizes to prevent unreasonable amounts of memory being >> used >> * or truncating when converting to 32-bit types >> */ >> -DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ >> -DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, >> +DMG_MAX_OUTPUT = 2 * 1024 * 1024, /* 2 MB */ > > why "MAX OUTPUT" ? Aren't we using this for buffering on reads? I just thought that this looked better, but I will revert back to the original one. 
> >> +DMG_SECTOR_MAX = DMG_MAX_OUTPUT / 512, >> }; >> >> static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) >> @@ -260,10 +260,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, >> DmgHeaderState *ds, >> >> /* all-zeroes sector (type 2) does not need to be "uncompressed" >> and can >> * therefore be unbounded. */ >> -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { >> +if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { >> error_report("sector count %" PRIu64 " for chunk %" PRIu32 >> " is larger than max (%u)", >> - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); >> + s->sectorcounts[i], i, DMG_SECTOR_MAX); >> ret = -EINVAL; >> goto fail; >> } >> @@ -275,10 +275,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, >> DmgHeaderState *ds, >> /* length in (compressed) data fork */ >> s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); >> >> -if (s->lengths[i] > DMG_LENGTHS_MAX) { >> +if (s->lengths[i] > DMG_MAX_OUTPUT) { >> error_report("length %" PRIu64 " for chunk %" PRIu32 >> " is larger than max (%u)", >> - s->lengths[i], i, DMG_LENGTHS_MAX); >> + s->lengths[i], i, DMG_MAX_OUTPUT); >> ret = -EINVAL; >> goto fail; >> } >> > > Seems OK otherwise, but I would normally expect you to fix the buffering > problems first, and then reduce the size of the buffer -- not the other > way around. This version introduces new limitations that didn't exist > previously (As of this commit, QEMU can't open DMG files with chunks > larger than 2MB now, right?) I think I will squash it with the last one (patch 8) which removes this limitation completely and will also fix the problem of handling the buffering problems first and then reducing the buffer size? Ashijeet
[Qemu-block] [PATCH v1 7/8] dmg: Refactor dmg_co_preadv() to start reading multiple sectors
At the moment, dmg_co_preadv() reads one sector at a time. Make it read multiple sectors at a time depending on the number of sectors stored in "drs->sectors_read". This does not provide any significant optimization in the I/O process of DMG but is still a nicer way. Adjust the 'data' variable depending on our cached access point situation to align our read requests appropriately. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 23 +++ 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index f643e41..b0f3c84 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -718,7 +718,7 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, BDRVDMGState *s = bs->opaque; uint64_t sector_num = offset >> BDRV_SECTOR_BITS; int nb_sectors = bytes >> BDRV_SECTOR_BITS; -int ret, i; +int ret, i = 0; DMGReadState *drs = s->drs; assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); @@ -726,8 +726,7 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, qemu_co_mutex_lock(&s->lock); -for (i = 0; i < nb_sectors; i++) { -uint32_t sector_offset_in_chunk; +while (i < nb_sectors) { void *data; if (dmg_read_chunk(bs, sector_num + i, drs) != 0) { @@ -738,12 +737,20 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, * s->uncompressed_chunk may be too small to cover the large all-zeroes * section. 
dmg_read_chunk is called to find s->current_chunk */ if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ -qemu_iovec_memset(qiov, i * 512, 0, 512); -continue; +qemu_iovec_memset(qiov, i * 512, 0, +512 * drs->sectors_read); +goto increment; +} + +if (drs->saved_next_in == NULL) { +data = s->uncompressed_chunk + drs->sector_offset_in_chunk * 512; +} else { +data = s->uncompressed_chunk; } -sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; -data = s->uncompressed_chunk + sector_offset_in_chunk * 512; -qemu_iovec_from_buf(qiov, i * 512, data, 512); +qemu_iovec_from_buf(qiov, i * 512, data, drs->sectors_read * 512); + +increment: +i += drs->sectors_read; } ret = 0; -- 2.6.2
[Qemu-block] [PATCH v1 4/8] dmg: Refactor and prepare dmg_read_chunk() to cache random access points
Refactor dmg_read_chunk() to start making use of the new DMGReadState structure and do chunk and sector related calculations based on it. Add a new argument "DMGReadState *drs" to it. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 153 1 file changed, 91 insertions(+), 62 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 7ae30e3..dc356b0 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -578,78 +578,106 @@ static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) return s->n_chunks; /* error */ } -static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) +static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num, + DMGReadState *drs) { BDRVDMGState *s = bs->opaque; +int ret; +uint32_t sector_offset; +uint64_t sectors_read; +uint32_t chunk; + if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { -int ret; -uint32_t chunk = search_chunk(s, sector_num); +chunk = search_chunk(s, sector_num); +} else { +chunk = drs->saved_chunk_type; +} -if (chunk >= s->n_chunks) { +if (chunk >= s->n_chunks) { +return -1; +} + +/* reset our access point cache if we had a change in current chunk */ +if (chunk != drs->saved_chunk_type) { +cache_access_point(drs, NULL, -1, -1, -1, -1); +} + +sector_offset = sector_num - s->sectors[chunk]; + +if ((s->sectorcounts[chunk] - sector_offset) > DMG_SECTOR_MAX) { +sectors_read = DMG_SECTOR_MAX; +} else { +sectors_read = s->sectorcounts[chunk] - sector_offset; +} + +/* truncate sectors read if it exceeds the 2MB buffer of qemu-img + * convert */ +if ((sector_num % DMG_SECTOR_MAX) + sectors_read > DMG_SECTOR_MAX) { +sectors_read = DMG_SECTOR_MAX - (sector_num % DMG_SECTOR_MAX); +} + +s->current_chunk = s->n_chunks; + +switch (s->types[chunk]) { /* block entry type */ +case 0x8005: { /* zlib compressed */ +/* we need to buffer, because only the chunk as whole can be + * inflated. 
*/ +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { return -1; } -s->current_chunk = s->n_chunks; -switch (s->types[chunk]) { /* block entry type */ -case 0x8005: { /* zlib compressed */ -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; -} - -s->zstream.next_in = s->compressed_chunk; -s->zstream.avail_in = s->lengths[chunk]; -s->zstream.next_out = s->uncompressed_chunk; -s->zstream.avail_out = 512 * s->sectorcounts[chunk]; -ret = inflateReset(&s->zstream); -if (ret != Z_OK) { -return -1; -} -ret = inflate(&s->zstream, Z_FINISH); -if (ret != Z_STREAM_END || -s->zstream.total_out != 512 * s->sectorcounts[chunk]) { -return -1; -} -break; } -case 0x8006: /* bzip2 compressed */ -if (!dmg_uncompress_bz2) { -break; -} -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; -} - -ret = dmg_uncompress_bz2((char *)s->compressed_chunk, - (unsigned int) s->lengths[chunk], - (char *)s->uncompressed_chunk, - (unsigned int) - (512 * s->sectorcounts[chunk])); -if (ret < 0) { -return ret; -} -break; -case 1: /* copy */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->uncompressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; -} -break; -case 2: /* zero */ -/* see dmg_read, it is treated specially. No buffer needs to be - * pre-filled, the zeroes can be set directly. */ +s->zstream.ne
[Qemu-block] [PATCH v1 8/8] dmg: Remove the error messages to allow wild images
We have refactored the DMG driver to accept and process images irrespective of their chunk sizes since we now have limit of 2MB on our output buffer size. Thus QEMU will not allocate huge amounts of memory no matter what the chunk size is. Remove the error messages to prevent denial-of-service in cases where untrusted files are being accessed by the user. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 22 -- 1 file changed, 22 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index b0f3c84..01ec40e 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -209,7 +209,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, uint8_t *buffer, uint32_t count) { uint32_t type, i; -int ret; size_t new_size; uint32_t chunk_count; int64_t offset = 0; @@ -258,16 +257,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* sector count */ s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); -/* all-zeroes sector (type 2) does not need to be "uncompressed" and can - * therefore be unbounded. 
*/ -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { -error_report("sector count %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTOR_MAX); -ret = -EINVAL; -goto fail; -} - /* offset in (compressed) data fork */ s->offsets[i] = buff_read_uint64(buffer, offset + 0x18); s->offsets[i] += in_offset; @@ -275,23 +264,12 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* length in (compressed) data fork */ s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); -if (s->lengths[i] > DMG_MAX_OUTPUT) { -error_report("length %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->lengths[i], i, DMG_MAX_OUTPUT); -ret = -EINVAL; -goto fail; -} - update_max_chunk_size(s, i, &ds->max_compressed_size, &ds->max_sectors_per_chunk); offset += 40; } s->n_chunks += chunk_count; return 0; - -fail: -return ret; } static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, -- 2.6.2
[Qemu-block] [PATCH v1 5/8] dmg: Handle zlib compressed chunks
Set the output buffer size to be equal to the size of number of sectors stored in @sectors_read. Start inflating to a max output buffer size of 2MB and cache our access point to aid random access later if required. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 48 ++-- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index dc356b0..749c151 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -621,27 +621,47 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num, switch (s->types[chunk]) { /* block entry type */ case 0x8005: { /* zlib compressed */ -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; +/* check for cached random access point */ +if (drs->saved_next_in == NULL) { +/* we need to buffer, because only the chunk as whole can be + * inflated. */ +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { +return -1; +} + +s->zstream.next_in = s->compressed_chunk; +s->zstream.avail_in = s->lengths[chunk]; +} else { +s->zstream.next_in = drs->saved_next_in; +s->zstream.avail_in = drs->saved_avail_in; } -s->zstream.next_in = s->compressed_chunk; -s->zstream.avail_in = s->lengths[chunk]; s->zstream.next_out = s->uncompressed_chunk; -s->zstream.avail_out = 512 * s->sectorcounts[chunk]; -ret = inflateReset(&s->zstream); -if (ret != Z_OK) { -return -1; + +s->zstream.avail_out = sectors_read * BDRV_SECTOR_SIZE; + +if (drs->saved_next_in == NULL) { +ret = inflateReset(&s->zstream); +if (ret != Z_OK) { +return -1; +} +} +/* reset total_out for each successive call */ +s->zstream.total_out = 0; +ret = inflate(&s->zstream, Z_SYNC_FLUSH); +if (ret == Z_OK && +s->zstream.total_out == 512 * sectors_read) { +goto update; } -ret = inflate(&s->zstream, Z_FINISH); if (ret != Z_STREAM_END || 
-s->zstream.total_out != 512 * s->sectorcounts[chunk]) { +s->zstream.total_out != 512 * sectors_read) { return -1; } +update: +cache_access_point(drs, s->zstream.next_in, s->zstream.avail_in, + chunk, sectors_read, sector_offset); break; } case 0x8006: /* bzip2 compressed */ if (!dmg_uncompress_bz2) { -- 2.6.2
[Qemu-block] [PATCH v1 3/8] dmg: Limit the output buffer size to a max of 2MB
The size of the output buffer is limited to a maximum of 2MB so that QEMU doesn't end up allocating huge amounts of memory while decompressing compressed input streams. 2MB is an appropriate size because "qemu-img convert" has the same I/O buffer size and the most important use case for DMG files is to be compatible with qemu-img convert. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index c6fe8b0..7ae30e3 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -37,8 +37,8 @@ enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used * or truncating when converting to 32-bit types */ -DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ -DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, +DMG_MAX_OUTPUT = 2 * 1024 * 1024, /* 2 MB */ +DMG_SECTOR_MAX = DMG_MAX_OUTPUT / 512, }; static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -260,10 +260,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* all-zeroes sector (type 2) does not need to be "uncompressed" and can * therefore be unbounded. */ -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { +if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { error_report("sector count %" PRIu64 " for chunk %" PRIu32 " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + s->sectorcounts[i], i, DMG_SECTOR_MAX); ret = -EINVAL; goto fail; } @@ -275,10 +275,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* length in (compressed) data fork */ s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); -if (s->lengths[i] > DMG_LENGTHS_MAX) { +if (s->lengths[i] > DMG_MAX_OUTPUT) { error_report("length %" PRIu64 " for chunk %" PRIu32 " is larger than max (%u)", - s->lengths[i], i, DMG_LENGTHS_MAX); + s->lengths[i], i, DMG_MAX_OUTPUT); ret = -EINVAL; goto fail; } -- 2.6.2
[Qemu-block] [PATCH v1 6/8] dmg: Handle bz2 compressed/raw/zeroed chunks
We do not need to cache the access point for these chunks but need to update our various supporting variables like chunk, sectors_read etc. to keep maintaining our code structure. Call cache_access_point() after reading chunks of these types. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 749c151..f643e41 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -680,20 +680,30 @@ update: (char *)s->uncompressed_chunk, (unsigned int) (512 * s->sectorcounts[chunk])); + if (ret < 0) { return ret; } +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); break; case 1: /* copy */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->uncompressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; +if (drs->sectors_read == -1) { +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->uncompressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { +return -1; +} } +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); break; case 2: /* zero */ /* see dmg_read, it is treated specially. No buffer needs to be * pre-filled, the zeroes can be set directly. */ +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); + break; } s->current_chunk = chunk; -- 2.6.2
[Qemu-block] [PATCH v1 0/8] Refactor DMG driver to have chunk size independence
This series helps to provide chunk size independence for DMG driver to prevent denial-of-service in cases where untrusted files are being accessed by the user. This task is mentioned on the public block ToDo Here -> http://wiki.qemu.org/ToDo/Block/DmgChunkSizeIndependence Patch 1 introduces a new data structure to aid caching of random access points within a compressed stream. Patch 2 is an extension of patch 1 and introduces a new function to initialize/update/reset our cached random access point. Patch 3 limits the output buffer size to a max of 2MB to avoid QEMU allocating huge amounts of memory. Patch 4 is a simple preparatory patch to aid handling of various types of chunks. Patches 5 & 6 help to handle various types of chunks. Patch 7 simply refactors dmg_co_preadv() to read multiple sectors at once. Patch 8 finally removes the error messages QEMU used to throw when an image with chunk sizes above 64MB was accessed by the user. ->Testing procedure: Convert a DMG file to raw format using the "qemu-img convert" tool present in v2.9.0 Next convert the same image again after applying these patches. Compare the two images using "qemu-img compare" tool to check if they are identical. You can pick up any DMG image from the collection present Here -> https://lists.gnu.org/archive/html/qemu-devel/2014-12/msg03606.html ->Important note: These patches assume that the terms "chunk" and "block" are synonyms of each other when we talk about bz2 compressed streams. Thus according to the bz2 docs[1], the max uncompressed size of a chunk/block can reach 46MB which is less than the previously allowed size of 64MB, so we can continue decompressing the whole chunk/block at once instead of partial decompression just like we do now. 
This limitation was forced by the fact that bz2 compressed streams do not allow random access midway through a chunk/block as the BZ2_bzDecompress() API in bzlib seeks for the magic key "BZh" before starting decompression.[2] This magic key is present at the start of every chunk/block only and since our cached random access points need not necessarily point to the start of a chunk/block, BZ2_bzDecompress() fails with an error value BZ_DATA_ERROR_MAGIC[3] [1] https://en.wikipedia.org/wiki/Bzip2#File_format [2] https://blastedbio.blogspot.in/2011/11/random-access-to-bzip2.html [3] http://linux.math.tifr.res.in/manuals/html/manual_3.html#SEC17 Special thanks to Peter Wu for helping me understand and tackle the bz2 compressed chunks. Ashijeet Acharya (8): dmg: Introduce a new struct to cache random access points dmg: New function to help us cache random access point dmg: Limit the output buffer size to a max of 2MB dmg: Refactor and prepare dmg_read_chunk() to cache random access points dmg: Handle zlib compressed chunks dmg: Handle bz2 compressed/raw/zeroed chunks dmg: Refactor dmg_co_preadv() to start reading multiple sectors dmg: Remove the error messages to allow wild images block/dmg.c | 214 +++- block/dmg.h | 10 +++ 2 files changed, 148 insertions(+), 76 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v1 1/8] dmg: Introduce a new struct to cache random access points
We need to cache the random access point while performing partial decompression so that we can resume decompression from that point onwards in our next sequential read request. Introduce a new struct DMGReadState which will help us do this. Signed-off-by: Ashijeet Acharya --- block/dmg.h | 10 ++ 1 file changed, 10 insertions(+) diff --git a/block/dmg.h b/block/dmg.h index b592d6f..ee67ae1 100644 --- a/block/dmg.h +++ b/block/dmg.h @@ -31,6 +31,15 @@ #include "block/block_int.h" #include +/* used to cache current position in compressed input stream */ +typedef struct DMGReadState { +uint8_t *saved_next_in; +int64_t saved_avail_in; +int32_t saved_chunk_type; +int64_t sectors_read; /* possible sectors read in each cycle */ +int32_t sector_offset_in_chunk; +} DMGReadState; + typedef struct BDRVDMGState { CoMutex lock; /* each chunk contains a certain number of sectors, @@ -51,6 +60,7 @@ typedef struct BDRVDMGState { uint8_t *compressed_chunk; uint8_t *uncompressed_chunk; z_stream zstream; +DMGReadState *drs; } BDRVDMGState; extern int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in, -- 2.6.2
[Qemu-block] [PATCH v1 2/8] dmg: New function to help us cache random access point
Introduce a new cache_access_point() function which will help us first cache a random access point inside a compressed stream and then keep updating it according to our requirement at appropriate times. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/block/dmg.c b/block/dmg.c index a7d25fc..c6fe8b0 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -128,6 +128,18 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, } } +static void cache_access_point(DMGReadState *drs, uint8_t *next_in, +int64_t avail_in, int32_t chunk, +int64_t sectors_read, int32_t sector_offset) +{ +drs->saved_next_in = next_in; +drs->saved_avail_in = avail_in; +drs->saved_chunk_type = chunk; +drs->sectors_read = sectors_read; +drs->sector_offset_in_chunk = sector_offset; +return; +} + static int64_t dmg_find_koly_offset(BdrvChild *file, Error **errp) { BlockDriverState *file_bs = file->bs; @@ -507,6 +519,10 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } +s->drs = g_malloc(sizeof(DMGReadState)); +/* initialise our access point cache */ +cache_access_point(s->drs, NULL, -1, -1, -1, -1); + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; @@ -523,6 +539,7 @@ fail: g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); +g_free(s->drs); qemu_vfree(s->compressed_chunk); qemu_vfree(s->uncompressed_chunk); return ret; @@ -685,6 +702,7 @@ static void dmg_close(BlockDriverState *bs) g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); +g_free(s->drs); qemu_vfree(s->compressed_chunk); qemu_vfree(s->uncompressed_chunk); -- 2.6.2
[Qemu-block] [PATCH v4 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index e52c373..be08bde 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1486,25 +1486,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1529,31 +1520,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1596,9 +1565,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1789,13 +1756,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2542,9 +2510,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&cluster_offset, 0, 0);
[Qemu-block] [PATCH v4 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 0); offset_in_cluster 
= vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v4 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 4cee868..7862791 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v4 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_L2load() functions. This patch will help us avoid code duplication in future patches of this series. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..4cee868 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; 
+*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * get_cluster_table + * + * for a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -exten
[Qemu-block] [PATCH v4 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v4 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 192 +++ 1 file changed, 182 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 7862791..8d34cd9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. If *cluster_offset is non_zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, +uint64_t offset, uint64_t *cluster_offset, +int64_t *bytes, VmdkMetaData *m_data, +bool allocate, uint32_t *total_alloc_clusters) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, +extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << 9) +- skip_start_bytes; +} else { +nb_clusters = 1; +} +*total_alloc_clusters += nb_clusters; +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on the virtual disk, find the cluster offset in vmdk + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster is newly allocated, m_data->nb_clusters is set to the number + * of contiguous clusters that have been allocated. In this case, the other + * fields of m_data are valid and contain information about the
[Qemu-block] [PATCH v4 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
[Qemu-block] [PATCH v4 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 129 ++- 1 file changed, 102 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 8d34cd9..e52c373 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,89 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +temp_offset = cpu_to_le32(temp_offset); + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = offset; +if (!zeroed) { +offset += 128; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, 
-((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = offset; +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += 128; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1263,7 +1320,7 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, */ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, -int64_t *bytes, VmdkMetaData *m_data, +int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *total_alloc_clusters) { int l1_index, l2_offset, l2_index; @@ -1272,6 +1329,7 @@ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1323,13 +1381,21 @@ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l
[Qemu-block] [PATCH v4 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html This series helps to optimize the I/O performance of the VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patches 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. Patch 8 helps us to finally change vmdk_get_cluster_offset() to find the cluster offset only, as the cluster allocation task is now handled by vmdk_alloc_clusters(). Note: v4 has the addition of a new optimization of calling bdrv_pwrite_sync() only once for at most 512 clusters; as a result, performance has increased to a great extent (earlier, till v2, it was 29%). Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54%. Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk To show off, these patches now complete a 128M write request on an empty VMDK file on my slow laptop in just 2.7 secs compared to 3.5 mins as in v2.9.0. This is obviously using qemu-io with "--cache writeback" and not an official benchmark, but worth mentioning from a newbie's perspective. Note: These patches pass all 41/41 tests suitable for the VMDK driver. 
Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentation (fam) - factor out metadata loading code in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4 (in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset (fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of at most 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 530 +-- 1 file changed, 408 insertions(+), 122 deletions(-) -- 2.6.2
Re: [Qemu-block] [PATCH v3 5/6] vmdk: Set maximum bytes allocated in one cycle
On Fri, Apr 21, 2017 at 8:23 PM, Ashijeet Acharya wrote: > On Wed, Apr 19, 2017 at 6:30 PM, Fam Zheng wrote: >> On Sat, 04/01 20:14, Ashijeet Acharya wrote: >>> Set the maximum bytes allowed to get allocated at once to be not more >>> than the extent size boundary to handle writes at two separate extents >>> appropriately. >>> >>> Signed-off-by: Ashijeet Acharya >>> --- >>> block/vmdk.c | 13 +++-- >>> 1 file changed, 11 insertions(+), 2 deletions(-) >>> >>> diff --git a/block/vmdk.c b/block/vmdk.c >>> index a8babd7..9456ddd 100644 >>> --- a/block/vmdk.c >>> +++ b/block/vmdk.c >>> @@ -1767,6 +1767,7 @@ static int vmdk_pwritev(BlockDriverState *bs, >>> uint64_t offset, >>> int64_t offset_in_cluster, n_bytes; >>> uint64_t cluster_offset; >>> uint64_t bytes_done = 0; >>> +uint64_t extent_size; >>> VmdkMetaData m_data; >>> uint32_t total_alloc_clusters = 0; >>> >>> @@ -1782,9 +1783,17 @@ static int vmdk_pwritev(BlockDriverState *bs, >>> uint64_t offset, >>> if (!extent) { >>> return -EIO; >>> } >>> +extent_size = extent->end_sector * BDRV_SECTOR_SIZE; >> >> Maybe extent_end to be more accurate? > > Done > >>> + >>> offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); >>> -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE >>> - - offset_in_cluster); >>> + >>> +/* truncate n_bytes to first cluster because we need to perform >>> COW */ >> >> Makes sense, but shouldn't this be squashed into patch patch 3? Because it >> looks >> like it is fixing an intermediate bug. > > Did you mean that I should merge this whole patch into patch 3? Maybe > moving it before patch 3 rather than squashing it make more sense? Instead I have moved it before patch 3 in v4 Ashijeet
Re: [Qemu-block] [PATCH v3 6/6] vmdk: Update metadata for multiple clusters
On Fri, Apr 21, 2017 at 1:45 PM, Fam Zheng wrote: > On Sat, 04/01 20:14, Ashijeet Acharya wrote: >> Include a next pointer in VmdkMetaData struct to point to the previous >> allocated L2 table. Modify vmdk_L2update to start updating metadata for >> allocation of multiple clusters at once. >> >> Signed-off-by: Ashijeet Acharya > > This is the metadata part of the coalesed allocation. I think patch 3 is > functionally incomplete without these changes, and is perhaps broken because > metadata is not handled correctly. > > Such an "intermediate functional regression" is not good in a series, which we > need to avoid. I have moved this patch right after patch 3 because merging both will result in an unnecessary huge patch. Will that work? > >> --- >> block/vmdk.c | 136 >> --- >> 1 file changed, 111 insertions(+), 25 deletions(-) >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> index 9456ddd..c7675db 100644 >> --- a/block/vmdk.c >> +++ b/block/vmdk.c >> @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { >> int valid; >> uint32_t *l2_cache_entry; >> uint32_t nb_clusters; >> +uint32_t offset; >> +struct VmdkMetaData *next; >> } VmdkMetaData; >> >> typedef struct VmdkGrainMarker { >> @@ -263,6 +265,12 @@ static inline uint64_t size_to_clusters(VmdkExtent >> *extent, uint64_t size) >> return (DIV_ROUND_UP(size + round_off_size, BDRV_SECTOR_SIZE * 128) - >> 1); >> } >> >> +static inline int64_t vmdk_align_offset(int64_t offset, int n) >> +{ >> +offset = (offset + n - 1) & ~(n - 1); >> +return offset; >> +} >> + >> static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) >> { >> char *desc; >> @@ -1037,29 +1045,88 @@ static void vmdk_refresh_limits(BlockDriverState >> *bs, Error **errp) >> } >> } >> >> -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, >> - uint32_t offset) >> +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, >> + VmdkMetaData *m_data, bool zeroed) >> { >> -offset = cpu_to_le32(offset); >> +int i; >> +uint32_t offset, 
temp_offset; >> +int *l2_table_array; >> +int l2_array_size; >> + >> +if (zeroed) { >> +temp_offset = VMDK_GTE_ZEROED; >> +} else { >> +temp_offset = m_data->offset; >> +} >> + >> +temp_offset = cpu_to_le32(temp_offset); >> + >> +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; >> +l2_table_array = qemu_try_blockalign(extent->file->bs, >> +vmdk_align_offset(l2_array_size, 512)); > > Indentation is off. > > Use QEMU_ALIGN_UP, instead of vmdk_align_offset. > > 512 is a magic number, use BDRV_SECTOR_SIZE. Done > >> +if (l2_table_array == NULL) { >> +return VMDK_ERROR; >> +} >> +memset(l2_table_array, 0, vmdk_align_offset(l2_array_size, 512)); >> + >> /* update L2 table */ >> +offset = temp_offset; >> +for (i = 0; i < m_data->nb_clusters; i++) { >> +l2_table_array[i] = offset; >> +if (!zeroed) { >> +offset += 128; > > Something is going wrong here with endianness on BE host, I believe. I have changed temp_offset to LE above, wouldn't that be enough. I am not sure. > >> +} >> +} >> + >> if (bdrv_pwrite_sync(extent->file, >> -((int64_t)m_data->l2_offset * 512) >> -+ (m_data->l2_index * sizeof(offset)), >> -&offset, sizeof(offset)) < 0) { >> +((int64_t)m_data->l2_offset * 512) >> ++ ((m_data->l2_index) * sizeof(offset)), >> + l2_table_array, l2_array_size) < 0) { > > You can fix the indentation while changing these lines. If not, don't change > it, > or at least don't make it uglier. I have aligned it, if it still looks ugly in v4, I will revert. > >> return VMDK_ERROR; >> } >> + >> /* update backup L2 table */ >> if (extent->l1_backup_table_offset != 0) { >> m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; >> if (bdrv_pwrite_sync(extent->file, >> ((int64_t)m_data->l2_offset * 512) >> -
Re: [Qemu-block] [PATCH v3 5/6] vmdk: Set maximum bytes allocated in one cycle
On Wed, Apr 19, 2017 at 6:30 PM, Fam Zheng wrote: > On Sat, 04/01 20:14, Ashijeet Acharya wrote: >> Set the maximum bytes allowed to get allocated at once to be not more >> than the extent size boundary to handle writes at two separate extents >> appropriately. >> >> Signed-off-by: Ashijeet Acharya >> --- >> block/vmdk.c | 13 +++-- >> 1 file changed, 11 insertions(+), 2 deletions(-) >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> index a8babd7..9456ddd 100644 >> --- a/block/vmdk.c >> +++ b/block/vmdk.c >> @@ -1767,6 +1767,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t >> offset, >> int64_t offset_in_cluster, n_bytes; >> uint64_t cluster_offset; >> uint64_t bytes_done = 0; >> +uint64_t extent_size; >> VmdkMetaData m_data; >> uint32_t total_alloc_clusters = 0; >> >> @@ -1782,9 +1783,17 @@ static int vmdk_pwritev(BlockDriverState *bs, >> uint64_t offset, >> if (!extent) { >> return -EIO; >> } >> +extent_size = extent->end_sector * BDRV_SECTOR_SIZE; > > Maybe extent_end to be more accurate? Done >> + >> offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); >> -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE >> - - offset_in_cluster); >> + >> +/* truncate n_bytes to first cluster because we need to perform COW >> */ > > Makes sense, but shouldn't this be squashed into patch patch 3? Because it > looks > like it is fixing an intermediate bug. Did you mean that I should merge this whole patch into patch 3? Maybe moving it before patch 3 rather than squashing it make more sense? Ashijeet
Re: [Qemu-block] [PATCH v3 4/6] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
On Wed, Apr 19, 2017 at 18:27 Fam Zheng wrote: > On Sat, 04/01 20:14, Ashijeet Acharya wrote: > > Rename the existing get_cluster_offset() function to > > vmdk_get_cluster_offset() and have it make use of the new > > get_cluster_table() to load the cluster tables. Also, it is no longer > > used to allocate new clusters and hence perform COW. Make the necessary > > renames at all the occurrences of get_cluster_offset(). > > > > Signed-off-by: Ashijeet Acharya > > --- > > block/vmdk.c | 117 > +++ > > 1 file changed, 21 insertions(+), 96 deletions(-) > > This is definitely more than a function rename, like I said in reply to > patch 3, > it could probably be split to smaller ones (rename, and others, for > example), > and reordered to make reviewing easier. Maybe, because I have also refactored it to have vmdk_get_cluster_offset() make use of the get_cluster_table() (and friends) to avoid duplication. I will try to split it as 1. Rename 2. Refactor it to make use of get_cluster_table() by moving that out of patch 3 as of now. Will that work? I think this will also keep the compiler happy while reviewing. Ashijeet
Re: [Qemu-block] [PATCH v3 3/6] vmdk: New functions to assist allocating multiple clusters
On Wed, Apr 19, 2017 at 18:26 Fam Zheng wrote: > On Sat, 04/01 20:14, Ashijeet Acharya wrote: > > Move the cluster tables loading code out of the existing > > get_cluster_offset() function to avoid code duplication and implement it > > in separate get_cluster_table() and vmdk_L2load() functions. > > > > Introduce two new helper functions handle_alloc() and > > vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple > > clusters at once starting from a given offset on disk and performs COW > > if necessary for first and last allocated clusters. > > vmdk_alloc_cluster_offset() helps to return the offset of the first of > > the many newly allocated clusters. Also, provide proper documentation > > for both. > > > > Signed-off-by: Ashijeet Acharya > > --- > > block/vmdk.c | 337 > ++- > > 1 file changed, 308 insertions(+), 29 deletions(-) > > > > diff --git a/block/vmdk.c b/block/vmdk.c > > index 73ae786..e5a289d 100644 > > --- a/block/vmdk.c > > +++ b/block/vmdk.c > > @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { > > unsigned int l2_offset; > > int valid; > > uint32_t *l2_cache_entry; > > +uint32_t nb_clusters; > > } VmdkMetaData; > > > > typedef struct VmdkGrainMarker { > > @@ -254,6 +255,14 @@ static inline uint64_t > vmdk_find_offset_in_cluster(VmdkExtent *extent, > > return extent_relative_offset % cluster_size; > > } > > > > +static inline uint64_t size_to_clusters(VmdkExtent *extent, uint64_t > size) > > +{ > > +uint64_t cluster_size, round_off_size; > > +cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; > > +round_off_size = cluster_size - (size % cluster_size); > > +return DIV_ROUND_UP(size + round_off_size, BDRV_SECTOR_SIZE * 128) > - 1; > > What is (BDRV_SECTOR_SIZE * 128)? Do you mean extent->cluster_size? And > the > function doesn't make sense up to me. > > Just un-inline this to > > DIV_ROUND_UP(size, > extent->cluster_sectors << BDRV_SECTOR_BITS) - 1 > > in the calling site and be done with it. 
> > > +} > > + > > static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) > > { > > char *desc; > > @@ -1028,6 +1037,133 @@ static void vmdk_refresh_limits(BlockDriverState > *bs, Error **errp) > > } > > } > > > > +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, > > + uint32_t offset) > > +{ > > +offset = cpu_to_le32(offset); > > +/* update L2 table */ > > +if (bdrv_pwrite_sync(extent->file, > > +((int64_t)m_data->l2_offset * 512) > > ++ (m_data->l2_index * sizeof(offset)), > > +&offset, sizeof(offset)) < 0) { > > +return VMDK_ERROR; > > +} > > +/* update backup L2 table */ > > +if (extent->l1_backup_table_offset != 0) { > > +m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; > > +if (bdrv_pwrite_sync(extent->file, > > +((int64_t)m_data->l2_offset * 512) > > ++ (m_data->l2_index * sizeof(offset)), > > +&offset, sizeof(offset)) < 0) { > > +return VMDK_ERROR; > > +} > > +} > > +if (m_data->l2_cache_entry) { > > +*m_data->l2_cache_entry = offset; > > +} > > + > > +return VMDK_OK; > > +} > > + > > +/* > > + * vmdk_l2load > > + * > > + * Loads a new L2 table into memory. If the table is in the cache, the > cache > > Not a native speaker, but s/Loads/Load/ feels more nature and consistent > with > other comments. > > > + * is used; otherwise the L2 table is loaded from the image file. > > + * > > + * Returns: > > + * VMDK_OK: on success > > + * VMDK_ERROR:in error cases > > + */ > > +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int > l2_offset, > > + uint32_t **new_l2_table, int *new_l2_index) > > +{ > > +int min_index, i, j; > > +uint32_t *l2_table; > > +uint32_t min_count; > > + > > +for (i = 0; i < L2_CACHE_SIZE; i++) { > > +if (l2_offset == extent->l2_cache_offsets[i]) { > > +/* increment the hit count */ > > +if (++extent->l2_cache_counts[i] == UINT32_MAX)
Re: [Qemu-block] [PATCH v3 1/6] vmdk: Move vmdk_find_offset_in_cluster() to the top
On Sat, Apr 1, 2017 at 8:14 PM, Ashijeet Acharya wrote: > Move the existing vmdk_find_offset_in_cluster() function to the top of > the driver. Also, introduce a new helper function size_to_clusters() > which returns the number of clusters for a given size in bytes. Here, > we leave the last cluster as we need to perform COW for that one. > I will remove the trailing part of the commit message in v4 as there is no size_to_clusters() in this patch anymore, I forgot to update it! Ashijeet
[Qemu-block] [PATCH v3 5/6] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a8babd7..9456ddd 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1767,6 +1767,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, int64_t offset_in_cluster, n_bytes; uint64_t cluster_offset; uint64_t bytes_done = 0; +uint64_t extent_size; VmdkMetaData m_data; uint32_t total_alloc_clusters = 0; @@ -1782,9 +1783,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_size = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_size - offset); +} ret = vmdk_alloc_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v3 6/6] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 136 --- 1 file changed, 111 insertions(+), 25 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 9456ddd..c7675db 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -263,6 +265,12 @@ static inline uint64_t size_to_clusters(VmdkExtent *extent, uint64_t size) return (DIV_ROUND_UP(size + round_off_size, BDRV_SECTOR_SIZE * 128) - 1); } +static inline int64_t vmdk_align_offset(int64_t offset, int n) +{ +offset = (offset + n - 1) & ~(n - 1); +return offset; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1037,29 +1045,88 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +temp_offset = cpu_to_le32(temp_offset); + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, +vmdk_align_offset(l2_array_size, 512)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, vmdk_align_offset(l2_array_size, 512)); + /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = offset; +if (!zeroed) { +offset += 128; +} +} + if (bdrv_pwrite_sync(extent->file, 
-((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { +((int64_t)m_data->l2_offset * 512) ++ ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } + /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, ((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { ++ ((m_data->l2_index) * sizeof(offset)), +l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = offset; +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += 128; +} +} +} + +qemu_vfree(l2_table_array); +return VMDK_OK; +} + +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { +VmdkMetaData *next; + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +next = m_data->next; +m_data = next; } return VMDK_OK; @@ -1271,7 +1338,7 @@ exit: */ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, -int64_t *bytes, VmdkMetaData *m_data, +int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *total_alloc_clusters) { int l1_index, l2_offset, l2_index; @@ -1280,6 +1347,7 @@ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1330,13 +1398,21 @@ static int handle_alloc(BlockDriverState *bs, VmdkExtent *exten
[Qemu-block] [PATCH v3 4/6] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() function to vmdk_get_cluster_offset() and have it make use of the new get_cluster_table() to load the cluster tables. Also, it is no longer used to allocate new clusters and hence perform COW. Make the necessary renames at all the occurrences of get_cluster_offset(). Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 117 +++ 1 file changed, 21 insertions(+), 96 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index e5a289d..a8babd7 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1419,7 +1419,7 @@ static int vmdk_alloc_cluster_offset(BlockDriverState *bs, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1427,84 +1427,34 @@ static int vmdk_alloc_cluster_offset(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up in L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. 
+ * VMDK_ERROR if failed */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + uint64_t offset, + uint64_t *cluster_offset) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -} -l2_table = extent->l2_cache + (i * extent->l2_size); -goto found; -} -} -/* not found: load a new entry in the least used one */ -min_index = 0; -min_count = 0x; -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (extent->l2_cache_counts[i] < min_count) { -min_count = extent->l2_cache_counts[i]; -min_index = i; -} -} -l2_table = extent->l2_cache + (min_index * extent->l2_size); -if (bdrv_pread(extent->file, -(int64_t)l2_offset * 512, -l2_table, -extent->l2_size * sizeof(uint32_t) -) != extent->l2_size * sizeof(uint32_t)) { -return VMDK_ERROR; +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; } -extent->l2_cache_offsets[min_index] = l2_offset; -extent->l2_cache_counts[min_index] = 1; - found: -l2_index = ((offset 
>> 9) / extent->cluster_sectors) % extent->l2_size; cluster_sector = le32_to_cpu(l2_table[l2_index]); if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { @@ -1512,31 +1462,9 @@ static int get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? VMDK_ZE
[Qemu-block] [PATCH v3 3/6] vmdk: New functions to assist allocating multiple clusters
Move the cluster tables loading code out of the existing get_cluster_offset() function to avoid code duplication and implement it in separate get_cluster_table() and vmdk_L2load() functions. Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 337 ++- 1 file changed, 308 insertions(+), 29 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..e5a289d 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -254,6 +255,14 @@ static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, return extent_relative_offset % cluster_size; } +static inline uint64_t size_to_clusters(VmdkExtent *extent, uint64_t size) +{ +uint64_t cluster_size, round_off_size; +cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; +round_off_size = cluster_size - (size % cluster_size); +return DIV_ROUND_UP(size + round_off_size, BDRV_SECTOR_SIZE * 128) - 1; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1028,6 +1037,133 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + uint32_t offset) +{ +offset = cpu_to_le32(offset); +/* update L2 table */ +if (bdrv_pwrite_sync(extent->file, +((int64_t)m_data->l2_offset * 512) ++ (m_data->l2_index * sizeof(offset)), +&offset, sizeof(offset)) < 0) { +return VMDK_ERROR; +} +/* update backup L2 table */ +if (extent->l1_backup_table_offset != 
0) { +m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; +if (bdrv_pwrite_sync(extent->file, +((int64_t)m_data->l2_offset * 512) ++ (m_data->l2_index * sizeof(offset)), +&offset, sizeof(offset)) < 0) { +return VMDK_ERROR; +} +} +if (m_data->l2_cache_entry) { +*m_data->l2_cache_entry = offset; +} + +return VMDK_OK; +} + +/* + * vmdk_l2load + * + * Loads a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * get_cluster_table + * + * for a given offset, load (and allocate if needed) the l2 table. 
+ * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index,
[Qemu-block] [PATCH v3 2/6] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v3 0/6] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html This series helps to optimize the I/O performance of VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patch 2 performs a simple function re-naming task. Patch 3 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Also make loading of metadata tables easier and avoid code duplication. Patch 4 performs a simple function re-naming task and re-factors it to make use of new metadata functions to avoid code duplication. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 changes the metadata update code to update the L2 tables for multiple clusters at once. Note: v3 has an addition of new optimization of calling bdrv_pwrite_sync() only once for atmost 512 clusters, as a result performance has increased to a great extent (earlier till v2 it was 29%). Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Note: These patches pass all 41/41 tests suitable for the VMDK driver. 
Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of atmost 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (6): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: New functions to assist allocating multiple clusters vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: Update metadata for multiple clusters block/vmdk.c | 608 --- 1 file changed, 456 insertions(+), 152 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v3 1/6] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Also, introduce a new helper function size_to_clusters() which returns the number of clusters for a given size in bytes. Here, we leave the last cluster as we need to perform COW for that one. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
Re: [Qemu-block] [Qemu-devel] [PATCH v2 3/7] vmdk: Factor out metadata loading code out of get_cluster_offset()
On Fri, Mar 31, 2017 at 11:33 AM, Fam Zheng wrote: > On Sat, 03/25 16:48, Ashijeet Acharya wrote: >> Move the cluster tables loading code out of the existing >> get_cluster_offset() function and implement it in separate > > Now it's renamed to vmdk_perform_cow() in previous patch, the commit message > following it should be updated. No, I think you confused get_cluster_offset() with get_whole_cluster() here :-) I have a separate patch 5/7 which renames get_cluster_offset(), so the commit message is fine till this stage. > >> get_cluster_table() and vmdk_L2load() functions. This patch will help >> us avoid code duplication in future patches of this series. >> >> Signed-off-by: Ashijeet Acharya >> --- >> block/vmdk.c | 99 >> >> 1 file changed, 99 insertions(+) >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> index f5fda2c..a42322e 100644 >> --- a/block/vmdk.c >> +++ b/block/vmdk.c >> @@ -1037,6 +1037,105 @@ static void vmdk_refresh_limits(BlockDriverState >> *bs, Error **errp) >> } >> >> /* >> + * vmdk_L2load > > Personally, I think vmdk_l2load is good enough, the upper case doesn't add > readability but makes it slightly harder to type. Done. > >> + * >> + * Loads a new L2 table into memory. If the table is in the cache, the cache >> + * is used; otherwise the L2 table is loaded from the image file. >> + * >> + * Returns: >> + * VMDK_OK: on success >> + * VMDK_ERROR:in error cases >> + */ >> +static int vmdk_L2load(VmdkExtent *extent, uint64_t offset, int l2_offset, >> + uint32_t **new_l2_table, int *new_l2_index) >> +{ >> +int min_index, i, j; >> +uint32_t *l2_table; >> +uint32_t min_count; >> + >> +for (i = 0; i < L2_CACHE_SIZE; i++) { >> +if (l2_offset == extent->l2_cache_offsets[i]) { >> +/* increment the hit count */ >> +if (++extent->l2_cache_counts[i] == 0x) { > > Please use UINT32_MAX. Done. 
> >> +for (j = 0; j < L2_CACHE_SIZE; j++) { >> +extent->l2_cache_counts[j] >>= 1; >> +} >> +} >> +l2_table = extent->l2_cache + (i * extent->l2_size); >> +goto found; >> +} >> +} >> +/* not found: load a new entry in the least used one */ >> +min_index = 0; >> +min_count = 0x; > > Please use UINT32_MAX. Done. > >> +for (i = 0; i < L2_CACHE_SIZE; i++) { >> +if (extent->l2_cache_counts[i] < min_count) { >> +min_count = extent->l2_cache_counts[i]; >> +min_index = i; >> +} >> +} >> +l2_table = extent->l2_cache + (min_index * extent->l2_size); >> +if (bdrv_pread(extent->file, >> +(int64_t)l2_offset * 512, >> +l2_table, >> +extent->l2_size * sizeof(uint32_t) >> +) != extent->l2_size * sizeof(uint32_t)) { >> +return VMDK_ERROR; >> +} >> + >> +extent->l2_cache_offsets[min_index] = l2_offset; >> +extent->l2_cache_counts[min_index] = 1; >> +found: >> +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % >> extent->l2_size; >> +*new_l2_table = l2_table; >> + >> +return VMDK_OK; >> +} >> + >> +/* >> + * get_cluster_table >> + * >> + * for a given offset, load (and allocate if needed) the l2 table. >> + * >> + * Returns: >> + * VMDK_OK:on success >> + * >> + * VMDK_UNALLOC: if cluster is not mapped >> + * >> + * VMDK_ERROR: in error cases >> + */ >> +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, >> + int *new_l1_index, int *new_l2_offset, >> + int *new_l2_index, uint32_t **new_l2_table) > > Again, a static function must be introduced with the code change where it is > used, at least for once. It keeps the compiler happy (-Wunused-function) and > makes reviewing easy. Done. Ashijeet
Re: [Qemu-block] [Qemu-devel] [PATCH v2 7/7] vmdk: Update metadata for multiple clusters
On Fri, Mar 31, 2017 at 2:38 PM, Fam Zheng wrote: > On Fri, 03/31 14:17, Ashijeet Acharya wrote: >> On Fri, Mar 31, 2017 at 12:56 PM, Fam Zheng wrote: >> > On Sat, 03/25 16:48, Ashijeet Acharya wrote: >> >> Include a next pointer in VmdkMetaData struct to point to the previous >> >> allocated L2 table. Modify vmdk_L2update to start updating metadata for >> >> allocation of multiple clusters at once. >> >> >> >> Signed-off-by: Ashijeet Acharya >> >> --- >> >> block/vmdk.c | 131 >> >> ++- >> >> 1 file changed, 102 insertions(+), 29 deletions(-) >> >> >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> >> index 3de8b8f..4517409 100644 >> >> --- a/block/vmdk.c >> >> +++ b/block/vmdk.c >> >> @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { >> >> int valid; >> >> uint32_t *l2_cache_entry; >> >> uint32_t nb_clusters; >> >> +uint32_t offset; >> >> +struct VmdkMetaData *next; >> >> } VmdkMetaData; >> >> >> >> typedef struct VmdkGrainMarker { >> >> @@ -1037,29 +1039,81 @@ static void vmdk_refresh_limits(BlockDriverState >> >> *bs, Error **errp) >> >> } >> >> } >> >> >> >> -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, >> >> - uint32_t offset) >> >> +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, >> >> + VmdkMetaData *m_data, bool zeroed) >> >> { >> >> -offset = cpu_to_le32(offset); >> >> +int i; >> >> +uint32_t offset, temp_offset; >> >> + >> >> +if (zeroed) { >> >> +temp_offset = VMDK_GTE_ZEROED; >> >> +} else { >> >> +temp_offset = m_data->offset; >> >> +} >> >> + >> >> +temp_offset = cpu_to_le32(temp_offset); >> >> + >> >> /* update L2 table */ >> >> -if (bdrv_pwrite_sync(extent->file, >> >> +offset = temp_offset; >> >> +for (i = 0; i < m_data->nb_clusters; i++) { >> >> +if (bdrv_pwrite_sync(extent->file, >> >> ((int64_t)m_data->l2_offset * 512) >> >> -+ (m_data->l2_index * sizeof(offset)), >> >> -&offset, sizeof(offset)) < 0) { >> >> -return VMDK_ERROR; >> >> ++ ((m_data->l2_index + i) * sizeof(offset)), >> >> +&(offset), sizeof(offset)) < 
0) { >> >> +return VMDK_ERROR; >> >> +} >> >> +if (!zeroed) { >> >> +offset += 128; >> >> +} >> >> } >> >> + >> >> /* update backup L2 table */ >> >> +offset = temp_offset; >> >> if (extent->l1_backup_table_offset != 0) { >> >> m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; >> >> -if (bdrv_pwrite_sync(extent->file, >> >> -((int64_t)m_data->l2_offset * 512) >> >> -+ (m_data->l2_index * sizeof(offset)), >> >> -&offset, sizeof(offset)) < 0) { >> >> -return VMDK_ERROR; >> >> +for (i = 0; i < m_data->nb_clusters; i++) { >> >> +if (bdrv_pwrite_sync(extent->file, >> >> +((int64_t)m_data->l2_offset * 512) >> >> ++ ((m_data->l2_index + i) * sizeof(offset)), >> >> +&(offset), sizeof(offset)) < 0) { >> >> +return VMDK_ERROR; >> >> +} >> >> +if (!zeroed) { >> >> +offset += 128; >> >> +} >> >> } >> >> } >> >> + >> >> +offset = temp_offset; >> >> if (m_data->l2_cache_entry) { >> >> -*m_data->l2_cache_entry = offset; >> >> +for (i = 0; i < m_data->nb_clusters; i++) { >> >> +*m_data->l2_cache_entry = offset; >> >> +m_data->l2_cache_entry++; >> >>