Am 02.09.2013 um 12:04 hat Max Reitz geschrieben:
> Add functionality for expanding zero clusters. This is necessary for
> downgrading the image version to one without zero cluster support.
> 
> For non-backed images, this function may also just discard zero clusters
> instead of truly expanding them.
> 
> Signed-off-by: Max Reitz <mre...@redhat.com>
> ---
>  block/qcow2-cluster.c  | 228 +++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2-refcount.c |  29 ++++---
>  block/qcow2.h          |   5 ++
>  3 files changed, 248 insertions(+), 14 deletions(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 2d5aa92..c90fb51 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -1497,3 +1497,231 @@ fail:
>  
>      return ret;
>  }
> +
> +/*
> + * Expands all zero clusters in a specific L1 table (or deallocates them, for
> + * non-backed non-pre-allocated zero clusters).
> + *
> + * expanded_clusters is a bitmap where every bit corresponds to one cluster in
> + * the image file; a bit gets set if the corresponding cluster has been used for
> + * zero expansion (i.e., has been filled with zeroes and is referenced from an
> + * L2 table). nb_clusters contains the total cluster count of the image file,
> + * i.e., the number of bits in expanded_clusters.
> + */
> +static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
> +                                      int l1_size, uint8_t *expanded_clusters,
> +                                      uint64_t nb_clusters)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    bool is_active_l1 = (l1_table == s->l1_table);
> +    uint64_t *l2_table = NULL;
> +    int ret;
> +    int i, j;
> +
> +    if (!is_active_l1) {
> +        /* inactive L2 tables require a buffer to be stored in when loading
> +         * them from disk */
> +        l2_table = qemu_blockalign(bs, s->cluster_size);
> +    }
> +
> +    for (i = 0; i < l1_size; i++) {
> +        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
> +        bool l2_dirty = false;
> +
> +        if (!l2_offset) {
> +            /* unallocated */
> +            continue;
> +        }
> +
> +        if (is_active_l1) {
> +            /* get active L2 tables from cache */
> +            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
> +                                  (void **)&l2_table);
> +        } else {
> +            /* load inactive L2 tables from disk */
> +            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
> +                            (void *)l2_table, s->cluster_sectors);
> +        }
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +
> +        for (j = 0; j < s->l2_size; j++) {
> +            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
> +            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
> +            int cluster_type = qcow2_get_cluster_type(l2_entry);
> +
> +            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
> +                cluster_index = offset >> s->cluster_bits;
> +                assert((cluster_index >= 0) && (cluster_index < nb_clusters));
> +                if (expanded_clusters[cluster_index / 8] &
> +                    (1 << (cluster_index % 8))) {
> +                    /* Probably a shared L2 table; this cluster was a zero
> +                     * cluster which has been expanded, its refcount
> +                     * therefore most likely requires an update. */
> +                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
> +                                                        QCOW2_DISCARD_NEVER);
> +                    if (ret < 0) {
> +                        goto fail;
> +                    }
> +                    /* Since we just increased the refcount, the COPIED flag may
> +                     * no longer be set. */
> +                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
> +                    l2_dirty = true;
> +                }
> +                continue;
> +            }
> +            else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
> +                continue;
> +            }
> +
> +            if (!offset) {
> +                /* not preallocated */
> +                if (!bs->backing_hd) {
> +                    /* not backed; therefore we can simply deallocate the
> +                     * cluster */
> +                    l2_table[j] = 0;
> +                    l2_dirty = true;
> +                    continue;
> +                }
> +
> +                offset = qcow2_alloc_clusters(bs, s->cluster_size);
> +                if (offset < 0) {
> +                    ret = offset;
> +                    goto fail;
> +                }
> +            }
> +
> +            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
> +                                                offset, s->cluster_size);
> +            if (ret < 0) {
> +                qcow2_free_clusters(bs, offset, s->cluster_size,
> +                                    QCOW2_DISCARD_ALWAYS);
> +                goto fail;
> +            }
> +
> +            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
> +                                    s->cluster_sectors);
> +            if (ret < 0) {
> +                qcow2_free_clusters(bs, offset, s->cluster_size,
> +                                    QCOW2_DISCARD_ALWAYS);
> +                goto fail;
> +            }
> +
> +            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
> +            l2_dirty = true;
> +
> +            cluster_index = offset >> s->cluster_bits;
> +            assert((cluster_index >= 0) && (cluster_index < nb_clusters));
> +            expanded_clusters[cluster_index / 8] |= 1 << (cluster_index % 8);
> +        }
> +
> +        if (is_active_l1) {
> +            if (l2_dirty) {
> +                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
> +                qcow2_cache_depends_on_flush(s->l2_table_cache);
> +            }
> +            ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
> +            if (ret < 0) {
> +                l2_table = NULL;
> +                goto fail;
> +            }
> +        } else {
> +            if (l2_dirty) {
> +                ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT &
> +                        ~(QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2), l2_offset,
> +                        s->cluster_size);
> +                if (ret < 0) {
> +                    goto fail;
> +                }
> +
> +                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
> +                                 (void *)l2_table, s->cluster_sectors);
> +                if (ret < 0) {
> +                    goto fail;
> +                }
> +            }
> +        }
> +    }
> +
> +    ret = 0;
> +
> +fail:
> +    if (l2_table) {
> +        if (!is_active_l1) {
> +            qemu_vfree(l2_table);
> +        } else {
> +            if (ret < 0) {
> +                qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
> +            } else {
> +                ret = qcow2_cache_put(bs, s->l2_table_cache,
> +                                      (void **)&l2_table);
> +            }
> +        }
> +    }
> +    return ret;
> +}
> +
> +/*
> + * For backed images, expands all zero clusters on the image. For non-backed
> + * images, deallocates all non-pre-allocated zero clusters (and claims the
> + * allocation for pre-allocated ones). This is important for downgrading to a
> + * qcow2 version which doesn't yet support metadata zero clusters.
> + */
> +int qcow2_expand_zero_clusters(BlockDriverState *bs)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    uint64_t *l1_table = NULL;
> +    uint64_t nb_clusters;
> +    uint8_t *expanded_clusters;
> +    int ret;
> +    int i, j;
> +
> +    nb_clusters = bs->total_sectors >> (s->cluster_bits - BDRV_SECTOR_BITS);

Shouldn't this value actually be rounded up?

> +    expanded_clusters = g_malloc0(nb_clusters / 8);

Just like the size here?
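Something like the following untested sketch is what I have in mind,
using the DIV_ROUND_UP macro (field names as in the patch;
s->cluster_sectors is the number of sectors per cluster):

    /* Round up so that a trailing partial cluster still gets a bit in
     * the bitmap */
    nb_clusters = DIV_ROUND_UP(bs->total_sectors, s->cluster_sectors);

    /* Round up here, too, so that the bits for the last clusters don't
     * fall into an unallocated byte */
    expanded_clusters = g_malloc0(DIV_ROUND_UP(nb_clusters, 8));

As it stands, an image whose size is not a multiple of the cluster size
ends up with a bitmap that doesn't cover the trailing partial cluster.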
> +
> +    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
> +                                     expanded_clusters, nb_clusters);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    /* Inactive L1 tables may point to active L2 tables - therefore it is
> +     * necessary to flush the L2 table cache before trying to access the L2
> +     * tables pointed to by inactive L1 entries (else we might try to expand
> +     * zero clusters that have already been expanded) */
> +    ret = qcow2_cache_flush(bs, s->l2_table_cache);
> +    if (ret < 0) {
> +        goto fail;
> +    }

I'm afraid this is in fact not enough. If you modify an active L2 table
through an inactive L1 table, you bypass the cache and use bdrv_write();
the cache will still have a stale copy of the table afterwards.

Can't we just access inactive L2 tables through the cache as well? Or
could that get us into trouble in other places? If so, we'd have to
implement a function that flushes _and empties_ the cache. (And then we
could always go the bdrv_read()/bdrv_write() path for inactive tables,
so there's some code to be saved either way.)
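If accessing them through the cache turns out to be a problem, a
flush-and-empty helper could look roughly like this (an untested sketch
against the Qcow2Cache internals in qcow2-cache.c; the name
qcow2_cache_empty() is just a suggestion):

    int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c)
    {
        int ret, i;

        /* Write all dirty entries back to the image file first */
        ret = qcow2_cache_flush(bs, c);
        if (ret < 0) {
            return ret;
        }

        /* Then invalidate every entry so that later bdrv_read()/
         * bdrv_write() accesses can't see stale table copies */
        for (i = 0; i < c->size; i++) {
            assert(c->entries[i].ref == 0);
            c->entries[i].offset = 0;
            c->entries[i].cache_hits = 0;
        }

        return 0;
    }

It would have to be called after the active L1 table has been processed
and before the inactive ones are touched.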
> +    for (i = 0; i < s->nb_snapshots; i++) {
> +        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
> +                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
> +
> +        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
> +
> +        ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset /
> +                BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +
> +        for (j = 0; j < s->snapshots[i].l1_size; j++) {
> +            be64_to_cpus(&l1_table[j]);
> +        }
> +
> +        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
> +                                         expanded_clusters, nb_clusters);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    }
> +
> +    ret = 0;
> +
> +fail:
> +    g_free(expanded_clusters);
> +    g_free(l1_table);
> +    return ret;
> +}

Kevin