On 05/08/2012 01:34 AM, Dong Xu Wang wrote:
Provide a new file format: add-cow. The usage can be found in add-cow.txt of
this patch.
CC: Kevin Wolf<kw...@redhat.com>
CC: Stefan Hajnoczi<stefa...@linux.vnet.ibm.com>
Signed-off-by: Dong Xu Wang<wdon...@linux.vnet.ibm.com>
You should split out the spec to be the first patch. That makes it easier for
people to review the specification independent of the code and also makes
subsequent code review easier.
---
Makefile.objs | 1 +
block.c | 2 +-
block.h | 1 +
block/add-cow-cache.c | 193 +++++++++++++++++++++
block/add-cow.c | 446 ++++++++++++++++++++++++++++++++++++++++++++++++
block/add-cow.h | 83 +++++++++
block_int.h | 1 +
docs/specs/add-cow.txt | 68 ++++++++
qemu-img.c | 39 +++++
9 files changed, 833 insertions(+), 1 deletion(-)
create mode 100644 block/add-cow-cache.c
create mode 100644 block/add-cow.c
create mode 100644 block/add-cow.h
create mode 100644 docs/specs/add-cow.txt
diff --git a/Makefile.objs b/Makefile.objs
index 70c5c79..10c5c52 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -52,6 +52,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o
dmg.o bochs.o vpc.o vv
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
qcow2-cache.o
block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-nested-y += qed-check.o
+block-nested-y += add-cow.o add-cow-cache.o
block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
block-nested-y += stream.o
block-nested-$(CONFIG_WIN32) += raw-win32.o
diff --git a/block.c b/block.c
index 43c794c..206860c 100644
--- a/block.c
+++ b/block.c
@@ -196,7 +196,7 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
}
/* check if the path starts with "<protocol>:" */
-static int path_has_protocol(const char *path)
+int path_has_protocol(const char *path)
{
#ifdef _WIN32
if (is_windows_drive(path) ||
diff --git a/block.h b/block.h
index f163e54..f74c79e 100644
--- a/block.h
+++ b/block.h
@@ -319,6 +319,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size,
QEMUSnapshotInfo *sn);
char *get_human_readable_size(char *buf, int buf_size, int64_t size);
int path_is_absolute(const char *path);
+int path_has_protocol(const char *path);
void path_combine(char *dest, int dest_size,
const char *base_path,
const char *filename);
diff --git a/block/add-cow-cache.c b/block/add-cow-cache.c
new file mode 100644
index 0000000..3ae23c1
--- /dev/null
+++ b/block/add-cow-cache.c
@@ -0,0 +1,193 @@
+/*
+ * Cache For QEMU ADD-COW Disk Format
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
This is not a valid copyright block. Look at hw/virtio.c for an example. If
you want to do LGPL instead of GPL, it should also be 2.1 or later.
+#include "block_int.h"
+#include "qemu-common.h"
+#include "add-cow.h"
+
+/* Based on qcow2-cache.c */
If the code is based on qcow2, then you need to preserve the qcow2 copyrights in
this file.
+/*
+ * Allocate a bitmap cache with num_tables slots.
+ *
+ * Every slot gets its own block-aligned buffer of ADD_COW_CACHE_ENTRY_SIZE
+ * bytes and starts out unused (offset == -1).  The writethrough flag is only
+ * recorded here.
+ */
+AddCowCache *add_cow_cache_create(BlockDriverState *bs, int num_tables,
+                                  bool writethrough)
+{
+    AddCowCache *cache = g_malloc0(sizeof(*cache));
+    int slot;
+
+    cache->entry_size = ADD_COW_CACHE_ENTRY_SIZE;
+    cache->writethrough = writethrough;
+    cache->size = num_tables;
+    cache->entries = g_malloc0(sizeof(*cache->entries) * num_tables);
+
+    for (slot = 0; slot < cache->size; slot++) {
+        AddCowCachedTable *entry = &cache->entries[slot];
+
+        entry->table = qemu_blockalign(bs, cache->entry_size);
+        entry->offset = -1;
+    }
+
+    return cache;
+}
+
+/*
+ * Release every table buffer owned by the cache, then the cache itself.
+ * Nothing is written back here, dirty or not.  Always returns 0.
+ */
+int add_cow_cache_destroy(BlockDriverState *bs, AddCowCache *c)
+{
+    int slot = 0;
+
+    while (slot < c->size) {
+        qemu_vfree(c->entries[slot].table);
+        slot++;
+    }
+
+    g_free(c->entries);
+    g_free(c);
+    return 0;
+}
+
+/*
+ * Write cache slot i back to the bitmap area of bs->file.
+ *
+ * Clean or unused slots are skipped.  The write length is clipped to what
+ * is left of the bitmap (s->bitmap_size) after the slot's offset.  Returns 0
+ * on success or the negative value reported by bdrv_pwrite().
+ */
+static int add_cow_cache_entry_flush(BlockDriverState *bs,
+                                     AddCowCache *c,
+                                     int i)
+{
+    BDRVAddCowState *s = bs->opaque;
+    AddCowCachedTable *entry = &c->entries[i];
+    int ret;
+
+    if (entry->offset == -1 || !entry->dirty) {
+        return 0;
+    }
+
+    /* The bitmap follows the header, hence the sizeof(AddCowHeader) bias */
+    ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + entry->offset,
+                      entry->table,
+                      MIN(c->entry_size, s->bitmap_size - entry->offset));
This is a synchronous I/O function. We shouldn't introduce new formats that
have *any* synchronous I/O in them.
Honestly, I think using coroutines for this format is a mistake. It's simple
enough that doing a state machine would be straight forward enough.
+    if (ret < 0) {
+        return ret;
+    }
+
+    entry->dirty = false;
+    return 0;
+}
+
+/*
+ * Write all dirty bitmap tables back to bs->file and flush it.
+ *
+ * The raw image (s->image_hd) is flushed first, presumably so that the data
+ * a bitmap bit refers to is on disk before the bit itself is written.
+ *
+ * Returns 0 on success or a negative errno value.  If flushing the raw image
+ * fails, that error is returned and no table is written.  If table writes
+ * fail, the last error is reported unless -ENOSPC was already recorded; the
+ * final flush of bs->file is skipped in that case.
+ */
+int add_cow_cache_flush(BlockDriverState *bs, AddCowCache *c)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int result = 0;
+    int ret;
+    int i;
+
+    ret = bdrv_flush(s->image_hd);
+    if (ret < 0) {
+        /* Propagate the failure instead of reporting success (result == 0) */
+        return ret;
+    }
+
+    for (i = 0; i < c->size; i++) {
+        ret = add_cow_cache_entry_flush(bs, c, i);
+        if (ret < 0 && result != -ENOSPC) {
+            result = ret;
+        }
+    }
+
+    if (result == 0) {
+        ret = bdrv_flush(bs->file);
+        if (ret < 0) {
+            result = ret;
+        }
+    }
+
+    return result;
+}
+
+/*
+ * Pick the slot to evict: the first slot with the smallest hit counter.
+ *
+ * As a side effect every slot's hit counter is halved.  Aborts if the cache
+ * has no slots at all.
+ */
+static int add_cow_cache_find_entry_to_replace(AddCowCache *c)
+{
+    int victim = -1;
+    int fewest_hits = INT_MAX;
+    int slot;
+
+    for (slot = 0; slot < c->size; slot++) {
+        AddCowCachedTable *entry = &c->entries[slot];
+
+        if (entry->cache_hits < fewest_hits) {
+            fewest_hits = entry->cache_hits;
+            victim = slot;
+        }
+
+        /* Age the counter after it has been compared */
+        entry->cache_hits /= 2;
+    }
+
+    if (victim == -1) {
+        abort();
+    }
+    return victim;
+}
+
+/*
+ * Return, through *table, the cached bitmap chunk that starts at byte
+ * `offset` of the bitmap, loading it from bs->file on a miss.
+ *
+ * On a miss a victim slot is chosen, written back if dirty, and refilled
+ * with c->entry_size bytes read from sizeof(AddCowHeader) + offset.  Returns
+ * 0 on success or a negative value from the write-back or the read.
+ */
+static int add_cow_cache_do_get(BlockDriverState *bs, AddCowCache *c,
+                                uint64_t offset, void **table)
+{
+    int slot, ret;
+
+    /* Look for a slot that already holds this chunk */
+    for (slot = 0; slot < c->size; slot++) {
+        if (c->entries[slot].offset == offset) {
+            break;
+        }
+    }
+
+    if (slot == c->size) {
+        /* Miss: recycle a slot */
+        slot = add_cow_cache_find_entry_to_replace(c);
+        if (slot < 0) {
+            return slot;
+        }
+
+        ret = add_cow_cache_entry_flush(bs, c, slot);
+        if (ret < 0) {
+            return ret;
+        }
+
+        /* Mark the slot unused while its contents are being replaced */
+        c->entries[slot].offset = -1;
+        ret = bdrv_pread(bs->file, sizeof(AddCowHeader) + offset,
+                         c->entries[slot].table, c->entry_size);
+        if (ret < 0) {
+            return ret;
+        }
+
+        c->entries[slot].cache_hits = 32;
+        c->entries[slot].offset = offset;
+    }
+
+    c->entries[slot].cache_hits++;
+    *table = c->entries[slot].table;
+
+    return 0;
+}
+
+/*
+ * Fetch the cached bitmap chunk that covers sector_num.
+ *
+ * The sector is converted to its byte position in the bitmap, which is then
+ * rounded down to a multiple of the cache entry size.
+ */
+int add_cow_cache_get(BlockDriverState *bs, AddCowCache *c,
+                      uint64_t sector_num, void **table)
+{
+    /* one bitmap byte covers 8 clusters, i.e. 8 * SECTORS_PER_CLUSTER sectors */
+    uint64_t chunk_start = offset_in_bitmap(sector_num);
+
+    chunk_start &= ~(c->entry_size - 1);
+    return add_cow_cache_do_get(bs, c, chunk_start, table);
+}
+
+/*
+ * Flag the slot whose buffer is `table` as dirty.
+ * Aborts if `table` does not belong to this cache.
+ */
+void add_cow_cache_entry_mark_dirty(AddCowCache *c, void *table)
+{
+    int slot;
+
+    for (slot = 0; slot < c->size; slot++) {
+        if (c->entries[slot].table == table) {
+            c->entries[slot].dirty = true;
+            return;
+        }
+    }
+
+    abort();
+}
+
+/*
+ * Change the cache's writethrough flag and return its previous value.
+ * Switching from off to on flushes the cache first; the result of that
+ * flush is not checked.
+ */
+bool add_cow_cache_set_writethrough(BlockDriverState *bs, AddCowCache *c,
+                                    bool enable)
+{
+    bool previous = c->writethrough;
+
+    if (enable && !previous) {
+        add_cow_cache_flush(bs, c);
+    }
+    c->writethrough = enable;
+
+    return previous;
+}
This cache confuses me a bit. It's an LRU? Couldn't we share some code here
with the QED cache?
diff --git a/block/add-cow.c b/block/add-cow.c
new file mode 100644
index 0000000..8bf27ab
--- /dev/null
+++ b/block/add-cow.c
@@ -0,0 +1,446 @@
+/*
+ * QEMU ADD-COW Disk Format
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ * Dong Xu Wang<wdon...@linux.vnet.ibm.com>
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include "add-cow.h"
+
+/*
+ * Probe whether buf holds the start of an add-cow file.
+ *
+ * Returns 100 when both the magic and the version match, 0 otherwise.  A
+ * buffer too short to contain both fields never matches; without this check
+ * the comparison would read bytes past the valid part of buf.
+ */
+static int add_cow_probe(const uint8_t *buf, int buf_size,
+                         const char *filename)
+{
+    const AddCowHeader *header = (const AddCowHeader *)buf;
+
+    if (buf_size < (int)(sizeof(header->magic) + sizeof(header->version))) {
+        return 0;
+    }
+
+    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
+        be32_to_cpu(header->version) == ADD_COW_VERSION) {
+        return 100;
+    }
+    return 0;
+}
+
+/*
+ * Create a new add-cow metadata file.
+ *
+ * Both backing_file and image_file must be given, otherwise -EINVAL is
+ * returned.  The size option is parsed, but the size actually used is the
+ * sector count of the existing image_file.  The header is written through
+ * the protocol layer; the file is then reopened as "add-cow" and truncated
+ * so that the bitmap covers the whole image.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int add_cow_create(const char *filename, QEMUOptionParameter *options)
+{
+    AddCowHeader header;
+    int64_t image_sectors = 0;
+    const char *backing_filename = NULL;
+    const char *image_filename = NULL;
+    int ret;
+    BlockDriverState *bs, *image_bs = NULL;
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_filename = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
+            image_filename = options->value.s;
+        }
+        options++;
+    }
+
+    if (!backing_filename || !image_filename) {
+        error_report("Both backing_file and image_file should be given.");
+        return -EINVAL;
+    }
+
+    /* The raw image only needs to be opened to learn its size */
+    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
+                         | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+    image_sectors = image_bs->total_sectors;
+    bdrv_delete(image_bs);
+
+    ret = bdrv_create_file(filename, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+    if (ret < 0) {
+        return ret;
+    }
+
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be64(ADD_COW_MAGIC);
+    header.version = cpu_to_be32(ADD_COW_VERSION);
+    pstrcpy(header.backing_file, sizeof(header.backing_file),
+            backing_filename);
+    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
+
+    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    /*
+     * Close the protocol-level open before reusing bs for the format-level
+     * open below; NOTE(review): mirrors what qcow2's create path does,
+     * confirm against block.c.
+     */
+    bdrv_close(bs);
+
+    BlockDriver *drv = bdrv_find_format("add-cow");
Please make all declarations at the top level of a function. While this allowed
by C99, it shouldn't been IMHO. We are pretty consistent about this in QEMU.
+    assert(drv != NULL);
+    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    /* Size the bitmap for the image; see bdrv_add_cow_truncate() */
+    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
+    bdrv_delete(bs);
+    return ret;
+}
+
+/*
+ * Open an add-cow image: validate the header, open the raw image it names
+ * and set up the bitmap cache.
+ *
+ * Returns 0 on success.  On failure returns the bdrv_pread()/bdrv_open()
+ * result, -EINVAL for a bad magic, -ENOTSUP for an unknown version, or
+ * -ENOENT when the header names no image file.
+ */
+static int add_cow_open(BlockDriverState *bs, int flags)
+{
+    BDRVAddCowState *s = bs->opaque;
+    const int sectors_per_bitmap_byte = SECTORS_PER_CLUSTER * 8;
+    AddCowHeader hdr;
+    char raw_path[ADD_COW_FILE_LEN];
+    int ret;
+
+    ret = bdrv_pread(bs->file, 0, &hdr, sizeof(hdr));
+    if (ret != sizeof(hdr)) {
+        goto fail;
+    }
+
+    if (be64_to_cpu(hdr.magic) != ADD_COW_MAGIC) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    if (be32_to_cpu(hdr.version) != ADD_COW_VERSION) {
+        char version[64];
+
+        snprintf(version, sizeof(version), "ADD-COW version %d",
+                 be32_to_cpu(hdr.version));
+        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+                      bs->device_name, "add-cow", version);
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) != sizeof(hdr.backing_file));
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file), hdr.backing_file);
+
+    if (hdr.image_file[0] == '\0') {
+        ret = -ENOENT;
+        goto fail;
+    }
+    /* Force termination before the name is used as a C string */
+    hdr.image_file[ADD_COW_FILE_LEN - 1] = '\0';
+
+    /* Names without a protocol prefix are combined with bs->filename */
+    if (path_has_protocol(hdr.image_file)) {
+        pstrcpy(raw_path, sizeof(raw_path), hdr.image_file);
+    } else {
+        path_combine(raw_path, sizeof(raw_path),
+                     bs->filename, hdr.image_file);
+    }
+
+    s->image_hd = bdrv_new("");
+    ret = bdrv_open(s->image_hd, raw_path, flags, bdrv_find_format("raw"));
+    if (ret < 0) {
+        bdrv_delete(s->image_hd);
+        goto fail;
+    }
+
+    bs->total_sectors = s->image_hd->total_sectors;
+    s->cluster_size = ADD_COW_CLUSTER_SIZE;
+    /* One bitmap byte per 8 clusters, rounded up */
+    s->bitmap_size = (bs->total_sectors + sectors_per_bitmap_byte - 1)
+                     / sectors_per_bitmap_byte;
+    s->bitmap_cache = add_cow_cache_create(bs, ADD_COW_CACHE_SIZE,
+                                           !(flags & BDRV_O_CACHE_WB));
+
+    qemu_co_mutex_init(&s->lock);
+    return 0;
+
+fail:
+    if (s->bitmap_cache) {
+        add_cow_cache_destroy(bs, s->bitmap_cache);
+    }
+    return ret;
+}
+
+/* Tear down what add_cow_open() set up: the bitmap cache and the raw image. */
+static void add_cow_close(BlockDriverState *bs)
+{
+    BDRVAddCowState *state = bs->opaque;
+
+    add_cow_cache_destroy(bs, state->bitmap_cache);
+    bdrv_delete(state->image_hd);
+}
+
+/*
+ * Tell whether the cluster containing sector_num has its bit set in the
+ * bitmap.  A failure to load the bitmap chunk is reported as "not
+ * allocated".
+ */
+static bool is_allocated(BlockDriverState *bs, int64_t sector_num)
+{
+    BDRVAddCowState *s = bs->opaque;
+    uint8_t *bitmap = NULL;
+    int64_t cluster = sector_num / SECTORS_PER_CLUSTER;
+
+    if (add_cow_cache_get(bs, s->bitmap_cache,
+                          sector_num, (void **)&bitmap) < 0) {
+        return false;
+    }
+
+    /* Byte index is taken relative to the cached chunk */
+    return bitmap[cluster / 8 % ADD_COW_CACHE_ENTRY_SIZE]
+           & (1 << (cluster % 8));
+}
+
+/*
+ * Report the allocation state of sector_num and, through *num_same, how
+ * many of the next nb_sectors sectors share that state.
+ *
+ * The scan proceeds cluster by cluster and stops at the first cluster whose
+ * state differs.  Returns non-zero when the first cluster is allocated.
+ */
+static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *num_same)
+{
+    int first_state;
+    int64_t cluster;
+
+    if (!nb_sectors) {
+        *num_same = 0;
+        return 0;
+    }
+
+    /* The first cluster may be entered in its middle */
+    cluster = sector_num / SECTORS_PER_CLUSTER;
+    first_state = is_allocated(bs, sector_num);
+    *num_same =
+        MIN(nb_sectors, (cluster + 1) * SECTORS_PER_CLUSTER - sector_num);
+
+    cluster = sector_num / SECTORS_PER_CLUSTER + 1;
+    while (cluster <= (sector_num + nb_sectors - 1) / SECTORS_PER_CLUSTER) {
+        if (is_allocated(bs, cluster * SECTORS_PER_CLUSTER) != first_state) {
+            break;
+        }
+        *num_same = MIN(nb_sectors,
+                        (cluster + 1) * SECTORS_PER_CLUSTER - sector_num);
+        cluster++;
+    }
+
+    return first_state;
+}
+
+/*
+ * Read remaining_sectors sectors starting at sector_num into qiov.
+ *
+ * The request is split into runs with the same allocation state.  Allocated
+ * runs are read from the raw image, unallocated runs from the backing file,
+ * and unallocated runs without a backing file are returned as zeros.  The
+ * state lock is dropped around each nested read.
+ *
+ * Returns 0 on success or the negative value of the failing nested read.
+ */
+static coroutine_fn int add_cow_co_readv(BlockDriverState *bs,
+        int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    BlockDriverState *src;
+    int cur_nr_sectors;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+    int n, ret = 0;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    while (remaining_sectors != 0) {
+        if (add_cow_is_allocated(bs, sector_num, remaining_sectors, &n)) {
+            src = s->image_hd;
+        } else {
+            src = bs->backing_hd;
+        }
+        cur_nr_sectors = n;
+
+        /*
+         * Always point hd_qiov at the current slice of the request.  The
+         * zero-fill case needs this too; otherwise it would clear whatever
+         * slice a previous iteration left in hd_qiov.
+         */
+        qemu_iovec_reset(&hd_qiov);
+        qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                        cur_nr_sectors * BDRV_SECTOR_SIZE);
+
+        if (src) {
+            qemu_co_mutex_unlock(&s->lock);
+            ret = bdrv_co_readv(src, sector_num, cur_nr_sectors, &hd_qiov);
+            qemu_co_mutex_lock(&s->lock);
+            if (ret < 0) {
+                goto fail;
+            }
+        } else {
+            /* Unallocated and no backing file: the data reads as zeros */
+            qemu_iovec_memset(&hd_qiov, 0,
+                              BDRV_SECTOR_SIZE * cur_nr_sectors);
+        }
+
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+/*
+ * Populate sectors [n_start, n_end) of the raw image before their cluster
+ * is marked allocated.
+ *
+ * The data comes from the backing file.  Without a backing file the range
+ * is filled with zeros instead, which matches what add_cow_co_readv()
+ * returns for unallocated data in that case and avoids dereferencing a NULL
+ * bs->backing_hd.  An empty or inverted range is a no-op.
+ *
+ * Returns 0 on success or the negative value of the failing read or write.
+ */
+static int coroutine_fn copy_sectors(BlockDriverState *bs,
+                                     int n_start, int n_end)
+{
+    BDRVAddCowState *s = bs->opaque;
+    QEMUIOVector qiov;
+    struct iovec iov;
+    int n, ret;
+
+    n = n_end - n_start;
+    if (n <= 0) {
+        return 0;
+    }
+
+    iov.iov_len = n * BDRV_SECTOR_SIZE;
+    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    if (bs->backing_hd) {
+        ret = bdrv_co_readv(bs->backing_hd, n_start, n, &qiov);
+        if (ret < 0) {
+            goto out;
+        }
+    } else {
+        memset(iov.iov_base, 0, iov.iov_len);
+    }
+
+    ret = bdrv_co_writev(s->image_hd, n_start, n, &qiov);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = 0;
+out:
+    qemu_vfree(iov.iov_base);
+    return ret;
+}
+
+/*
+ * Write remaining_sectors sectors starting at sector_num.
+ *
+ * Steps:
+ *  1. write the payload to the raw image;
+ *  2. if the request starts or ends in the middle of a cluster that is not
+ *     yet allocated, fill the untouched head/tail of that cluster via
+ *     copy_sectors();
+ *  3. set the bitmap bit of every cluster touched and flush the bitmap
+ *     cache if any bit changed.
+ *
+ * Every exit goes through the common path, so the lock is always released.
+ * A failure to load a bitmap chunk is reported to the caller, and an earlier
+ * error is not overwritten by the result of the final cache flush.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static coroutine_fn int add_cow_co_writev(BlockDriverState *bs,
+        int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int64_t last_sector = sector_num + remaining_sectors - 1;
+    int64_t cluster;
+    int64_t byte;
+    uint8_t *table;
+    uint8_t bit;
+    bool changed = false;
+    int ret, flush_ret;
+
+    qemu_co_mutex_lock(&s->lock);
+    ret = bdrv_co_writev(s->image_hd, sector_num, remaining_sectors, qiov);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* copy content of unmodified sectors in front of the request */
+    if (!is_cluster_head(sector_num) && !is_allocated(bs, sector_num)) {
+        qemu_co_mutex_unlock(&s->lock);
+        ret = copy_sectors(bs, sector_num & ~(SECTORS_PER_CLUSTER - 1),
+                           sector_num);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
+    /* ... and behind it */
+    if (!is_cluster_tail(last_sector) && !is_allocated(bs, last_sector)) {
+        qemu_co_mutex_unlock(&s->lock);
+        ret = copy_sectors(bs, last_sector + 1,
+                           ((last_sector + 1) | (SECTORS_PER_CLUSTER - 1)) + 1);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
+    for (cluster = sector_num / SECTORS_PER_CLUSTER;
+         cluster <= last_sector / SECTORS_PER_CLUSTER;
+         cluster++) {
+        ret = add_cow_cache_get(bs, s->bitmap_cache,
+                                cluster * SECTORS_PER_CLUSTER,
+                                (void **)&table);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        /* table is one cached chunk, so index it relative to that chunk */
+        byte = cluster / 8 % ADD_COW_CACHE_ENTRY_SIZE;
+        bit = 1 << (cluster % 8);
+        if ((table[byte] & bit) == 0) {
+            table[byte] |= bit;
+            changed = true;
+            add_cow_cache_entry_mark_dirty(s->bitmap_cache, table);
+        }
+    }
+    ret = 0;
+fail:
+    if (changed) {
+        flush_ret = add_cow_cache_flush(bs, s->bitmap_cache);
+        if (ret == 0) {
+            ret = flush_ret;
+        }
+    }
+    qemu_co_mutex_unlock(&s->lock);
+    return ret;
+}
+
+/*
+ * Resize the add-cow file so that its bitmap can describe `size` bytes:
+ * the header plus one bitmap byte per 8 clusters, rounded up.
+ *
+ * If resizing bs->file fails, the raw image is truncated to the sector
+ * count it had on entry and the error is returned.  Returns 0 on success.
+ */
+static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size)
+{
+    BDRVAddCowState *s = bs->opaque;
+    const int sectors_per_bitmap_byte = SECTORS_PER_CLUSTER * 8;
+    int64_t image_sectors_before = s->image_hd->total_sectors;
+    int64_t new_bitmap_bytes =
+        (size / BDRV_SECTOR_SIZE + sectors_per_bitmap_byte - 1)
+        / sectors_per_bitmap_byte;
+    int ret;
+
+    ret = bdrv_truncate(bs->file, sizeof(AddCowHeader) + new_bitmap_bytes);
+    if (ret >= 0) {
+        return 0;
+    }
+
+    bdrv_truncate(s->image_hd, image_sectors_before * BDRV_SECTOR_SIZE);
+    return ret;
+}
+
+/*
+ * Flush callback: write back the bitmap cache under the state lock, then
+ * flush bs->file.  Returns the first negative result, otherwise the result
+ * of bdrv_co_flush().
+ */
+static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
+{
+    BDRVAddCowState *state = bs->opaque;
+    int cache_ret;
+
+    qemu_co_mutex_lock(&state->lock);
+    cache_ret = add_cow_cache_flush(bs, state->bitmap_cache);
+    qemu_co_mutex_unlock(&state->lock);
+
+    return cache_ret < 0 ? cache_ret : bdrv_co_flush(bs->file);
+}
+
+static QEMUOptionParameter add_cow_create_options[] = { /* options accepted when creating an add-cow file */
+ {
+ .name = BLOCK_OPT_SIZE, /* parsed by add_cow_create(), then replaced by the image file's size */
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE, /* required by add_cow_create() */
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = BLOCK_OPT_IMAGE_FILE, /* required by add_cow_create() */
+ .type = OPT_STRING,
+ .help = "File name of a image file"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FMT, /* accepted here, but add_cow_create() does not read it */
+ .type = OPT_STRING,
+ .help = "Image format of the base image"
+ },
+ { NULL } /* terminator: option walks stop at a NULL name */
+};
+
+static BlockDriver bdrv_add_cow = { /* callback table of the "add-cow" format driver */
+ .format_name = "add-cow", /* the name add_cow_create() passes to bdrv_find_format() */
+ .instance_size = sizeof(BDRVAddCowState), /* size of the per-image state reached through bs->opaque */
+ .bdrv_probe = add_cow_probe,
+ .bdrv_open = add_cow_open,
+ .bdrv_close = add_cow_close,
+ .bdrv_create = add_cow_create,
+ .bdrv_co_readv = add_cow_co_readv,
+ .bdrv_co_writev = add_cow_co_writev,
+ .bdrv_truncate = bdrv_add_cow_truncate,
+ .bdrv_co_is_allocated = add_cow_is_allocated,
+
+ .create_options = add_cow_create_options,
+ .bdrv_co_flush_to_disk = add_cow_co_flush, /* writes back the bitmap cache, then flushes bs->file */
+};
+
+static void bdrv_add_cow_init(void) /* registers the add-cow driver with the block layer */
+{
+ bdrv_register(&bdrv_add_cow);
+}
+
+block_init(bdrv_add_cow_init); /* NOTE(review): presumably runs the function at block-layer start-up; confirm in module.h */
diff --git a/block/add-cow.h b/block/add-cow.h
new file mode 100644
index 0000000..46567bb
--- /dev/null
+++ b/block/add-cow.h
@@ -0,0 +1,83 @@
+/*
+ * QEMU ADD-COW Disk Format
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ * Dong Xu Wang<wdon...@linux.vnet.ibm.com>
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_ADD_COW_H
+#define BLOCK_ADD_COW_H
+
+#define ADD_COW_MAGIC (((uint64_t)'A'<< 56) | ((uint64_t)'D'<< 48) | \
+ ((uint64_t)'D'<< 40) | ((uint64_t)'_'<< 32) | \
+ ((uint64_t)'C'<< 24) | ((uint64_t)'O'<< 16) | \
+ ((uint64_t)'W'<< 8) | 0xFF) /* the bytes "ADD_COW" followed by 0xFF */
+#define ADD_COW_VERSION 1 /* the only version add_cow_probe()/add_cow_open() accept */
+#define ADD_COW_FILE_LEN 1024 /* size of each file-name field in the header */
+#define ADD_COW_CACHE_SIZE 16 /* number of tables held by the bitmap cache */
+#define ADD_COW_CACHE_ENTRY_SIZE 65536 /* bytes of bitmap held by one cache table */
+#define ADD_COW_CLUSTER_SIZE 65536 /* bytes tracked by one bitmap bit */
+#define SECTORS_PER_CLUSTER (ADD_COW_CLUSTER_SIZE / BDRV_SECTOR_SIZE) /* cluster size expressed in sectors */
+
+typedef struct AddCowHeader { /* on-disk header, read from offset 0 of the add-cow file */
+ uint64_t magic; /* ADD_COW_MAGIC, converted with cpu_to_be64() before it is written */
+ uint32_t version; /* ADD_COW_VERSION, converted with cpu_to_be32() before it is written */
+ char backing_file[ADD_COW_FILE_LEN]; /* backing file name, copied into bs->backing_file on open */
+ char image_file[ADD_COW_FILE_LEN]; /* raw image file name; an empty name makes open fail */
+ char reserved[500]; /* padding; 8 + 4 + 1024 + 1024 + 500 = 2560 bytes when packed */
+} QEMU_PACKED AddCowHeader;
+
+typedef struct AddCowCachedTable { /* one slot of the bitmap cache */
+ void *table; /* block-aligned buffer of entry_size bytes */
+ int64_t offset; /* byte offset of this chunk within the bitmap, -1 while the slot is unused */
+ bool dirty; /* set by add_cow_cache_entry_mark_dirty(), cleared after write-back */
+ int cache_hits; /* usage counter; halved on every eviction scan */
+} AddCowCachedTable;
+
+typedef struct AddCowCache { /* cache of bitmap chunks read from the add-cow file */
+ AddCowCachedTable *entries; /* array of `size` slots */
+ int entry_size; /* bytes per slot buffer; set to ADD_COW_CACHE_ENTRY_SIZE on create */
+ int size; /* number of slots */
+ bool writethrough; /* recorded at create time and by add_cow_cache_set_writethrough() */
+} AddCowCache;
+
+typedef struct BDRVAddCowState { /* per-image driver state, reached through bs->opaque */
+ BlockDriverState *image_hd; /* the raw image opened by add_cow_open() */
+ CoMutex lock; /* taken by the read, write and flush callbacks */
+ int cluster_size; /* set to ADD_COW_CLUSTER_SIZE on open */
+ AddCowCache *bitmap_cache; /* cache over the on-disk bitmap */
+ uint64_t bitmap_size; /* bitmap length in bytes, computed on open */
+} BDRVAddCowState;
+
+/*
+ * Map a sector number to the byte of the bitmap that holds its cluster's
+ * bit (eight clusters per byte).
+ */
+static inline int64_t offset_in_bitmap(int64_t sector_num)
+{
+    return (int64_t)(sector_num / SECTORS_PER_CLUSTER) / 8;
+}
+
+/* True when sector_num is the first sector of its cluster. */
+static inline bool is_cluster_head(int64_t sector_num)
+{
+    return !(sector_num % SECTORS_PER_CLUSTER);
+}
+
+/* True when sector_num is the last sector of its cluster. */
+static inline bool is_cluster_tail(int64_t sector_num)
+{
+    return !((sector_num + 1) % SECTORS_PER_CLUSTER);
+}
+
+/* Allocate a cache of num_tables slots; free it with add_cow_cache_destroy() */
+AddCowCache *add_cow_cache_create(BlockDriverState *bs, int num_tables,
+                                  bool writethrough);
+/* Free the cache and its buffers without writing anything back */
+int add_cow_cache_destroy(BlockDriverState *bs, AddCowCache *c);
+/* Flag the slot whose buffer is `table` as dirty; aborts on a foreign buffer */
+void add_cow_cache_entry_mark_dirty(AddCowCache *c, void *table);
+/*
+ * Fetch the bitmap chunk covering sector_num.  The argument is a sector
+ * number, not a byte offset; the definition converts it itself.
+ */
+int add_cow_cache_get(BlockDriverState *bs, AddCowCache *c,
+                      uint64_t sector_num, void **table);
+/* Write back all dirty slots and flush; 0 or a negative errno value */
+int add_cow_cache_flush(BlockDriverState *bs, AddCowCache *c);
+/* Set the writethrough flag and return its previous value */
+bool add_cow_cache_set_writethrough(BlockDriverState *bs, AddCowCache *c,
+                                    bool enable);
+#endif
diff --git a/block_int.h b/block_int.h
index 086832a..83e75ea 100644
--- a/block_int.h
+++ b/block_int.h
@@ -51,6 +51,7 @@
#define BLOCK_OPT_PREALLOC "preallocation"
#define BLOCK_OPT_SUBFMT "subformat"
#define BLOCK_OPT_COMPAT_LEVEL "compat"
+#define BLOCK_OPT_IMAGE_FILE "image_file"
typedef struct BdrvTrackedRequest BdrvTrackedRequest;
diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
new file mode 100644
index 0000000..28085d4
--- /dev/null
+++ b/docs/specs/add-cow.txt
@@ -0,0 +1,68 @@
+== General ==
+
+Raw file format does not support backing_file and copy on write feature.
+The add-cow image format makes it possible to use backing files with raw
+image by keeping a separate .add-cow metadata file. Once all sectors
+have been written to in the raw image it is safe to discard the .add-cow
+and backing files and instead use the raw image directly.
+
+When using add-cow, the procedure may look like this:
+(ubuntu.img is a disk image on which an OS has been installed.)
+ 1) Create a raw image with the same size as ubuntu.img
+ qemu-img create -f raw test.raw 8G
+ 2) Create an add-cow image which will store the dirty bitmap
+ qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
+ 3) Run qemu with the add-cow image
+ qemu -drive if=virtio,file=test.add-cow
+
+=Specification=
+
+The file format looks like this:
+
+ +---------------+--------------------------+
+ | Header | COW bitmap |
+ +---------------+--------------------------+
+
+All numbers in add-cow are stored in Big Endian byte order.
+
+== Header ==
+
+The Header is included in the first bytes:
+
+ Byte 0 - 7: magic
+ add-cow magic string ("ADD_COW\xff")
+
+ 8 - 11: version
+ Version number (only valid value is 1 now)
+
+ 12 - 1035: backing_file
+ backing_file file name related to add-cow file. All
+ unused bytes are padded with zeros. Must not be longer
+ than 1023 bytes.
It's far better to shrink the size of the header and use an offset/len pointer
to the backing file string. Limiting backing files to 1023 is unacceptable.
+
+ 1036 - 2059: image_file
+ image_file is a raw file. All unused bytes are padded
+ with zeros. Must not be longer than 1023 bytes.
Likewise, you should use a combination of offset/len.
I think you should have a feature bitmap here too along with an offset to the
bitmap.
Regards,
Anthony Liguori
+
+ 2060 - 2559: The Reserved field is used to make sure the COW bitmap
+ starts at a multiple of 512. It is not used currently;
+ all bytes are filled with 0.
+
+== COW bitmap ==
+
+The "COW bitmap" field starts at the 2560th byte, stores a bitmap related to
+backing_file and image_file. The bitmap will track whether the sector in
+backing_file is dirty or not.
+
+Each bit in the bitmap indicates one cluster's status. One cluster includes 128
+sectors, so each bit covers 512 * 128 = 64k bytes. The size of the bitmap is
+therefore calculated from the virtual size of image_file. In each byte, bits 0
+to 7 track the 1st to 8th cluster in sequence; the bit order in one byte looks
+like:
+ +----+----+----+----+----+----+----+----+
+ | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
+ +----+----+----+----+----+----+----+----+
+
+If the bit is 0, the cluster has not been allocated in image_file, and its data
+should be loaded from backing_file while reading; if the bit is 1, the cluster
+is dirty, and its data should be loaded from image_file while reading.
+Writing to a sector causes the bit of the cluster containing it to be set to 1.
diff --git a/qemu-img.c b/qemu-img.c
index 0ae543c..29865b5 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -628,7 +628,9 @@ static int img_convert(int argc, char **argv)
int c, ret = 0, n, n1, bs_n, bs_i, compress, cluster_size,
cluster_sectors;
int progress = 0, flags;
const char *fmt, *out_fmt, *cache, *out_baseimg, *out_filename;
+ char image_filename[1024], backing_filename[1024];
BlockDriver *drv, *proto_drv;
+ BlockDriver *image_drv, *backing_drv;
BlockDriverState **bs = NULL, *out_bs = NULL;
int64_t total_sectors, nb_sectors, sector_num, bs_offset;
uint64_t bs_sectors;
@@ -637,6 +639,7 @@ static int img_convert(int argc, char **argv)
BlockDriverInfo bdi;
QEMUOptionParameter *param = NULL, *create_options = NULL;
QEMUOptionParameter *out_baseimg_param;
+ QEMUOptionParameter *image_param = NULL;
char *options = NULL;
const char *snapshot_name = NULL;
float local_progress;
@@ -797,6 +800,7 @@ static int img_convert(int argc, char **argv)
out_baseimg = out_baseimg_param->value.s;
}
+ image_param = get_option_parameter(param, BLOCK_OPT_SIZE);
/* Check if compression is supported */
if (compress) {
QEMUOptionParameter *encryption =
@@ -828,6 +832,41 @@ static int img_convert(int argc, char **argv)
}
/* Create the new image */
+
+ if (0 == strcmp(out_fmt, "add-cow")) {
+ image_drv = bdrv_find_format("raw");
+ if (!drv) {
+ ret = -1;
+ goto out;
+ }
+ snprintf(image_filename, sizeof(image_filename),
+ "%s"".ct.raw", out_filename);
+ ret = bdrv_create(image_drv, image_filename, image_param);
+ if (ret< 0) {
+ error_report("%s: error while creating image_file: %s",
+ image_filename, strerror(-ret));
+ goto out;
+ }
+ set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename);
+
+ if (!out_baseimg) {
+ backing_drv = bdrv_find_format("qcow2");
+ if (!drv) {
+ ret = -1;
+ goto out;
+ }
+ snprintf(backing_filename, sizeof(backing_filename),
+ "%s"".ct.qcow2", out_filename);
+ ret = bdrv_create(backing_drv, backing_filename, image_param);
+ if (ret< 0) {
+ error_report("%s: error while creating backing_file: %s",
+ backing_filename, strerror(-ret));
+ goto out;
+ }
+ set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
+ backing_filename);
+ }
+ }
ret = bdrv_create(drv, out_filename, param);
if (ret< 0) {
if (ret == -ENOTSUP) {