Am 15.11.2011 06:28, schrieb Dong Xu Wang:
> From: Dong Xu Wang <wdon...@linux.vnet.ibm.com>
> 
> Provide a new file format: add-cow. The usage can be found in add-cow.txt of
> this patch.
> 
> Signed-off-by: Dong Xu Wang <wdon...@linux.vnet.ibm.com>
> ---
>  Makefile.objs          |    1 +
>  block.c                |    2 +-
>  block.h                |    1 +
>  block/add-cow.c        |  417 
> ++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h            |    1 +
>  docs/specs/add-cow.txt |   57 +++++++
>  6 files changed, 478 insertions(+), 1 deletions(-)
>  create mode 100644 block/add-cow.c
>  create mode 100644 docs/specs/add-cow.txt
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index d7a6539..ad99243 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>  
>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o 
> vpc.o vvfat.o
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o 
> qcow2-cache.o
> +block-nested-y += add-cow.o
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> diff --git a/block.c b/block.c
> index 86910b0..a2be27b 100644
> --- a/block.c
> +++ b/block.c
> @@ -106,7 +106,7 @@ int is_windows_drive(const char *filename)
>  #endif
>  
>  /* check if the path starts with "<protocol>:" */
> -static int path_has_protocol(const char *path)
> +int path_has_protocol(const char *path)
>  {
>  #ifdef _WIN32
>      if (is_windows_drive(path) ||
> diff --git a/block.h b/block.h
> index 051a25d..836284f 100644
> --- a/block.h
> +++ b/block.h
> @@ -276,6 +276,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, 
> QEMUSnapshotInfo *sn);
>  
>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>  int path_is_absolute(const char *path);
> +int path_has_protocol(const char *path);
>  void path_combine(char *dest, int dest_size,
>                    const char *base_path,
>                    const char *filename);
> diff --git a/block/add-cow.c b/block/add-cow.c
> new file mode 100644
> index 0000000..54d30a9
> --- /dev/null
> +++ b/block/add-cow.c
> @@ -0,0 +1,417 @@
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "module.h"
> +
> +#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | 
> \
> +                            ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
> +                            ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
> +                            ((uint64_t)'W' << 8) | 0xFF)
> +#define ADD_COW_VERSION     1
> +#define ADD_COW_FILE_LEN    1024
> +
> +typedef struct AddCowHeader {
> +    uint64_t        magic;
> +    uint32_t        version;
> +    char            backing_file[ADD_COW_FILE_LEN];
> +    char            image_file[ADD_COW_FILE_LEN];
> +    uint64_t        size;
> +} QEMU_PACKED AddCowHeader;
> +
> +typedef struct BDRVAddCowState {
> +    char                image_file[ADD_COW_FILE_LEN];
> +    BlockDriverState    *image_hd;
> +    uint8_t             *bitmap;
> +    uint64_t            bitmap_size;
> +    CoMutex             lock;
> +} BDRVAddCowState;
> +
> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char 
> *filename)
> +{
> +    const AddCowHeader *header = (const void *)buf;
> +
> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> +        return 100;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int add_cow_open(BlockDriverState *bs, int flags)
> +{
> +    AddCowHeader    header;
> +    int64_t         size;
> +    char            image_filename[ADD_COW_FILE_LEN];
> +    int             image_flags;
> +    BlockDriver     *image_drv = NULL;
> +    int             ret;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +
> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
> +    if (ret != sizeof(header)) {
> +        goto fail;
> +    }
> +
> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }

Please have a look at qcow2 for better handling of newer version
numbers. We should try to give a good error message for this case.

> +
> +    size = be64_to_cpu(header.size);
> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
> +
> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != 
> sizeof(header.image_file));
> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
> +            header.backing_file);
> +    pstrcpy(state->image_file, sizeof(state->image_file),
> +            header.image_file);

You need the same QEMU_BUILD_BUG_ON for the backing file, or you can't
assume that header.backing_file is large enough that it doesn't matter
that it isn't necessarily correctly terminated.

> +
> +    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
> +    state->bitmap = g_malloc0(state->bitmap_size);

qemu_blockalign is better if you're using it as a buffer for I/O requests.

> +
> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
> +            state->bitmap_size);
> +    if (ret != state->bitmap_size) {
> +        goto fail;
> +    }
> +   /* If there is a image_file, must be together with backing_file */

Indentation is off.

> +    if (state->image_file[0] != '\0') {
> +        state->image_hd = bdrv_new("");
> +
> +        if (path_has_protocol(state->image_file)) {
> +            pstrcpy(image_filename, sizeof(image_filename),
> +                    state->image_file);
> +        } else {
> +            path_combine(image_filename, sizeof(image_filename),
> +                         bs->filename, state->image_file);
> +        }
> +
> +        image_drv = bdrv_find_format("raw");
> +        image_flags =
> +             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) | 
> BDRV_O_RDWR;

As Marcelo said, why play with the flags? BDRV_O_SNAPSHOT and
BDRV_O_NO_BACKING should already be cleared by bdrv_open_common, and I
can't see a reason why you need to open the image r/w when the add-cow
image is opened read-only.

> +        state->image_hd->keep_read_only = 0;
> +
> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
> +                image_drv);
> +        if (ret < 0) {
> +            bdrv_delete(state->image_hd);
> +            state->image_hd = NULL;
> +            goto fail;
> +        }
> +    }
> +    if (state->image_file[0] == '\0') {

You can move this check up, then the above if block can become
unconditional.

> +        ret = -ENOENT;
> +        goto fail;
> +    }
> +
> +    qemu_co_mutex_init(&state->lock);
> +    return 0;
> + fail:
> +    g_free(state->bitmap);
> +    state->bitmap = NULL;

Resetting it to NULL is not required, the memory will be freed anyway.

> +    return ret;
> +}
> +
> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
> +{
> +    uint64_t offset = bitnum / 8;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);

Unnecessary cast.

Also, to keep things consistent with other format drivers, call it s
instead of state.

> +    state->bitmap[offset] |= (1 << (bitnum % 8));
> +}
> +
> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t offset = bitnum / 8;
> +    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
> +}
> +
> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors, int *num_same)
> +{
> +    int changed;
> +    uint64_t bitmap_size = ((BDRVAddCowState *)(bs->opaque))->bitmap_size;

BDRVAddCowState *s = bs->opaque; and then use s->bitmap_size instead of
copying it to a local variable.

> +
> +    /* Beyond the end of bitmap, return error or read from backing_file? */
> +    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
> +        return 0;
> +    }
> +
> +    if (nb_sectors == 0) {
> +        *num_same = nb_sectors;
> +        return 0;
> +    }
> +
> +    changed = is_bit_set(bs, sector_num);
> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
> +            break;
> +        }
> +    }
> +
> +    return changed;
> +}
> +
> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors)
> +{
> +    int i, ret = 0;
> +    bool changed = false;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t start_pos = sector_num / 8;
> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
> +
> +    if (start_pos > state->bitmap_size) {
> +        return -1;
> +    }
> +
> +    for (i = 0; i < nb_sectors; i++) {
> +        if (changed || !is_bit_set(bs, sector_num + i)) {
> +            changed = true;
> +        }

Wait... if (changed == true) changed = true? What is this good for?

> +        add_cow_set_bit(bs, sector_num + i);
> +    }
> +
> +    if (changed) {
> +        ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + start_pos,
> +            state->bitmap + start_pos,
> +            MIN(((end_pos - start_pos) & (~512)) + 512,
> +                state->bitmap_size - start_pos));

-EMAGIC

Please calculate that MIN(...) separately and give the variable a
meaningful name. Trying to guess what you're doing here:

(end_pos - start_pos) & (~512)) + 512

This is the size of the updated area in the bitmap, in bytes. It is
rounded up to the next sector; if it's already on a sector boundary,
make it the next sector boundary (why?)

state->bitmap_size - start_pos

Ok, makes sense, you're trying to avoid writing after the end of the
array if the caller asked for too many sectors.


Please make sure to keep the write request sector aligned, so that
bdrv_pwrite doesn't have to perform a read-modify-write operation.

> +    }
> +    return ret;
> +}
> +
> +static void add_cow_close(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    g_free(state->bitmap);
> +    state->bitmap = NULL;

Resetting to NULL is unnecessary.

> +}
> +
> +static int add_cow_create(const char *filename, QEMUOptionParameter *options)
> +{
> +    AddCowHeader header;
> +    int64_t image_sectors = 0;
> +    const char *backing_filename = NULL;
> +    const char *image_filename = NULL;
> +    int ret;
> +    BlockDriverState *bs, *image_bs = NULL;
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
> +            backing_filename = options->value.s;
> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
> +            image_filename = options->value.s;
> +        }
> +        options++;
> +    }
> +
> +    if (!backing_filename || !image_filename) {
> +        error_report("Both backing_file and image_file should be given.");
> +        return -EINVAL;
> +    }
> +    /* Make sure image file exists */
> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
> +            | BDRV_O_CACHE_WB);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    bdrv_delete(image_bs);
> +
> +    ret = bdrv_create_file(filename, NULL);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    memset(&header, 0, sizeof(header));
> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
> +    header.version = cpu_to_be32(ADD_COW_VERSION);
> +    pstrcpy(header.backing_file, sizeof(header.backing_file), 
> backing_filename);
> +    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
> +
> +    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
> +    if (ret < 0) {
> +        bdrv_delete(bs);
> +        return ret;
> +    }
> +
> +    BlockDriver *drv = bdrv_find_format("add-cow");
> +    assert(drv != NULL);
> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
> +    if (ret < 0) {
> +        bdrv_delete(bs);
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
> +    bdrv_delete(bs);
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t 
> sector_num,
> +                         int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int cur_nr_sectors;
> +    uint64_t bytes_done = 0;
> +    QEMUIOVector hd_qiov;
> +    int n, ret = 0;
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    while (remaining_sectors != 0) {
> +        cur_nr_sectors = remaining_sectors;
> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
> +            cur_nr_sectors = n;
> +            qemu_iovec_reset(&hd_qiov);
> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
> +            if (ret < 0) {
> +                goto fail;
> +            }
> +        } else {
> +            cur_nr_sectors = n;
> +            if (bs->backing_hd) {

I thought there are no add-cow images without backing file? (It isn't
checked in open, but in create)

> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
> +                                    n, &hd_qiov);
> +                if (ret < 0) {
> +                    goto fail;
> +                }
> +            } else {
> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_memset(&hd_qiov, 0,
> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);

There's nothing to memset in a qiov that you have just reset and that
has a size of 0 now.

> +            }
> +        }
> +        remaining_sectors -= cur_nr_sectors;
> +        sector_num += cur_nr_sectors;
> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t 
> sector_num,
> +                          int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0;
> +    QEMUIOVector hd_qiov;
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    qemu_iovec_reset(&hd_qiov);
> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
> +    ret = bdrv_co_writev(s->image_hd,
> +                     sector_num,
> +                     remaining_sectors, &hd_qiov);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
> +{
> +    int ret = 0;
> +    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
> +    int64_t be_offset = cpu_to_be64(offset);
> +    BDRVAddCowState *state = bs->opaque;
> +    int64_t old_image_sector = state->image_hd->total_sectors;
> +
> +    ret = bdrv_truncate(state->image_hd, offset);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
> +            + sizeof(AddCowHeader));
> +    if (ret < 0) {
> +        bdrv_truncate(state->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
> +        return ret;
> +    }
> +
> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
> +        &be_offset, sizeof(uint64_t));
> +    if (ret < 0) {
> +        bdrv_truncate(state->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
> +    }
> +
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = bs->opaque;
> +    int ret = bdrv_co_flush(state->image_hd);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return bdrv_co_flush(bs->file);
> +}
> +
> +static QEMUOptionParameter add_cow_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    {
> +        .name = BLOCK_OPT_BACKING_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a base image"
> +    },
> +    {
> +        .name = BLOCK_OPT_IMAGE_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a image file"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_add_cow = {
> +    .format_name                = "add-cow",
> +    .instance_size              = sizeof(BDRVAddCowState),
> +    .bdrv_probe                 = add_cow_probe,
> +    .bdrv_open                  = add_cow_open,
> +    .bdrv_close                 = add_cow_close,
> +    .bdrv_create                = add_cow_create,
> +    .bdrv_is_allocated          = add_cow_is_allocated,
> +
> +    .bdrv_co_readv              = add_cow_co_readv,
> +    .bdrv_co_writev             = add_cow_co_writev,
> +    .bdrv_truncate              = bdrv_add_cow_truncate,
> +
> +    .create_options             = add_cow_create_options,
> +    .bdrv_co_flush_to_disk      = add_cow_co_flush,
> +};
> +
> +static void bdrv_add_cow_init(void)
> +{
> +    bdrv_register(&bdrv_add_cow);
> +}
> +
> +block_init(bdrv_add_cow_init);
> diff --git a/block_int.h b/block_int.h
> index 1ec4921..d6e8337 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -43,6 +43,7 @@
>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>  #define BLOCK_OPT_PREALLOC      "preallocation"
>  #define BLOCK_OPT_SUBFMT        "subformat"
> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>  
>  typedef struct AIOPool {
>      void (*cancel)(BlockDriverAIOCB *acb);
> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
> new file mode 100644
> index 0000000..e403c84
> --- /dev/null
> +++ b/docs/specs/add-cow.txt
> @@ -0,0 +1,57 @@
> +== General ==
> +
> +Raw file format does not support backing_file and copy on write feature. Then
> +you can use add-cow file to implement these features.
> +
> +When using add-cow, procedures may like this:
> +(ubuntu.img is a disk image which has been installed OS.)
> +    1)  Create a raw image with the same size of ubuntu.img
> +            qemu-img create -f raw test.raw 8G
> +    2)  Create a add-cow image which will store dirty bitmap
> +            qemu-img create -f add-cow test.add-cow -o 
> backing_file=ubuntu.img,image_file=test.raw
> +    3)  Run qemu with add-cow image
> +            qemu -drive if=virtio,file=test.add-cow
> +
> +While QEMU is running, virtual size of image_file and backing_file must be 
> the
> +same. So if image_file does not have the same virtual size as backing_file's 
> in
> +step 2), qemu-img will truncate it.
> +
> +=Specification=
> +
> +The file format looks like this:
> +
> + +----------+----------+----------+-----+
> + |  Header  |   Data   |   Data   | ... |
> + +----------+----------+----------+-----+

This looks as if the file were divided into some kind of blocks of the
same size. This isn't really true.

One change that I would definitely suggest to make is that the bitmap
should start at byte 512 rather than directly after the header.
Unaligned requests with cache=none are expensive.

> +
> + All numbers in add-cow are stored in Big Endian byte order.

For the bitmap, you should additionally specify which order the bits in
the bitmap have.

> +== Header ==
> +
> +The Header is included in the first bytes:
> +
> +    Byte  0 -  7:       magic
> +                        add-cow magic string ("ADD_COW\xff")
> +
> +          8 -  11:      version
> +                        Version number (only valid value is 1 now)
> +
> +          12 - 1035:    backing_file
> +                        backing_file file name related to add-cow file. While
> +                        using backing_file, must together with image_file.

Pad unused bytes with zeros.

> +
> +         1036 - 2059:   image_file
> +                        image_file is a raw file, While using image_file, 
> must
> +                        together with image_file.

Same here.

> +
> +         2060 - 2067:   size
> +                        Virtual disk size of image_file in bytes.
> +
> +== Data ==
> +
> +The Data field stores a bitmap related to backing_file and image_file. The 
> bitmap
> +will track whether the cluster in backing_file is dirty or not.

So 0 = load from backing file, 1 = load from image? I think it's better
to be explicit on this.

> +
> +Each bit in the bitmap indicates one cluster. So the size of bitmap is 
> calculated
> +according to virtual size of backing_file.

I think you mean s/cluster/sector/g

Kevin

Reply via email to