Am 28.10.2010 13:01, schrieb Stefan Hajnoczi: > This patch introduces the qed on-disk layout and implements image > creation. Later patches add read/write and other functionality. > > Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com> > --- > Makefile.objs | 1 + > block/qed.c | 548 > +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > block/qed.h | 145 +++++++++++++++ > block_int.h | 1 + > 4 files changed, 695 insertions(+), 0 deletions(-) > create mode 100644 block/qed.c > create mode 100644 block/qed.h > > diff --git a/Makefile.objs b/Makefile.objs > index f07fb01..7bae72a 100644 > --- a/Makefile.objs > +++ b/Makefile.objs > @@ -14,6 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o > > block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o > vpc.o vvfat.o > block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o > +block-nested-y += qed.o > block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o > block-nested-$(CONFIG_WIN32) += raw-win32.o > block-nested-$(CONFIG_POSIX) += raw-posix.o > diff --git a/block/qed.c b/block/qed.c > new file mode 100644 > index 0000000..8469cf0 > --- /dev/null > +++ b/block/qed.c > @@ -0,0 +1,548 @@ > +/* > + * QEMU Enhanced Disk Format > + * > + * Copyright IBM, Corp. 2010 > + * > + * Authors: > + * Stefan Hajnoczi <stefa...@linux.vnet.ibm.com> > + * Anthony Liguori <aligu...@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU LGPL, version 2 or later. > + * See the COPYING.LIB file in the top-level directory. 
> + * > + */ > + > +#include "qed.h" > + > +static int bdrv_qed_probe(const uint8_t *buf, int buf_size, > + const char *filename) > +{ > + const QEDHeader *header = (const QEDHeader *)buf; > + > + if (buf_size < sizeof(*header)) { > + return 0; > + } > + if (le32_to_cpu(header->magic) != QED_MAGIC) { > + return 0; > + } > + return 100; > +} > + > +/** > + * Check whether an image format is raw > + * > + * @fmt: Backing file format, may be NULL > + */ > +static bool qed_fmt_is_raw(const char *fmt) > +{ > + return fmt && strcmp(fmt, "raw") == 0; > +}
People shouldn't use them directly, but should we also consider file, host_device, etc.? > + > +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) > +{ > + cpu->magic = le32_to_cpu(le->magic); > + cpu->cluster_size = le32_to_cpu(le->cluster_size); > + cpu->table_size = le32_to_cpu(le->table_size); > + cpu->header_size = le32_to_cpu(le->header_size); > + cpu->features = le64_to_cpu(le->features); > + cpu->compat_features = le64_to_cpu(le->compat_features); > + cpu->autoclear_features = le64_to_cpu(le->autoclear_features); > + cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); > + cpu->image_size = le64_to_cpu(le->image_size); > + cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset); > + cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size); > +} > + > +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) > +{ > + le->magic = cpu_to_le32(cpu->magic); > + le->cluster_size = cpu_to_le32(cpu->cluster_size); > + le->table_size = cpu_to_le32(cpu->table_size); > + le->header_size = cpu_to_le32(cpu->header_size); > + le->features = cpu_to_le64(cpu->features); > + le->compat_features = cpu_to_le64(cpu->compat_features); > + le->autoclear_features = cpu_to_le64(cpu->autoclear_features); > + le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); > + le->image_size = cpu_to_le64(cpu->image_size); > + le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset); > + le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size); > +} > + > +static int qed_write_header_sync(BDRVQEDState *s) > +{ > + QEDHeader le; > + int ret; > + > + qed_header_cpu_to_le(&s->header, &le); > + ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); > + if (ret != sizeof(le)) { > + return ret; > + } > + return 0; > +} > + > +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t > table_size) > +{ > + uint64_t table_entries; > + uint64_t l2_size; > + > + table_entries = (table_size * cluster_size) / 
sizeof(uint64_t); > + l2_size = table_entries * cluster_size; > + > + return l2_size * table_entries; > +} > + > +static bool qed_is_cluster_size_valid(uint32_t cluster_size) > +{ > + if (cluster_size < QED_MIN_CLUSTER_SIZE || > + cluster_size > QED_MAX_CLUSTER_SIZE) { > + return false; > + } > + if (cluster_size & (cluster_size - 1)) { > + return false; /* not power of 2 */ > + } > + return true; > +} > + > +static bool qed_is_table_size_valid(uint32_t table_size) > +{ > + if (table_size < QED_MIN_TABLE_SIZE || > + table_size > QED_MAX_TABLE_SIZE) { > + return false; > + } > + if (table_size & (table_size - 1)) { > + return false; /* not power of 2 */ > + } > + return true; > +} > + > +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t > cluster_size, > + uint32_t table_size) > +{ > + if (image_size % BDRV_SECTOR_SIZE != 0) { > + return false; /* not multiple of sector size */ > + } > + if (image_size > qed_max_image_size(cluster_size, table_size)) { > + return false; /* image is too large */ > + } > + return true; > +} > + > +/** > + * Read a string of known length from the image file > + * > + * @file: Image file > + * @offset: File offset to start of string, in bytes > + * @n: String length in bytes > + * @buf: Destination buffer > + * @buflen: Destination buffer length in bytes > + * @ret: 0 on success, -errno on failure > + * > + * The string is NUL-terminated. 
> + */ > +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, > + char *buf, size_t buflen) > +{ > + int ret; > + if (n >= buflen) { > + return -EINVAL; > + } > + ret = bdrv_pread(file, offset, buf, n); > + if (ret < 0) { > + return ret; > + } > + buf[n] = '\0'; > + return 0; > +} > + > +static int bdrv_qed_open(BlockDriverState *bs, int flags) > +{ > + BDRVQEDState *s = bs->opaque; > + QEDHeader le_header; > + int64_t file_size; > + int ret; > + > + s->bs = bs; > + > + ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); > + if (ret < 0) { > + return ret; > + } > + ret = 0; /* ret should always be 0 or -errno */ > + qed_header_le_to_cpu(&le_header, &s->header); > + > + if (s->header.magic != QED_MAGIC) { > + return -ENOENT; > + } ENOENT seems a bit odd for a wrong magic number, especially if it's used for the error message. Wouldn't EINVAL or ENOTSUP be a closer match? > + if (s->header.features & ~QED_FEATURE_MASK) { > + return -ENOTSUP; /* image uses unsupported feature bits */ > + } > + if (!qed_is_cluster_size_valid(s->header.cluster_size)) { > + return -EINVAL; > + } > + > + /* Round down file size to the last cluster */ > + file_size = bdrv_getlength(bs->file); > + if (file_size < 0) { > + return file_size; > + } > + s->file_size = qed_start_of_cluster(s, file_size); > + > + if (!qed_is_table_size_valid(s->header.table_size)) { > + return -EINVAL; > + } > + if (!qed_is_image_size_valid(s->header.image_size, > + s->header.cluster_size, > + s->header.table_size)) { > + return -EINVAL; > + } > + if (!qed_check_table_offset(s, s->header.l1_table_offset)) { > + return -EINVAL; > + } > + > + s->table_nelems = (s->header.cluster_size * s->header.table_size) / > + sizeof(uint64_t); > + s->l2_shift = ffs(s->header.cluster_size) - 1; > + s->l2_mask = s->table_nelems - 1; > + s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; > + > + if ((s->header.features & QED_F_BACKING_FILE)) { > + ret = qed_read_string(bs->file, 
s->header.backing_filename_offset, > + s->header.backing_filename_size, > bs->backing_file, > + sizeof(bs->backing_file)); > + if (ret < 0) { > + return ret; > + } > + > + if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) { > + pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw"); > + } > + } > + > + /* Reset unknown autoclear feature bits. This is a backwards > + * compatibility mechanism that allows images to be opened by older > + * programs, which "knock out" unknown feature bits. When an image is > + * opened by a newer program again it can detect that the autoclear > + * feature is no longer valid. > + */ > + if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && > + !bdrv_is_read_only(bs->file)) { > + s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; > + > + ret = qed_write_header_sync(s); > + if (ret) { > + return ret; > + } > + > + /* From here on only known autoclear feature bits are valid */ > + bdrv_flush(bs->file); > + } > + > + return ret; > +} > + > +static void bdrv_qed_close(BlockDriverState *bs) > +{ > +} > + > +static void bdrv_qed_flush(BlockDriverState *bs) > +{ > + bdrv_flush(bs->file); > +} This conflicts with one of my recent changes. bdrv_flush should return an int now. 
> + > +static int qed_create(const char *filename, uint32_t cluster_size, > + uint64_t image_size, uint32_t table_size, > + const char *backing_file, const char *backing_fmt) > +{ > + QEDHeader header = { > + .magic = QED_MAGIC, > + .cluster_size = cluster_size, > + .table_size = table_size, > + .header_size = 1, > + .features = 0, > + .compat_features = 0, > + .l1_table_offset = cluster_size, > + .image_size = image_size, > + }; > + QEDHeader le_header; > + uint8_t *l1_table = NULL; > + size_t l1_size = header.cluster_size * header.table_size; > + int ret = 0; > + BlockDriverState *bs = NULL; > + > + ret = bdrv_create_file(filename, NULL); > + if (ret < 0) { > + return ret; > + } > + > + ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB); > + if (ret < 0) { > + return ret; > + } > + > + if (backing_file) { > + header.features |= QED_F_BACKING_FILE; > + header.backing_filename_offset = sizeof(le_header); > + header.backing_filename_size = strlen(backing_file); > + > + if (qed_fmt_is_raw(backing_fmt)) { > + header.features |= QED_F_BACKING_FORMAT_NO_PROBE; > + } > + } > + > + qed_header_cpu_to_le(&header, &le_header); > + ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); > + if (ret < 0) { > + goto out; > + } > + ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, > + header.backing_filename_size); > + if (ret < 0) { > + goto out; > + } > + > + l1_table = qemu_mallocz(l1_size); > + ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); > + if (ret < 0) { > + goto out; > + } > + > + ret = 0; /* success */ > +out: > + qemu_free(l1_table); > + bdrv_delete(bs); > + return ret; > +} > + > +static int bdrv_qed_create(const char *filename, QEMUOptionParameter > *options) > +{ > + uint64_t image_size = 0; > + uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; > + uint32_t table_size = QED_DEFAULT_TABLE_SIZE; > + const char *backing_file = NULL; > + const char *backing_fmt = NULL; > + > + while (options && options->name) { > + if 
(!strcmp(options->name, BLOCK_OPT_SIZE)) { > + image_size = options->value.n; > + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { > + backing_file = options->value.s; > + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { > + backing_fmt = options->value.s; I'm not sure about this. It doesn't really matter for QED, but I find it strange that -o backing_fmt=foobar is accepted and gives you a non-raw backing file. Should we at least check that the format exists? Kevin