From: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com> The blkverify block driver makes investigating image format data corruption much easier. A raw image initialized with the same contents as the test image (e.g. qcow2 file) must be provided. The raw image mirrors read/write operations and is used to verify that data read from the test image is correct.
See docs/blkverify.txt for more information. Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com> Signed-off-by: Kevin Wolf <kw...@redhat.com> --- Makefile.objs | 2 +- block/blkverify.c | 382 ++++++++++++++++++++++++++++++++++++++++++++++++++++ docs/blkverify.txt | 69 ++++++++++ 3 files changed, 452 insertions(+), 1 deletions(-) create mode 100644 block/blkverify.c create mode 100644 docs/blkverify.txt diff --git a/Makefile.objs b/Makefile.objs index 3ef6d80..dad4593 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -14,7 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o block-nested-$(CONFIG_WIN32) += raw-win32.o block-nested-$(CONFIG_POSIX) += raw-posix.o block-nested-$(CONFIG_CURL) += curl.o diff --git a/block/blkverify.c b/block/blkverify.c new file mode 100644 index 0000000..4202685 --- /dev/null +++ b/block/blkverify.c @@ -0,0 +1,382 @@ +/* + * Block protocol for block driver correctness testing + * + * Copyright (C) 2010 IBM, Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include <stdarg.h> +#include "qemu_socket.h" /* for EINPROGRESS on Windows */ +#include "block_int.h" + +typedef struct { + BlockDriverState *test_file; +} BDRVBlkverifyState; + +typedef struct BlkverifyAIOCB BlkverifyAIOCB; +struct BlkverifyAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + + /* Request metadata */ + bool is_write; + int64_t sector_num; + int nb_sectors; + + int ret; /* first completed request's result */ + unsigned int done; /* completion counter */ + bool *finished; /* completion signal for cancel */ + + QEMUIOVector *qiov; /* user I/O vector */ + QEMUIOVector raw_qiov; /* cloned I/O vector for raw file */ + void *buf; /* buffer for raw file I/O */ + + void (*verify)(BlkverifyAIOCB *acb); +}; + +static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb) +{ + BlkverifyAIOCB *acb = (BlkverifyAIOCB *)blockacb; + bool finished = false; + + /* Wait until request completes, invokes its callback, and frees itself */ + acb->finished = &finished; + while (!finished) { + qemu_aio_wait(); + } +} + +static AIOPool blkverify_aio_pool = { + .aiocb_size = sizeof(BlkverifyAIOCB), + .cancel = blkverify_aio_cancel, +}; + +static void blkverify_err(BlkverifyAIOCB *acb, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "blkverify: %s sector_num=%ld nb_sectors=%d ", + acb->is_write ? "write" : "read", acb->sector_num, + acb->nb_sectors); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(1); +} + +/* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */ +static int blkverify_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVBlkverifyState *s = bs->opaque; + int ret; + char *raw, *c; + + /* Parse the blkverify: prefix */ + if (strncmp(filename, "blkverify:", strlen("blkverify:"))) { + return -EINVAL; + } + filename += strlen("blkverify:"); + + /* Parse the raw image filename */ + c = strchr(filename, ':'); + if (c == NULL) { + return -EINVAL; + } + + raw = strdup(filename); + raw[c - filename] = '\0'; + ret = bdrv_file_open(&bs->file, raw, flags); + free(raw); + if (ret < 0) { + return ret; + } + filename = c + 1; + + /* Open the test file */ + s->test_file = bdrv_new(""); + ret = bdrv_open(s->test_file, filename, flags, NULL); + if (ret < 0) { + bdrv_delete(s->test_file); + s->test_file = NULL; + return ret; + } + + return 0; +} + +static void blkverify_close(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + bdrv_delete(s->test_file); + s->test_file = NULL; +} + +static void blkverify_flush(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + /* Only flush test file, the raw file is not important */ + bdrv_flush(s->test_file); +} + +static int64_t blkverify_getlength(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + return bdrv_getlength(s->test_file); +} + +/** + * Check that I/O vector contents are identical + * + * @a: I/O vector + * @b: I/O vector + * @ret: Offset to first mismatching byte or -1 if match + */ +static ssize_t blkverify_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) +{ + int i; + ssize_t offset = 0; + + assert(a->niov == b->niov); + for (i = 0; i < a->niov; i++) { + size_t len = 0; + uint8_t *p = (uint8_t *)a->iov[i].iov_base; + uint8_t *q = (uint8_t *)b->iov[i].iov_base; + + assert(a->iov[i].iov_len == b->iov[i].iov_len); + while (len < a->iov[i].iov_len && *p++ == *q++) { + len++; + } + + offset += len; + + if (len != a->iov[i].iov_len) { + return offset; + } + } + return -1; +} + +typedef struct { + int src_index; + struct iovec *src_iov; + void *dest_base; +} IOVectorSortElem; + +static int sortelem_cmp_src_base(const void *a, const void *b) +{ + const IOVectorSortElem *elem_a = a; + const IOVectorSortElem *elem_b = b; + + /* Don't overflow */ + if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) { + return -1; + } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) { + return 1; + } else { + return 0; + } +} + +static int sortelem_cmp_src_index(const void *a, const void *b) +{ + const IOVectorSortElem *elem_a = a; + const IOVectorSortElem *elem_b = b; + + return elem_a->src_index - elem_b->src_index; +} + +/** + * Copy contents of I/O vector + * + * The relative relationships of overlapping iovecs are preserved. This is + * necessary to ensure identical semantics in the cloned I/O vector. + */ +static void blkverify_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, + void *buf) +{ + IOVectorSortElem sortelems[src->niov]; + void *last_end; + int i; + + /* Sort by source iovecs by base address */ + for (i = 0; i < src->niov; i++) { + sortelems[i].src_index = i; + sortelems[i].src_iov = &src->iov[i]; + } + qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base); + + /* Allocate buffer space taking into account overlapping iovecs */ + last_end = NULL; + for (i = 0; i < src->niov; i++) { + struct iovec *cur = sortelems[i].src_iov; + ptrdiff_t rewind = 0; + + /* Detect overlap */ + if (last_end && last_end > cur->iov_base) { + rewind = last_end - cur->iov_base; + } + + sortelems[i].dest_base = buf - rewind; + buf += cur->iov_len - MIN(rewind, cur->iov_len); + last_end = MAX(cur->iov_base + cur->iov_len, last_end); + } + + /* Sort by source iovec index and build destination iovec */ + qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index); + for (i = 0; i < src->niov; i++) { + qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len); + } +} + +static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, + int64_t sector_num, QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aio_pool, bs, cb, opaque); + + acb->bh = NULL; + acb->is_write = is_write; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + acb->ret = -EINPROGRESS; + acb->done = 0; + acb->qiov = qiov; + acb->buf = NULL; + acb->verify = NULL; + acb->finished = NULL; + return acb; +} + +static void blkverify_aio_bh(void *opaque) +{ + BlkverifyAIOCB *acb = opaque; + + qemu_bh_delete(acb->bh); + if (acb->buf) { + qemu_iovec_destroy(&acb->raw_qiov); + qemu_vfree(acb->buf); + } + acb->common.cb(acb->common.opaque, acb->ret); + if (acb->finished) { + *acb->finished = true; + } + qemu_aio_release(acb); +} + +static void blkverify_aio_cb(void *opaque, int ret) +{ + BlkverifyAIOCB *acb = opaque; + + switch (++acb->done) { + case 1: + acb->ret = ret; + break; + + case 2: + if (acb->ret != ret) { + blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret); + } + + if (acb->verify) { + acb->verify(acb); + } + + acb->bh = qemu_bh_new(blkverify_aio_bh, acb); + qemu_bh_schedule(acb->bh); + break; + } +} + +static void blkverify_verify_readv(BlkverifyAIOCB *acb) +{ + ssize_t offset = blkverify_iovec_compare(acb->qiov, &acb->raw_qiov); + if (offset != -1) { + blkverify_err(acb, "contents mismatch in sector %ld", + acb->sector_num + (offset / BDRV_SECTOR_SIZE)); + } +} + +static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVBlkverifyState *s = bs->opaque; + BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov, + nb_sectors, cb, opaque); + + acb->verify = blkverify_verify_readv; + acb->buf = qemu_blockalign(bs->file, qiov->size); + qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); + blkverify_iovec_clone(&acb->raw_qiov, qiov, acb->buf); + + if (!bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors, + blkverify_aio_cb, acb)) { + blkverify_aio_cb(acb, -EIO); + } + if (!bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors, + blkverify_aio_cb, acb)) { + blkverify_aio_cb(acb, -EIO); + } + return &acb->common; +} + +static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVBlkverifyState *s = bs->opaque; + BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov, + nb_sectors, cb, opaque); + + if (!bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors, + blkverify_aio_cb, acb)) { + blkverify_aio_cb(acb, -EIO); + } + if (!bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, + blkverify_aio_cb, acb)) { + blkverify_aio_cb(acb, -EIO); + } + return &acb->common; +} + +static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BDRVBlkverifyState *s = bs->opaque; + + /* Only flush test file, the raw file is not important */ + return bdrv_aio_flush(s->test_file, cb, opaque); +} + +static BlockDriver bdrv_blkverify = { + .format_name = "blkverify", + .protocol_name = "blkverify", + + .instance_size = sizeof(BDRVBlkverifyState), + + .bdrv_getlength = blkverify_getlength, + + .bdrv_file_open = blkverify_open, + .bdrv_close = blkverify_close, + .bdrv_flush = blkverify_flush, + + .bdrv_aio_readv = blkverify_aio_readv, + .bdrv_aio_writev = blkverify_aio_writev, + .bdrv_aio_flush = blkverify_aio_flush, +}; + +static void bdrv_blkverify_init(void) +{ + bdrv_register(&bdrv_blkverify); +} + +block_init(bdrv_blkverify_init); diff --git a/docs/blkverify.txt b/docs/blkverify.txt new file mode 100644 index 0000000..d556dc4 --- /dev/null +++ b/docs/blkverify.txt @@ -0,0 +1,69 @@ += Block driver correctness testing with blkverify = + +== Introduction == + +This document describes how to use the blkverify protocol to test that a block +driver is operating correctly. + +It is difficult to test and debug block drivers against real guests. Often +processes inside the guest will crash because corrupt sectors were read as part +of the executable. Other times obscure errors are raised by a program inside +the guest. These issues are extremely hard to trace back to bugs in the block +driver. + +Blkverify solves this problem by catching data corruption inside QEMU the first +time bad data is read and reporting the disk sector that is corrupted. + +== How it works == + +The blkverify protocol has two child block devices, the "test" device and the +"raw" device. Read/write operations are mirrored to both devices so their +state should always be in sync. + +The "raw" device is a raw image, a flat file, that has identical starting +contents to the "test" image. The idea is that the "raw" device will handle +read/write operations correctly and not corrupt data. It can be used as a +reference for comparison against the "test" device. + +After a mirrored read operation completes, blkverify will compare the data and +raise an error if it is not identical. This makes it possible to catch the +first instance where corrupt data is read. + +== Example == + +Imagine raw.img has 0xcd repeated throughout its first sector: + + $ ./qemu-io -c 'read -v 0 512' raw.img + 00000000: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + 00000010: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + [...] + 000001e0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + 000001f0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + read 512/512 bytes at offset 0 + 512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec) + +And test.img is corrupt, its first sector is zeroed when it shouldn't be: + + $ ./qemu-io -c 'read -v 0 512' test.img + 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + [...] + 000001e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 000001f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + read 512/512 bytes at offset 0 + 512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec) + +This error is caught by blkverify: + + $ ./qemu-io -c 'read 0 512' blkverify:a.img:b.img + blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0 + +A more realistic scenario is verifying the installation of a guest OS: + + $ ./qemu-img create raw.img 16G + $ ./qemu-img create -f qcow2 test.qcow2 16G + $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \ + -drive file=blkverify:raw.img:test.qcow2 + +If the installation is aborted when blkverify detects corruption, use qemu-io +to explore the contents of the disk image at the sector in question. -- 1.7.2.2