On 04/13/2012 12:23 PM, Paolo Bonzini wrote: > This patch adds the implementation of a new job that mirrors a disk to > a new image while letting the guest continue using the old image. > The target is treated as a "black box" and data is copied from the > source to the target in the background. > > The mirror job is never-ending, but it is logically structured into > two phases: 1) copy all data as fast as possible until the target > first gets in sync with the source; 2) keep target in sync and > ensure that reopening to the target gets a correct (full) copy > of the source data. > > The second phase is indicated by the progress in "info block-jobs" > reporting the current offset to be equal to the length of the file. > When the job is cancelled in the second phase, QEMU will run the > job until the source is clean and quiescent, then it will report > successful completion of the job. (Note that it could already happen > that management lost the race against QEMU and got a completion > event instead of cancellation). 
> > Signed-off-by: Paolo Bonzini <pbonz...@redhat.com> > --- > Makefile.objs | 2 +- > block/mirror.c | 236 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > block_int.h | 5 ++ > trace-events | 4 + > 4 files changed, 246 insertions(+), 1 deletion(-) > create mode 100644 block/mirror.c > > diff --git a/Makefile.objs b/Makefile.objs > index 5c3bcda..1679461 100644 > --- a/Makefile.objs > +++ b/Makefile.objs > @@ -53,7 +53,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o > qcow2-snapshot.o qcow > block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o > block-nested-y += qed-check.o > block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o > -block-nested-y += stream.o > +block-nested-y += stream.o mirror.o > block-nested-$(CONFIG_WIN32) += raw-win32.o > block-nested-$(CONFIG_POSIX) += raw-posix.o > block-nested-$(CONFIG_LIBISCSI) += iscsi.o > diff --git a/block/mirror.c b/block/mirror.c > new file mode 100644 > index 0000000..5a3395a > --- /dev/null > +++ b/block/mirror.c > @@ -0,0 +1,236 @@ > +/* > + * Image mirroring > + * > + * Copyright Red Hat, Inc. 2012 > + * > + * Authors: > + * Paolo Bonzini <pbonz...@redhat.com> > + * > + * This work is licensed under the terms of the GNU LGPL, version 2 or later. > + * See the COPYING.LIB file in the top-level directory. > + * > + */ > + > +#include "trace.h" > +#include "block_int.h" > +#include "qemu/ratelimit.h" > + > +enum { > + /* > + * Size of data buffer for populating the image file. This should be > large > + * enough to process multiple clusters in a single call, so that > populating > + * contiguous regions of the image is efficient. 
> + */ > + BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */ > +}; > + > +#define SLICE_TIME 100000000ULL /* ns */ > + > +typedef struct MirrorBlockJob { > + BlockJob common; > + RateLimit limit; > + BlockDriverState *target; > + bool full; > +} MirrorBlockJob; > + > +static int coroutine_fn mirror_populate(BlockDriverState *source, > + BlockDriverState *target, > + int64_t sector_num, int nb_sectors, > + void *buf) > +{ > + struct iovec iov = { > + .iov_base = buf, > + .iov_len = nb_sectors * 512, > + }; > + QEMUIOVector qiov; > + int ret; > + > + qemu_iovec_init_external(&qiov, &iov, 1); > + > + /* Copy-on-read the unallocated clusters */ > + ret = bdrv_co_readv(source, sector_num, nb_sectors, &qiov); > + if (ret < 0) { > + return ret; > + } > + return bdrv_co_writev(target, sector_num, nb_sectors, &qiov); > +} > + > +static void coroutine_fn mirror_run(void *opaque) > +{ > + MirrorBlockJob *s = opaque; > + BlockDriverState *bs = s->common.bs; > + BlockDriverState *base; > + int64_t sector_num, end; > + int ret = 0; > + int n; > + bool synced = false; > + void *buf; > + > + if (block_job_is_cancelled(&s->common)) { > + goto immediate_exit; > + } > + > + s->common.len = bdrv_getlength(bs); > + if (s->common.len < 0) { > + block_job_complete(&s->common, s->common.len); > + return; > + } > + > + base = s->full ? NULL : bs->backing_hd; > + end = s->common.len >> BDRV_SECTOR_BITS; > + buf = qemu_blockalign(bs, BLOCK_SIZE); > + > + /* First part, loop on the sectors and initialize the dirty bitmap. 
*/ > + for (sector_num = 0; sector_num < end; ) { > + int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1; > + ret = bdrv_co_is_allocated_above(bs, base, > + sector_num, next - sector_num, &n); > + > + if (ret < 0) { > + break; > + } else if (ret == 1) { > + bdrv_set_dirty(bs, sector_num, n); > + sector_num = next; > + } else { > + sector_num += n; > + } > + } > + > + if (ret < 0) { > + block_job_complete(&s->common, ret); > + } > + > + sector_num = -1; > + for (;;) { > + int64_t cnt; > + s->common.busy = true; > + if (bdrv_get_dirty_count(bs) == 0) { > + /* Switch out of the streaming phase. From now on, if the > + * job is cancelled we will actually complete all pending > + * I/O and report completion, so that drive-reopen can be > + * used to pivot to the mirroring target. > + */ > + synced = true; > + sector_num = -1; > + s->common.offset = end * BDRV_SECTOR_SIZE; > + } > + > + if (synced && block_job_is_cancelled(&s->common)) { > + /* The dirty bitmap is not updated while operations are pending. > + * If we're about to exit, wait for pending operations or we may > + * exit while the source has dirty data to copy! > + */ > + while (bdrv_get_dirty_count(bs) == 0 && > + !QLIST_EMPTY(&bs->tracked_requests)) { > + qemu_aio_wait(); > + } > + } > + > + if (bdrv_get_dirty_count(bs) != 0) { > + int nb_sectors; > + sector_num = bdrv_get_next_dirty(bs, sector_num); > + nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - sector_num); > + trace_mirror_one_iteration(s, sector_num); > + bdrv_reset_dirty(bs, sector_num, BDRV_SECTORS_PER_DIRTY_CHUNK); > + ret = mirror_populate(bs, s->target, sector_num, nb_sectors, > buf); > + if (ret < 0) { > + break; > + } > + } > + > + ret = 0; > + cnt = bdrv_get_dirty_count(bs); > + if (synced) { > + if (!block_job_is_cancelled(&s->common)) { > + s->common.busy = false; > + co_sleep_ns(rt_clock, cnt == 0 ? SLICE_TIME : 0); > + } else if (cnt == 0 && QLIST_EMPTY(&bs->tracked_requests)) { > + /* The two disks are in sync. 
Exit and report > + * successful completion. > + */ > + s->common.cancelled = false; > + break; > + } > + > + /* We get here either to poll the target, or because the job > + * was cancelled. In the latter case, we still have an > + * opportunity to do I/O (without going to sleep) before > + * exiting. > + */ > + } else { > + uint64_t delay_ns; > + > + /* Publish progress */ > + s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE; > + > + if (s->common.speed) { > + delay_ns = ratelimit_calculate_delay(&s->limit, > BDRV_SECTORS_PER_DIRTY_CHUNK); > + } else { > + delay_ns = 0; > + } > + > + /* Note that even when no rate limit is applied we need to yield > + * with no pending I/O here so that qemu_aio_flush() returns. > + */ > + s->common.busy = false; > + co_sleep_ns(rt_clock, delay_ns); > + if (block_job_is_cancelled(&s->common)) { > + break; > + } > + } > + } > + > +immediate_exit: > + bdrv_set_dirty_tracking(bs, false); > + bdrv_close(s->target); > + bdrv_delete(s->target); > + block_job_complete(&s->common, ret); > +} > + > +static int mirror_set_speed(BlockJob *job, int64_t value) > +{ > + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); > + > + if (value < 0) { > + return -EINVAL; > + } > + ratelimit_set_speed(&s->limit, value / BDRV_SECTOR_SIZE, SLICE_TIME); > + return 0; > +} > + > +static BlockJobType mirror_job_type = { > + .instance_size = sizeof(MirrorBlockJob), > + .job_type = "mirror", > + .set_speed = mirror_set_speed, > +}; > + > +int mirror_start(BlockDriverState *bs, > + const char *target, BlockDriver *drv, int flags, > + BlockDriverCompletionFunc *cb, > + void *opaque, bool full) > +{ > + MirrorBlockJob *s; > + int ret; > + > + s = block_job_create(&mirror_job_type, bs, cb, opaque); > + if (!s) { > + return -EBUSY; /* bs must already be in use */ > + } > + > + s->target = bdrv_new(""); > + ret = bdrv_open(s->target, target, > + flags | BDRV_O_NO_BACKING | BDRV_O_NO_FLUSH | > BDRV_O_CACHE_WB, > + drv); > + > + if 
(ret < 0) { > + bdrv_delete(s->target); > + return ret; > + } > + > + s->full = full; > + bdrv_set_dirty_tracking(bs, true); > + s->common.co = qemu_coroutine_create(mirror_run); > + trace_mirror_start(bs, s, s->common.co, opaque); > + qemu_coroutine_enter(s->common.co, s); > + return 0; > +}
Something to note: mirror_start() will leave the BlockDriverState busy, and the block job dangling, if the bdrv_open() fails (for instance, unable to open an existing image: https://bugzilla.redhat.com/show_bug.cgi?id=814102). Re-arranging mirror_start() to not create the block job if opening the BDS fails should fix this, like so (note that qemu_coroutine_enter() must still be called after creating the coroutine, as in the original, otherwise the job is never started): int mirror_start(BlockDriverState *bs, const char *target, BlockDriver *drv, int flags, int64_t speed, BlockDriverCompletionFunc *cb, void *opaque, bool full) { MirrorBlockJob *s; BlockDriverState *target_bs; int ret = 0; target_bs = bdrv_new(""); ret = bdrv_open(target_bs, target, flags | BDRV_O_NO_BACKING | BDRV_O_NO_FLUSH | BDRV_O_CACHE_WB, drv); if (ret < 0) { bdrv_delete(target_bs); goto exit; } s = block_job_create(&mirror_job_type, bs, speed, cb, opaque); if (!s) { bdrv_delete(target_bs); ret = -EBUSY; /* bs must already be in use */ goto exit; } s->target = target_bs; s->full = full; bdrv_set_dirty_tracking(bs, true); s->common.co = qemu_coroutine_create(mirror_run); trace_mirror_start(bs, s, s->common.co, opaque); qemu_coroutine_enter(s->common.co, s); exit: return ret; } > diff --git a/block_int.h b/block_int.h > index eae24d2..683d59d 100644 > --- a/block_int.h > +++ b/block_int.h > @@ -432,4 +432,9 @@ int stream_start(BlockDriverState *bs, BlockDriverState > *base, > const char *base_id, BlockDriverCompletionFunc *cb, > void *opaque); > > +int mirror_start(BlockDriverState *bs, > + const char *target, BlockDriver *drv, int flags, > + BlockDriverCompletionFunc *cb, > + void *opaque, bool full); > + > #endif /* BLOCK_INT_H */ > diff --git a/trace-events b/trace-events > index a5f276d..23aad83 100644 > --- a/trace-events > +++ b/trace-events > @@ -71,6 +71,10 @@ bdrv_co_write_zeroes(void *bs, int64_t sector_num, int > nb_sector) "bs %p sector_ > bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, > void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p" > bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, > 
int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num > %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d" > > +# block/mirror.c > +mirror_one_iteration(void *s, int64_t sector_num) "s %p sector_num %"PRId64"" > +mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p > opaque %p" > + > # block/stream.c > stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int > is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d" > stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p > base %p s %p co %p opaque %p"