On 04/13/2012 12:23 PM, Paolo Bonzini wrote: > This patch adds the implementation of a new job that mirrors a disk to > a new image while letting the guest continue using the old image. > The target is treated as a "black box" and data is copied from the > source to the target in the background. > > The mirror job is never-ending, but it is logically structured into > two phases: 1) copy all data as fast as possible until the target > first gets in sync with the source; 2) keep target in sync and > ensure that reopening to the target gets a correct (full) copy > of the source data. > > The second phase is indicated by the progress in "info block-jobs" > reporting the current offset to be equal to the length of the file. > When the job is cancelled in the second phase, QEMU will run the > job until the source is clean and quiescent, then it will report > successful completion of the job. (Note that it could already happen > that management lost the race against QEMU and got a completion > event instead of cancellation). 
> > Signed-off-by: Paolo Bonzini <pbonz...@redhat.com> > --- > Makefile.objs | 2 +- > block/mirror.c | 236 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > block_int.h | 5 ++ > trace-events | 4 + > 4 files changed, 246 insertions(+), 1 deletion(-) > create mode 100644 block/mirror.c > > diff --git a/Makefile.objs b/Makefile.objs > index 5c3bcda..1679461 100644 > --- a/Makefile.objs > +++ b/Makefile.objs > @@ -53,7 +53,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o > qcow2-snapshot.o qcow > block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o > block-nested-y += qed-check.o > block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o > -block-nested-y += stream.o > +block-nested-y += stream.o mirror.o > block-nested-$(CONFIG_WIN32) += raw-win32.o > block-nested-$(CONFIG_POSIX) += raw-posix.o > block-nested-$(CONFIG_LIBISCSI) += iscsi.o > diff --git a/block/mirror.c b/block/mirror.c > new file mode 100644 > index 0000000..5a3395a > --- /dev/null > +++ b/block/mirror.c > @@ -0,0 +1,236 @@ > +/* > + * Image mirroring > + * > + * Copyright Red Hat, Inc. 2012 > + * > + * Authors: > + * Paolo Bonzini <pbonz...@redhat.com> > + * > + * This work is licensed under the terms of the GNU LGPL, version 2 or later. > + * See the COPYING.LIB file in the top-level directory. > + * > + */ > + > +#include "trace.h" > +#include "block_int.h" > +#include "qemu/ratelimit.h" > + > +enum { > + /* > + * Size of data buffer for populating the image file. This should be > large > + * enough to process multiple clusters in a single call, so that > populating > + * contiguous regions of the image is efficient. 
> + */ > + BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */ > +}; > + > +#define SLICE_TIME 100000000ULL /* ns */ > + > +typedef struct MirrorBlockJob { > + BlockJob common; > + RateLimit limit; > + BlockDriverState *target; > + bool full; > +} MirrorBlockJob; > + > +static int coroutine_fn mirror_populate(BlockDriverState *source, > + BlockDriverState *target, > + int64_t sector_num, int nb_sectors, > + void *buf) > +{ > + struct iovec iov = { > + .iov_base = buf, > + .iov_len = nb_sectors * 512, > + }; > + QEMUIOVector qiov; > + int ret; > + > + qemu_iovec_init_external(&qiov, &iov, 1); > + > + /* Copy-on-read the unallocated clusters */ > + ret = bdrv_co_readv(source, sector_num, nb_sectors, &qiov); > + if (ret < 0) { > + return ret; > + } > + return bdrv_co_writev(target, sector_num, nb_sectors, &qiov); > +} > + > +static void coroutine_fn mirror_run(void *opaque) > +{ > + MirrorBlockJob *s = opaque; > + BlockDriverState *bs = s->common.bs; > + BlockDriverState *base; > + int64_t sector_num, end; > + int ret = 0; > + int n; > + bool synced = false; > + void *buf; > + > + if (block_job_is_cancelled(&s->common)) { > + goto immediate_exit; > + } > + > + s->common.len = bdrv_getlength(bs); > + if (s->common.len < 0) { > + block_job_complete(&s->common, s->common.len); > + return; > + } > + > + base = s->full ? NULL : bs->backing_hd; > + end = s->common.len >> BDRV_SECTOR_BITS; > + buf = qemu_blockalign(bs, BLOCK_SIZE); > + > + /* First part, loop on the sectors and initialize the dirty bitmap. 
*/ > + for (sector_num = 0; sector_num < end; ) { > + int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1; > + ret = bdrv_co_is_allocated_above(bs, base, > + sector_num, next - sector_num, &n); > + > + if (ret < 0) { > + break; > + } else if (ret == 1) { > + bdrv_set_dirty(bs, sector_num, n); > + sector_num = next; > + } else { > + sector_num += n; > + } > + } > + > + if (ret < 0) { > + block_job_complete(&s->common, ret); > + } > + > + sector_num = -1; > + for (;;) { > + int64_t cnt; > + s->common.busy = true; > + if (bdrv_get_dirty_count(bs) == 0) { > + /* Switch out of the streaming phase. From now on, if the > + * job is cancelled we will actually complete all pending > + * I/O and report completion, so that drive-reopen can be > + * used to pivot to the mirroring target. > + */ > + synced = true; > + sector_num = -1; > + s->common.offset = end * BDRV_SECTOR_SIZE; > + } > + > + if (synced && block_job_is_cancelled(&s->common)) { > + /* The dirty bitmap is not updated while operations are pending. > + * If we're about to exit, wait for pending operations or we may > + * exit while the source has dirty data to copy! > + */ > + while (bdrv_get_dirty_count(bs) == 0 && > + !QLIST_EMPTY(&bs->tracked_requests)) { > + qemu_aio_wait(); > + } > + } > + > + if (bdrv_get_dirty_count(bs) != 0) { > + int nb_sectors; > + sector_num = bdrv_get_next_dirty(bs, sector_num); > + nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - sector_num); > + trace_mirror_one_iteration(s, sector_num); > + bdrv_reset_dirty(bs, sector_num, BDRV_SECTORS_PER_DIRTY_CHUNK); > + ret = mirror_populate(bs, s->target, sector_num, nb_sectors, > buf); > + if (ret < 0) { > + break; > + } > + } > + > + ret = 0; > + cnt = bdrv_get_dirty_count(bs); > + if (synced) { > + if (!block_job_is_cancelled(&s->common)) { > + s->common.busy = false; > + co_sleep_ns(rt_clock, cnt == 0 ? SLICE_TIME : 0); > + } else if (cnt == 0 && QLIST_EMPTY(&bs->tracked_requests)) { > + /* The two disks are in sync. 
Exit and report > + * successful completion. > + */ > + s->common.cancelled = false; > + break; > + } > + > + /* We get here either to poll the target, or because the job > + * was cancelled. In the latter case, we still have an > + * opportunity to do I/O (without going to sleep) before > + * exiting. > + */ > + } else { > + uint64_t delay_ns; > + > + /* Publish progress */ > + s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE; > + > + if (s->common.speed) { > + delay_ns = ratelimit_calculate_delay(&s->limit, > BDRV_SECTORS_PER_DIRTY_CHUNK); > + } else { > + delay_ns = 0; > + } > + > + /* Note that even when no rate limit is applied we need to yield > + * with no pending I/O here so that qemu_aio_flush() returns. > + */ > + s->common.busy = false; > + co_sleep_ns(rt_clock, delay_ns); > + if (block_job_is_cancelled(&s->common)) { > + break; > + } > + } > + } > + > +immediate_exit: > + bdrv_set_dirty_tracking(bs, false); > + bdrv_close(s->target); > + bdrv_delete(s->target); > + block_job_complete(&s->common, ret); > +} > + > +static int mirror_set_speed(BlockJob *job, int64_t value) > +{ > + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); > + > + if (value < 0) { > + return -EINVAL; > + } > + ratelimit_set_speed(&s->limit, value / BDRV_SECTOR_SIZE, SLICE_TIME); > + return 0; > +} > + > +static BlockJobType mirror_job_type = { > + .instance_size = sizeof(MirrorBlockJob), > + .job_type = "mirror", > + .set_speed = mirror_set_speed, > +}; > + > +int mirror_start(BlockDriverState *bs, > + const char *target, BlockDriver *drv, int flags, > + BlockDriverCompletionFunc *cb, > + void *opaque, bool full) > +{ > + MirrorBlockJob *s; > + int ret; > + > + s = block_job_create(&mirror_job_type, bs, cb, opaque); > + if (!s) { > + return -EBUSY; /* bs must already be in use */ > + } > + > + s->target = bdrv_new(""); > + ret = bdrv_open(s->target, target, > + flags | BDRV_O_NO_BACKING | BDRV_O_NO_FLUSH | > BDRV_O_CACHE_WB, > + drv); > + > + if 
(ret < 0) { > + bdrv_delete(s->target); > + return ret; > + } > + > + s->full = full; > + bdrv_set_dirty_tracking(bs, true); > + s->common.co = qemu_coroutine_create(mirror_run); > + trace_mirror_start(bs, s, s->common.co, opaque); > + qemu_coroutine_enter(s->common.co, s); > + return 0; > +}
Something to note: mirror_start() will leave the BlockDriverState busy, and the block job dangling, if the bdrv_open() fails (for instance, unable to open an existing image: https://bugzilla.redhat.com/show_bug.cgi?id=814102). Re-arranging mirror_start() to not create the block job if opening the BDS fails should fix this, like so (note that qemu_coroutine_enter() must still be called after creating the coroutine, as in the original, otherwise the job is never started): int mirror_start(BlockDriverState *bs, const char *target, BlockDriver *drv, int flags, int64_t speed, BlockDriverCompletionFunc *cb, void *opaque, bool full) { MirrorBlockJob *s; BlockDriverState *target_bs; int ret = 0; target_bs = bdrv_new(""); ret = bdrv_open(target_bs, target, flags | BDRV_O_NO_BACKING | BDRV_O_NO_FLUSH | BDRV_O_CACHE_WB, drv); if (ret < 0) { bdrv_delete(target_bs); goto exit; } s = block_job_create(&mirror_job_type, bs, speed, cb, opaque); if (!s) { bdrv_delete(target_bs); ret = -EBUSY; /* bs must already be in use */ goto exit; } s->target = target_bs; s->full = full; bdrv_set_dirty_tracking(bs, true); s->common.co = qemu_coroutine_create(mirror_run); trace_mirror_start(bs, s, s->common.co, opaque); qemu_coroutine_enter(s->common.co, s); exit: return ret; } > diff --git a/block_int.h b/block_int.h > index eae24d2..683d59d 100644 > --- a/block_int.h > +++ b/block_int.h > @@ -432,4 +432,9 @@ int stream_start(BlockDriverState *bs, BlockDriverState > *base, > const char *base_id, BlockDriverCompletionFunc *cb, > void *opaque); > > +int mirror_start(BlockDriverState *bs, > + const char *target, BlockDriver *drv, int flags, > + BlockDriverCompletionFunc *cb, > + void *opaque, bool full); > + > #endif /* BLOCK_INT_H */ > diff --git a/trace-events b/trace-events > index a5f276d..23aad83 100644 > --- a/trace-events > +++ b/trace-events > @@ -71,6 +71,10 @@ bdrv_co_write_zeroes(void *bs, int64_t sector_num, int > nb_sector) "bs %p sector_ > bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, > void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p" > bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, > 
int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num > %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d" > > +# block/mirror.c > +mirror_one_iteration(void *s, int64_t sector_num) "s %p sector_num %"PRId64"" > +mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p > opaque %p" > + > # block/stream.c > stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int > is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d" > stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p > base %p s %p co %p opaque %p"