On Wed, May 08, 2013 at 02:39:25PM +0200, Kevin Wolf wrote: > Am 29.04.2013 um 09:42 hat Stefan Hajnoczi geschrieben: > > diff --git a/include/block/blockjob.h b/include/block/blockjob.h > > index c290d07..6f42495 100644 > > --- a/include/block/blockjob.h > > +++ b/include/block/blockjob.h > > @@ -50,6 +50,13 @@ typedef struct BlockJobType { > > * manually. > > */ > > void (*complete)(BlockJob *job, Error **errp); > > + > > + /** tracked requests */ > > + int coroutine_fn (*before_read)(BlockDriverState *bs, int64_t > > sector_num, > > + int nb_sectors, QEMUIOVector *qiov); > > + int coroutine_fn (*before_write)(BlockDriverState *bs, int64_t > > sector_num, > > + int nb_sectors, QEMUIOVector *qiov); > > + > > } BlockJobType; > > This is actually a sign that a block job isn't the right tool. Jobs are > something that runs in the background and doesn't have callbacks. You > really want to have a filter here (that happens to be coupled to a job). > Need the BlockBackend split before we can do this right. > > The second thing that this conflicts with is generalising block jobs to > generic background jobs. > > Each hack like this that we accumulate makes it harder to get the real > thing eventually.
In v3 I added a slightly cleaner interface: bdrv_set_before_write_cb(bs, backup_before_write); This way the "before write" callback is not tied to block jobs and could be reused in the future. The alternative is to create a BlockDriver that mostly delegates plus uses bdrv_swap(). The boilerplate involved in that is ugly though, so I'm using bdrv_set_before_write_cb() instead. > > > > /** > > @@ -103,6 +110,9 @@ struct BlockJob { > > /** Speed that was set with @block_job_set_speed. */ > > int64_t speed; > > > > + /** tracked requests */ > > + int cluster_size; > > Sure that this is the right comment here? > > Does really every job need a cluster size? Dropped in v3. > > diff --git a/block.c b/block.c > > index aa9a533..c5c09b7 100644 > > --- a/block.c > > +++ b/block.c > > @@ -54,6 +54,7 @@ > > typedef enum { > > BDRV_REQ_COPY_ON_READ = 0x1, > > BDRV_REQ_ZERO_WRITE = 0x2, > > + BDRV_REQ_BACKUP_ONLY = 0x4, > > } BdrvRequestFlags; > > Without having read the rest of the code, it's unclear to me what this > new flag means. Could use a comment. (Though it has "backup" in its name, > so I suspect having this in generic block layer is wrong.) Agreed. Dropped in v3. > > static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); > > @@ -1902,6 +1903,22 @@ void bdrv_round_to_clusters(BlockDriverState *bs, > > } > > } > > > > +/** > > + * Round a region to job cluster boundaries > > + */ > > +static void round_to_job_clusters(BlockDriverState *bs, > > + int64_t sector_num, int nb_sectors, > > + int job_cluster_size, > > + int64_t *cluster_sector_num, > > + int *cluster_nb_sectors) > > +{ > > + int64_t c = job_cluster_size / BDRV_SECTOR_SIZE; > > + > > + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); > > + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + > > + nb_sectors, c); > > +} > > This function has really nothing to do with jobs except that it happens > to be useful in some function actually related to jobs. 
> > It should probably be renamed and bdrv_round_to_clusters() should call > into it. Dropped in v3. block/backup.c works in bitmap granularity units so we don't need to mess with sectors and cluster sizes. > > + > > static bool tracked_request_overlaps(BdrvTrackedRequest *req, > > int64_t sector_num, int nb_sectors) { > > /* aaaa bbbb */ > > @@ -1916,7 +1933,9 @@ static bool > > tracked_request_overlaps(BdrvTrackedRequest *req, > > } > > > > static void coroutine_fn wait_for_overlapping_requests(BlockDriverState > > *bs, > > - int64_t sector_num, int nb_sectors) > > + int64_t sector_num, > > + int nb_sectors, > > + int > > job_cluster_size) > > { > > BdrvTrackedRequest *req; > > int64_t cluster_sector_num; > > @@ -1932,6 +1951,11 @@ static void coroutine_fn > > wait_for_overlapping_requests(BlockDriverState *bs, > > bdrv_round_to_clusters(bs, sector_num, nb_sectors, > > &cluster_sector_num, &cluster_nb_sectors); > > > > + if (job_cluster_size) { > > + round_to_job_clusters(bs, sector_num, nb_sectors, job_cluster_size, > > + &cluster_sector_num, &cluster_nb_sectors); > > + } > > + > > do { > > retry = false; > > QLIST_FOREACH(req, &bs->tracked_requests, list) { > > @@ -2507,12 +2531,24 @@ static int coroutine_fn > > bdrv_co_do_readv(BlockDriverState *bs, > > bs->copy_on_read_in_flight++; > > } > > > > - if (bs->copy_on_read_in_flight) { > > - wait_for_overlapping_requests(bs, sector_num, nb_sectors); > > + int job_cluster_size = bs->job && bs->job->cluster_size ? 
> > + bs->job->cluster_size : 0; > > + > > + if (bs->copy_on_read_in_flight || job_cluster_size) { > > + wait_for_overlapping_requests(bs, sector_num, nb_sectors, > > + job_cluster_size); > > } > > > > tracked_request_begin(&req, bs, sector_num, nb_sectors, false); > > > > + if (bs->job && bs->job->job_type->before_read) { > > + ret = bs->job->job_type->before_read(bs, sector_num, nb_sectors, > > qiov); > > + if ((ret < 0) || (flags & BDRV_REQ_BACKUP_ONLY)) { > > + /* Note: We do not return any data to the caller */ > > + goto out; > > + } > > + } > > Ugh, so we're introducing calls of bdrv_co_do_readv() that don't > actually read, but only cause some side-effects of reads, except if > there is no block job running? > > Is it intentional that these fake requests are considered for I/O > throttling? Dropped in v3, using a separate queue of in-flight CoW requests in block/backup.c. > > + > > if (flags & BDRV_REQ_COPY_ON_READ) { > > int pnum; > > > > @@ -2556,6 +2592,17 @@ int coroutine_fn > > bdrv_co_copy_on_readv(BlockDriverState *bs, > > BDRV_REQ_COPY_ON_READ); > > } > > > > +int coroutine_fn bdrv_co_backup(BlockDriverState *bs, > > + int64_t sector_num, int nb_sectors) > > +{ > > + if (!bs->job) { > > + return -ENOTSUP; > > + } > > Don't you expect a very specific job to be running, or is it acceptable > to have streaming running? It looks a bit arbitrary. > > And you make the assumption that the functionality (running only > side-effects of read) can only work correctly in the context of jobs (or > actually not just in the context of jobs, but only when a job is running > at the same time). Not sure why. > > I suspect you're not really checking what you wanted to check here. Dropped in v3. 
> > + > > + return bdrv_co_do_readv(bs, sector_num, nb_sectors, NULL, > > + BDRV_REQ_BACKUP_ONLY); > > +} > > + > > static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, > > int64_t sector_num, int nb_sectors) > > { > > @@ -2613,12 +2660,23 @@ static int coroutine_fn > > bdrv_co_do_writev(BlockDriverState *bs, > > bdrv_io_limits_intercept(bs, true, nb_sectors); > > } > > > > - if (bs->copy_on_read_in_flight) { > > - wait_for_overlapping_requests(bs, sector_num, nb_sectors); > > + int job_cluster_size = bs->job && bs->job->cluster_size ? > > + bs->job->cluster_size : 0; > > + > > + if (bs->copy_on_read_in_flight || job_cluster_size) { > > + wait_for_overlapping_requests(bs, sector_num, nb_sectors, > > + job_cluster_size); > > } > > > > tracked_request_begin(&req, bs, sector_num, nb_sectors, true); > > > > + if (bs->job && bs->job->job_type->before_write) { > > + ret = bs->job->job_type->before_write(bs, sector_num, nb_sectors, > > qiov); > > + if (ret < 0) { > > + goto out; > > + } > > + } > > + > > if (flags & BDRV_REQ_ZERO_WRITE) { > > ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); > > } else { > > @@ -2637,6 +2695,7 @@ static int coroutine_fn > > bdrv_co_do_writev(BlockDriverState *bs, > > bs->wr_highest_sector = sector_num + nb_sectors - 1; > > } > > > > +out: > > tracked_request_end(&req); > > > > return ret; > > The changes to block.c and friends should be a separate patch, so let me > sum up my expectations here: This patch should be as small as possible, > it should mention the word "backup" as little as possible and it should > build successfully without backup.o. > > This is the very minimum at which we can start talking about the patch > and whether we do proper block filters or what the best way is to work > around the lack of them today. > > And then we get to things like fake requests which aren't my favourite > design either... 
Split into two patches for v3: "block: add bdrv_set_before_write_cb()" and "block: add basic backup support to block driver". This patch no longer touches block.c or headers. > > diff --git a/block/Makefile.objs b/block/Makefile.objs > > index 6c4b5bc..dcab6d3 100644 > > --- a/block/Makefile.objs > > +++ b/block/Makefile.objs > > @@ -19,5 +19,6 @@ endif > > common-obj-y += stream.o > > common-obj-y += commit.o > > common-obj-y += mirror.o > > +common-obj-y += backup.o > > > > $(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS) > > diff --git a/block/backup.c b/block/backup.c > > new file mode 100644 > > index 0000000..45d8c21 > > --- /dev/null > > +++ b/block/backup.c > > @@ -0,0 +1,252 @@ > > +/* > > + * QEMU backup > > + * > > + * Copyright (C) 2013 Proxmox Server Solutions > > + * > > + * Authors: > > + * Dietmar Maurer (diet...@proxmox.com) > > + * > > + * This work is licensed under the terms of the GNU GPL, version 2 or > > later. > > + * See the COPYING file in the top-level directory. > > + * > > + */ > > + > > +#include <stdio.h> > > +#include <errno.h> > > +#include <unistd.h> > > + > > +#include "block/block.h" > > +#include "block/block_int.h" > > +#include "block/blockjob.h" > > +#include "qemu/ratelimit.h" > > + > > +#define DEBUG_BACKUP 0 > > + > > +#define DPRINTF(fmt, ...) \ > > + do { \ > > + if (DEBUG_BACKUP) { \ > > + fprintf(stderr, "backup: " fmt, ## __VA_ARGS__); \ > > + } \ > > + } while (0) > > + > > +#define BACKUP_CLUSTER_BITS 16 > > +#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS) > > +#define BACKUP_BLOCKS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE) > > + > > +#define SLICE_TIME 100000000ULL /* ns */ > > + > > +typedef struct BackupBlockJob { > > + BlockJob common; > > + BlockDriverState *target; > > + RateLimit limit; > > + CoRwlock rwlock; > > flush_rwlock? Thanks, renamed in v3. 
> > + uint64_t sectors_read; > > + HBitmap *bitmap; > > +} BackupBlockJob; > > + > > +static int coroutine_fn backup_do_cow(BlockDriverState *bs, > > + int64_t sector_num, int nb_sectors) > > +{ > > + assert(bs); > > + BackupBlockJob *job = (BackupBlockJob *)bs->job; > > + assert(job); > > + > > + BlockDriver *drv = bs->drv; > > + struct iovec iov; > > + QEMUIOVector bounce_qiov; > > + void *bounce_buffer = NULL; > > + int ret = 0; > > + > > + qemu_co_rwlock_rdlock(&job->rwlock); > > + > > + int64_t start, end; > > + > > + start = sector_num / BACKUP_BLOCKS_PER_CLUSTER; > > + end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_BLOCKS_PER_CLUSTER); > > + > > + DPRINTF("brdv_co_backup_cow enter %s C%" PRId64 " %" PRId64 " %d\n", > > + bdrv_get_device_name(bs), start, sector_num, nb_sectors); > > + > > + for (; start < end; start++) { > > + if (hbitmap_get(job->bitmap, start)) { > > + DPRINTF("brdv_co_backup_cow skip C%" PRId64 "\n", start); > > + continue; /* already copied */ > > + } > > + > > + /* immediately set bitmap (avoid coroutine race) */ > > + hbitmap_set(job->bitmap, start, 1); > > + > > + DPRINTF("brdv_co_backup_cow C%" PRId64 "\n", start); > > + > > + if (!bounce_buffer) { > > + iov.iov_len = BACKUP_CLUSTER_SIZE; > > + iov.iov_base = bounce_buffer = qemu_blockalign(bs, > > iov.iov_len); > > + qemu_iovec_init_external(&bounce_qiov, &iov, 1); > > + } > > + > > + ret = drv->bdrv_co_readv(bs, start * BACKUP_BLOCKS_PER_CLUSTER, > > + BACKUP_BLOCKS_PER_CLUSTER, > > + &bounce_qiov); > > Why do you bypass the block layer here? Necessary because the request would deadlock due to wait_for_overlapping_requests() being called - this internal read overlaps with the guest's write. Anyway, dropped in v3 since I'm using a different approach. 
> > + if (ret < 0) { > > + DPRINTF("brdv_co_backup_cow bdrv_read C%" PRId64 " failed\n", > > + start); > > + goto out; > > + } > > + > > + job->sectors_read += BACKUP_BLOCKS_PER_CLUSTER; > > + > > + if (!buffer_is_zero(bounce_buffer, BACKUP_CLUSTER_SIZE)) { > > + ret = bdrv_co_writev(job->target, start * > > BACKUP_BLOCKS_PER_CLUSTER, > > + BACKUP_BLOCKS_PER_CLUSTER, > > + &bounce_qiov); > > + if (ret < 0) { > > + DPRINTF("brdv_co_backup_cow dump_cluster_cb C%" PRId64 > > + " failed\n", start); > > + goto out; > > + } > > + } > > + > > + DPRINTF("brdv_co_backup_cow done C%" PRId64 "\n", start); > > + } > > + > > +out: > > + if (bounce_buffer) { > > + qemu_vfree(bounce_buffer); > > + } > > + > > + qemu_co_rwlock_unlock(&job->rwlock); > > + > > + return ret; > > +} > > + > > +static int coroutine_fn backup_before_read(BlockDriverState *bs, > > + int64_t sector_num, > > + int nb_sectors, QEMUIOVector > > *qiov) > > +{ > > + return backup_do_cow(bs, sector_num, nb_sectors); > > +} > > Why do you do copy on _read_? Without actually making use of the read > data like COR in block.c, but by reading the data a second time. > > I mean, obviously this is the implementation of bdrv_co_backup(), but it > doesn't make a lot of sense for guest requests. You could directly call > into the function from below instead of going through block.c for > bdrv_co_backup() and get rid of the fake requests. Dropped in v3. 
> > +static int coroutine_fn backup_before_write(BlockDriverState *bs, > > + int64_t sector_num, > > + int nb_sectors, QEMUIOVector > > *qiov) > > +{ > > + return backup_do_cow(bs, sector_num, nb_sectors); > > +} > > + > > +static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) > > +{ > > + BackupBlockJob *s = container_of(job, BackupBlockJob, common); > > + > > + if (speed < 0) { > > + error_set(errp, QERR_INVALID_PARAMETER, "speed"); > > + return; > > + } > > + ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); > > +} > > + > > +static BlockJobType backup_job_type = { > > + .instance_size = sizeof(BackupBlockJob), > > + .before_read = backup_before_read, > > + .before_write = backup_before_write, > > + .job_type = "backup", > > + .set_speed = backup_set_speed, > > +}; > > + > > +static void coroutine_fn backup_run(void *opaque) > > +{ > > + BackupBlockJob *job = opaque; > > + BlockDriverState *bs = job->common.bs; > > + assert(bs); > > + > > + int64_t start, end; > > + > > + start = 0; > > + end = DIV_ROUND_UP(bdrv_getlength(bs) / BDRV_SECTOR_SIZE, > > + BACKUP_BLOCKS_PER_CLUSTER); > > + > > + job->bitmap = hbitmap_alloc(end, 0); > > + > > + DPRINTF("backup_run start %s %" PRId64 " %" PRId64 "\n", > > + bdrv_get_device_name(bs), start, end); > > + > > + int ret = 0; > > + > > + for (; start < end; start++) { > > + if (block_job_is_cancelled(&job->common)) { > > + ret = -1; > > + break; > > + } > > + > > + /* we need to yield so that qemu_aio_flush() returns. 
> > + * (without, VM does not reboot) > > + */ > > + if (job->common.speed) { > > + uint64_t delay_ns = ratelimit_calculate_delay( > > + &job->limit, job->sectors_read); > > + job->sectors_read = 0; > > + block_job_sleep_ns(&job->common, rt_clock, delay_ns); > > + } else { > > + block_job_sleep_ns(&job->common, rt_clock, 0); > > + } > > + > > + if (block_job_is_cancelled(&job->common)) { > > + ret = -1; > > + break; > > + } > > + > > + if (hbitmap_get(job->bitmap, start)) { > > + continue; /* already copied */ > > + } > > Isn't this duplicated and would be checked in the bdrv_co_backup() call > below anyway? > > The semantic difference is that job->common.offset wouldn't be increased > with this additional check. Looks like a bug. Thanks, fixed in v3. > > + DPRINTF("backup_run loop C%" PRId64 "\n", start); > > + > > + /** > > + * This triggers a cluster copy > > + * Note: avoid direct call to brdv_co_backup_cow, because > > + * this does not call tracked_request_begin() > > + */ > > + ret = bdrv_co_backup(bs, start*BACKUP_BLOCKS_PER_CLUSTER, 1); > > + if (ret < 0) { > > + break; > > + } > > + /* Publish progress */ > > + job->common.offset += BACKUP_CLUSTER_SIZE; > > + } > > + > > + /* wait until pending backup_do_cow()calls have completed */ > > Missing space. Thanks, fixed in v3.