During live migration, block drivers with exclusive locking behaviour (such as Sheepdog: http://www.osrg.net/sheepdog/) are problematic, as both source and destination need to have the device open simultaneously. However, the lock is only required while the VM is running, and at most one VM is running at each stage of migration. This patch introduces bdrv_claim and bdrv_release hooks, which can be used to claim and release the lock on VM start and stop, allowing Sheepdog-backed guests to migrate.
This functionality could also be more generally useful. For example, it would be possible to take fcntl() locks on qcow2 files, preventing corruption from two qemu processes concurrently modifying qcow2 metadata. Doing this in bdrv_open() is not possible as it would prevent live migration of guests backed by qcow2 files on a shared filesystem. Signed-off-by: <[email protected]> --- block.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ block.h | 4 ++++ block_int.h | 2 ++ monitor.c | 4 +++- qemu-img.c | 31 +++++++++++++++++++++++++++---- qemu-io.c | 11 ++++++++++- qemu-kvm.c | 2 ++ qemu-nbd.c | 3 ++- vl.c | 7 ++++++- 9 files changed, 100 insertions(+), 14 deletions(-) diff --git a/block.c b/block.c --- a/block.c +++ b/block.c @@ -475,6 +475,40 @@ return 0; } +int bdrv_claim(BlockDriverState *bs) +{ + if (bs->drv && bs->drv->bdrv_claim) + return bs->drv->bdrv_claim(bs); + return 0; +} + +int bdrv_claim_all(void) +{ + BlockDriverState *bs; + + for (bs = bdrv_first; bs != NULL; bs = bs->next) { + if (bdrv_claim(bs) < 0) { + bdrv_release_all(); + return -1; + } + } + return 0; +} + +void bdrv_release(BlockDriverState *bs) +{ + if (bs->drv && bs->drv->bdrv_release) + bs->drv->bdrv_release(bs); +} + +void bdrv_release_all(void) +{ + BlockDriverState *bs; + + for (bs = bdrv_first; bs != NULL; bs = bs->next) + bdrv_release(bs); +} + void bdrv_close(BlockDriverState *bs) { if (bs->drv) { @@ -499,13 +533,10 @@ void bdrv_close_all(void) { - BlockDriverState *bs, *n; + BlockDriverState *bs, *n; - for (bs = bdrv_first, n = bs->next; bs; bs = n, n = bs ? bs->next : NULL) { - if (bs && bs->drv && bs->drv->bdrv_close) { - bs->drv->bdrv_close(bs); - } - } + for (bs = bdrv_first, n = bs->next; bs; bs = n, n = bs ? 
bs->next : NULL) + bdrv_close(bs); } void bdrv_delete(BlockDriverState *bs) @@ -555,15 +586,20 @@ return -ENOTSUP; } + if (bdrv_claim(bs->backing_hd) < 0) + return -EACCES; + total_sectors = bdrv_getlength(bs) >> SECTOR_BITS; for (i = 0; i < total_sectors;) { if (drv->bdrv_is_allocated(bs, i, 65536, &n)) { for(j = 0; j < n; j++) { if (bdrv_read(bs, i, sector, 1) != 0) { + bdrv_release(bs->backing_hd); return -EIO; } if (bdrv_write(bs->backing_hd, i, sector, 1) != 0) { + bdrv_release(bs->backing_hd); return -EIO; } i++; @@ -573,6 +609,8 @@ } } + bdrv_release(bs->backing_hd); + if (drv->bdrv_make_empty) return drv->bdrv_make_empty(bs); diff --git a/block.h b/block.h --- a/block.h +++ b/block.h @@ -58,6 +58,10 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags); int bdrv_open2(BlockDriverState *bs, const char *filename, int flags, BlockDriver *drv); +int bdrv_claim(BlockDriverState *bs); +int bdrv_claim_all(void); +void bdrv_release(BlockDriverState *bs); +void bdrv_release_all(void); void bdrv_close(BlockDriverState *bs); int bdrv_check(BlockDriverState *bs); int bdrv_read(BlockDriverState *bs, int64_t sector_num, diff --git a/block_int.h b/block_int.h --- a/block_int.h +++ b/block_int.h @@ -51,6 +51,8 @@ int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename); int (*bdrv_probe_device)(const char *filename); int (*bdrv_open)(BlockDriverState *bs, const char *filename, int flags); + int (*bdrv_claim)(BlockDriverState *bs); + void (*bdrv_release)(BlockDriverState *bs); int (*bdrv_read)(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors); int (*bdrv_write)(BlockDriverState *bs, int64_t sector_num, diff --git a/monitor.c b/monitor.c --- a/monitor.c +++ b/monitor.c @@ -456,7 +456,9 @@ static void do_quit(Monitor *mon, const QDict *qdict) { - bdrv_close_all(); + if (vm_running) + bdrv_release_all(); + bdrv_close_all(); exit(0); } diff --git a/qemu-img.c b/qemu-img.c --- a/qemu-img.c +++ b/qemu-img.c @@ -471,7 
+471,13 @@ if (bdrv_open2(bs, filename, BRDV_O_FLAGS, drv) < 0) { error("Could not open '%s'", filename); } + if (bdrv_claim(bs) < 0) { + error("Could not claim '%s'", filename); + } ret = bdrv_commit(bs); + bdrv_release(bs); + bdrv_delete(bs); + switch(ret) { case 0: printf("Image committed.\n"); @@ -490,7 +496,6 @@ break; } - bdrv_delete(bs); return 0; } @@ -654,6 +659,8 @@ } out_bs = bdrv_new_open(out_filename, out_fmt); + if (bdrv_claim(out_bs) < 0) + error("Unable to claim '%s'", out_filename); bs_i = 0; bs_offset = 0; @@ -790,6 +797,7 @@ } } } + bdrv_release(out_bs); bdrv_delete(out_bs); for (bs_i = 0; bs_i < bs_n; bs_i++) bdrv_delete(bs[bs_i]); @@ -994,6 +1002,10 @@ error("Could not open '%s'", filename); } + if (action != SNAPSHOT_LIST && bdrv_claim(bs) < 0) { + error("Could not claim '%s'", filename); + } + /* Perform the requested action */ switch(action) { case SNAPSHOT_LIST: @@ -1009,27 +1021,38 @@ sn.date_nsec = tv.tv_usec * 1000; ret = bdrv_snapshot_create(bs, &sn); - if (ret) + if (ret) { + bdrv_release(bs); + bdrv_delete(bs); error("Could not create snapshot '%s': %d (%s)", snapshot_name, ret, strerror(-ret)); + } break; case SNAPSHOT_APPLY: ret = bdrv_snapshot_goto(bs, snapshot_name); - if (ret) + if (ret) { + bdrv_release(bs); + bdrv_delete(bs); error("Could not apply snapshot '%s': %d (%s)", snapshot_name, ret, strerror(-ret)); + } break; case SNAPSHOT_DELETE: ret = bdrv_snapshot_delete(bs, snapshot_name); - if (ret) + if (ret) { + bdrv_release(bs); + bdrv_delete(bs); error("Could not delete snapshot '%s': %d (%s)", snapshot_name, ret, strerror(-ret)); + } break; } /* Cleanup */ + if (action != SNAPSHOT_LIST) + bdrv_release(bs); bdrv_delete(bs); return 0; diff --git a/qemu-io.c b/qemu-io.c --- a/qemu-io.c +++ b/qemu-io.c @@ -1287,6 +1287,13 @@ return 1; } + if (bdrv_claim(bs) < 0) { + fprintf(stderr, "%s: can't claim device %s\n", progname, name); + bdrv_close(bs); + bs = NULL; + return 1; + } + if (growable) { bs->growable = 1; } @@ -1517,7 
+1524,9 @@ */ qemu_aio_flush(); - if (bs) + if (bs) { + bdrv_release(bs); bdrv_close(bs); + } return 0; } diff --git a/qemu-kvm.c b/qemu-kvm.c --- a/qemu-kvm.c +++ b/qemu-kvm.c @@ -2215,6 +2215,8 @@ } } + if (vm_running) + bdrv_release_all(); bdrv_close_all(); pause_all_threads(); diff --git a/qemu-nbd.c b/qemu-nbd.c --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -331,7 +331,7 @@ if (bs == NULL) return 1; - if (bdrv_open(bs, argv[optind], flags) == -1) + if (bdrv_open(bs, argv[optind], flags) < 0 || bdrv_claim(bs) < 0) return 1; fd_size = bs->total_sectors * 512; @@ -470,6 +470,7 @@ qemu_free(data); close(sharing_fds[0]); + bdrv_release(bs); bdrv_close(bs); qemu_free(sharing_fds); if (socket) diff --git a/vl.c b/vl.c --- a/vl.c +++ b/vl.c @@ -3219,7 +3219,7 @@ void vm_start(void) { - if (!vm_running) { + if (!vm_running && bdrv_claim_all() >= 0) { cpu_enable_ticks(); vm_running = 1; vm_state_notify(1, 0); @@ -3293,6 +3293,7 @@ vm_running = 0; pause_all_vcpus(); vm_state_notify(0, reason); + bdrv_release_all(); } } @@ -4178,6 +4179,10 @@ if ((r = qemu_vmstop_requested())) vm_stop(r); } + + if (vm_running) + bdrv_release_all(); + bdrv_close_all(); pause_all_vcpus(); } -- sheepdog mailing list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
