Block migration can submit multiple AIO reads for the same sector/chunk, but completion of such reads can happen out of order:
migration guest - get_dirty(N) - aio_read(N) - clear_dirty(N) write(N) set_dirty(N) - get_dirty(N) - aio_read(N) If the first aio_read completes after the second, stale data will be migrated to the destination. Fix by not allowing multiple AIOs inflight for the same sector. Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com> Index: qemu-kvm/block-migration.c =================================================================== --- qemu-kvm.orig/block-migration.c +++ qemu-kvm/block-migration.c @@ -49,12 +49,14 @@ typedef struct BlkMigDevState { int64_t total_sectors; int64_t dirty; QSIMPLEQ_ENTRY(BlkMigDevState) entry; + unsigned long *aio_bitmap; } BlkMigDevState; typedef struct BlkMigBlock { uint8_t *buf; BlkMigDevState *bmds; int64_t sector; + int nr_sectors; struct iovec iov; QEMUIOVector qiov; BlockDriverAIOCB *aiocb; @@ -140,6 +142,57 @@ static inline long double compute_read_b return (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time; } +static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector) +{ + int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; + + if (bmds->aio_bitmap && + (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) { + return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & + (1UL << (chunk % (sizeof(unsigned long) * 8)))); + } else { + return 0; + } +} + +static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num, + int nb_sectors, int set) +{ + int64_t start, end; + unsigned long val, idx, bit; + + start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; + end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK; + + for (; start <= end; start++) { + idx = start / (sizeof(unsigned long) * 8); + bit = start % (sizeof(unsigned long) * 8); + val = bmds->aio_bitmap[idx]; + if (set) { + if (!(val & (1UL << bit))) { + val |= 1UL << bit; + } + } else { + if (val & (1UL << bit)) { + val &= ~(1UL << bit); + } + } + bmds->aio_bitmap[idx] = val; + } +} + +static void alloc_aio_bitmap(BlkMigDevState *bmds) +{ + BlockDriverState *bs = bmds->bs; + int64_t bitmap_size; + + bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) + + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; + bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; + + bmds->aio_bitmap = qemu_mallocz(bitmap_size); +} + static void blk_mig_read_cb(void *opaque, int ret) { BlkMigBlock *blk = opaque; @@ -151,6 +204,7 @@ static void blk_mig_read_cb(void *opaque add_avg_read_time(blk->time); QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry); + bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0); block_mig_state.submitted--; block_mig_state.read_done++; @@ -194,6 +248,7 @@ static int mig_save_device_bulk(Monitor blk->buf = qemu_malloc(BLOCK_SIZE); blk->bmds = bmds; blk->sector = cur_sector; + blk->nr_sectors = nr_sectors; blk->iov.iov_base = blk->buf; blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; @@ -248,6 +303,7 @@ static void init_blk_migration_it(void * bmds->total_sectors = sectors; bmds->completed_sectors = 0; bmds->shared_base = block_mig_state.shared_base; + alloc_aio_bitmap(bmds); block_mig_state.total_sector_sum += sectors; @@ -329,6 +385,8 @@ static int mig_save_device_dirty(Monitor int nr_sectors; for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) { + if (bmds_aio_inflight(bmds, sector)) + qemu_aio_flush(); if (bdrv_get_dirty(bmds->bs, sector)) { if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { @@ -340,6 +398,7 @@ static int mig_save_device_dirty(Monitor blk->buf = qemu_malloc(BLOCK_SIZE); blk->bmds = bmds; blk->sector = sector; + blk->nr_sectors = nr_sectors; if (is_async) { blk->iov.iov_base = blk->buf; @@ -354,6 +413,7 @@ static int mig_save_device_dirty(Monitor goto error; } block_mig_state.submitted++; + bmds_set_aio_inflight(bmds, sector, nr_sectors, 1); } else { if (bdrv_read(bmds->bs, sector, blk->buf, nr_sectors) < 0) { @@ -474,6 +534,7 @@ static void blk_mig_cleanup(Monitor *mon while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); + qemu_free(bmds->aio_bitmap); qemu_free(bmds); }