On Mon, Sep 15, 2025 at 01:59:15PM +0200, Juraj Marcin wrote:
> From: Juraj Marcin <[email protected]>
>
> Currently, when postcopy starts, the source VM starts switchover and
> sends a package containing the state of all non-postcopiable devices.
> When the destination loads this package, the switchover is complete and
> the destination VM starts. However, if the device state load fails or
> the destination side crashes, the source side is already in
> POSTCOPY_ACTIVE state and cannot be recovered, even when it has the most
> up-to-date machine state as the destination has not yet started.
>
> This patch introduces a new POSTCOPY_DEVICE state which is active
> while the destination machine is loading the device state, is not yet
> running, and the source side can be resumed in case of a migration
> failure.
>
> To transition from POSTCOPY_DEVICE to POSTCOPY_ACTIVE, the source
> side uses a PONG message that is a response to a PING message processed
> just before the POSTCOPY_RUN command that starts the destination VM.
> Thus, this change does not require any changes on the destination side
> and is effective even with older destination versions.
>
> Signed-off-by: Juraj Marcin <[email protected]>
> ---
> migration/migration.c | 23 ++++++++++++++++++-----
> migration/savevm.h | 2 ++
> migration/trace-events | 1 +
> qapi/migration.json | 8 ++++++--
> tests/qtest/migration/precopy-tests.c | 3 ++-
> 5 files changed, 29 insertions(+), 8 deletions(-)
>
> diff --git a/migration/migration.c b/migration/migration.c
> index 7222e3de13..e63a7487be 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1223,6 +1223,7 @@ bool migration_is_running(void)
>
> switch (s->state) {
> case MIGRATION_STATUS_ACTIVE:
> + case MIGRATION_STATUS_POSTCOPY_DEVICE:
> case MIGRATION_STATUS_POSTCOPY_ACTIVE:
> case MIGRATION_STATUS_POSTCOPY_PAUSED:
> case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
> @@ -1244,6 +1245,7 @@ static bool migration_is_active(void)
> MigrationState *s = current_migration;
>
> return (s->state == MIGRATION_STATUS_ACTIVE ||
> + s->state == MIGRATION_STATUS_POSTCOPY_DEVICE ||
> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
> }
>
> @@ -1366,6 +1368,7 @@ static void fill_source_migration_info(MigrationInfo
> *info)
> break;
> case MIGRATION_STATUS_ACTIVE:
> case MIGRATION_STATUS_CANCELLING:
> + case MIGRATION_STATUS_POSTCOPY_DEVICE:
> case MIGRATION_STATUS_POSTCOPY_ACTIVE:
> case MIGRATION_STATUS_PRE_SWITCHOVER:
> case MIGRATION_STATUS_DEVICE:
> @@ -1419,6 +1422,7 @@ static void
> fill_destination_migration_info(MigrationInfo *info)
> case MIGRATION_STATUS_CANCELLING:
> case MIGRATION_STATUS_CANCELLED:
> case MIGRATION_STATUS_ACTIVE:
> + case MIGRATION_STATUS_POSTCOPY_DEVICE:
> case MIGRATION_STATUS_POSTCOPY_ACTIVE:
> case MIGRATION_STATUS_POSTCOPY_PAUSED:
> case MIGRATION_STATUS_POSTCOPY_RECOVER:
> @@ -1719,6 +1723,7 @@ bool migration_in_postcopy(void)
> MigrationState *s = migrate_get_current();
>
> switch (s->state) {
> + case MIGRATION_STATUS_POSTCOPY_DEVICE:
> case MIGRATION_STATUS_POSTCOPY_ACTIVE:
> case MIGRATION_STATUS_POSTCOPY_PAUSED:
> case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
> @@ -2564,6 +2569,11 @@ static void *source_return_path_thread(void *opaque)
> tmp32 = ldl_be_p(buf);
> trace_source_return_path_thread_pong(tmp32);
> qemu_sem_post(&ms->rp_state.rp_pong_acks);
> + if (tmp32 == QEMU_VM_PING_PACKAGED_LOADED) {
> + trace_source_return_path_thread_dst_started();
> + migrate_set_state(&ms->state,
> MIGRATION_STATUS_POSTCOPY_DEVICE,
> + MIGRATION_STATUS_POSTCOPY_ACTIVE);
Could this race with the migration thread modifying the state concurrently?
To avoid it, we could have a bool, set it here once, and in the iterations
do something like:
diff --git a/migration/migration.c b/migration/migration.c
index 10c216d25d..55230e10ee 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3449,6 +3449,16 @@ static MigIterateState
migration_iteration_run(MigrationState *s)
trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy);
if (in_postcopy) {
+ if (s->postcopy_package_loaded) {
+ assert(s->state == MIGRATION_STATUS_POSTCOPY_DEVICE);
+ migrate_set_state(s->state, MIGRATION_STATUS_POSTCOPY_DEVICE,
+ MIGRATION_STATUS_POSTCOPY_ACTIVE);
+ /*
+ * Since postcopy cannot be re-initiated, this flag will only
+ * be set at most once for QEMU's whole lifecyce.
+ */
+ s->postcopy_package_loaded = false;
+ }
/*
* Iterate in postcopy until all pending data flushed. Note that
* postcopy completion doesn't rely on can_switchover, because when
> + }
> break;
>
> case MIG_RP_MSG_REQ_PAGES:
> @@ -2814,6 +2824,7 @@ static int postcopy_start(MigrationState *ms, Error
> **errp)
> if (migrate_postcopy_ram()) {
> qemu_savevm_send_ping(fb, 3);
> }
> + qemu_savevm_send_ping(fb, QEMU_VM_PING_PACKAGED_LOADED);
Nitpick: some comment would be nice here describing this ping, especially
when it becomes functional. The name does provide some info, but not
extremely clear what PACKAGED_LOADED mean if in a PONG yet.
>
> qemu_savevm_send_postcopy_run(fb);
>
> @@ -2871,7 +2882,7 @@ static int postcopy_start(MigrationState *ms, Error
> **errp)
>
> /* Now, switchover looks all fine, switching to postcopy-active */
> migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE,
> - MIGRATION_STATUS_POSTCOPY_ACTIVE);
> + MIGRATION_STATUS_POSTCOPY_DEVICE);
>
> bql_unlock();
>
> @@ -3035,7 +3046,8 @@ static void migration_completion(MigrationState *s)
>
> if (s->state == MIGRATION_STATUS_ACTIVE) {
> ret = migration_completion_precopy(s);
> - } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
> + } else if (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE ||
> + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
Exactly. We need to be prepared that src sending too fast so when device
loading on dest we finished.
> migration_completion_postcopy(s);
> } else {
> ret = -1;
> @@ -3311,8 +3323,8 @@ static MigThrError
> migration_detect_error(MigrationState *s)
> return postcopy_pause(s);
> } else {
> /*
> - * For precopy (or postcopy with error outside IO), we fail
> - * with no time.
> + * For precopy (or postcopy with error outside IO, or before dest
> + * starts), we fail with no time.
> */
> migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
> trace_migration_thread_file_err();
> @@ -3447,7 +3459,8 @@ static MigIterateState
> migration_iteration_run(MigrationState *s)
> {
> uint64_t must_precopy, can_postcopy, pending_size;
> Error *local_err = NULL;
> - bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
> + bool in_postcopy = (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE ||
> + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
> bool can_switchover = migration_can_switchover(s);
> bool complete_ready;
>
> diff --git a/migration/savevm.h b/migration/savevm.h
> index 2d5e9c7166..c4de0325eb 100644
> --- a/migration/savevm.h
> +++ b/migration/savevm.h
> @@ -29,6 +29,8 @@
> #define QEMU_VM_COMMAND 0x08
> #define QEMU_VM_SECTION_FOOTER 0x7e
>
> +#define QEMU_VM_PING_PACKAGED_LOADED 0x42
Only curious how you picked it. :)
It's fine, it can also be 5, as we know exactly what we used to use (1-4).
> +
> bool qemu_savevm_state_blocked(Error **errp);
> void qemu_savevm_non_migratable_list(strList **reasons);
> int qemu_savevm_state_prepare(Error **errp);
> diff --git a/migration/trace-events b/migration/trace-events
> index 706db97def..007b5c407e 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -191,6 +191,7 @@ source_return_path_thread_pong(uint32_t val) "0x%x"
> source_return_path_thread_shut(uint32_t val) "0x%x"
> source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
> source_return_path_thread_switchover_acked(void) ""
> +source_return_path_thread_dst_started(void) ""
> migration_thread_low_pending(uint64_t pending) "%" PRIu64
> migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t
> bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 "
> time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 "
> max_size %" PRId64
> process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 2387c21e9c..89a20d858d 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -142,6 +142,10 @@
> # @postcopy-active: like active, but now in postcopy mode.
> # (since 2.5)
> #
> +# @postcopy-device: like postcopy-active, but the destination is still
> +# loading device state and is not running yet. If migration fails
> +# during this state, the source side will resume. (since 10.2)
Is it worthwhile to mention we could jump from postcopy-device to
completed? I also wonder if it happens if libvirt would get confused.
Worth checking with Jiri.
Thanks,
> +#
> # @postcopy-paused: during postcopy but paused. (since 3.0)
> #
> # @postcopy-recover-setup: setup phase for a postcopy recovery
> @@ -173,8 +177,8 @@
> ##
> { 'enum': 'MigrationStatus',
> 'data': [ 'none', 'setup', 'cancelling', 'cancelled',
> - 'active', 'postcopy-active', 'postcopy-paused',
> - 'postcopy-recover-setup',
> + 'active', 'postcopy-device', 'postcopy-active',
> + 'postcopy-paused', 'postcopy-recover-setup',
> 'postcopy-recover', 'completed', 'failed', 'colo',
> 'pre-switchover', 'device', 'wait-unplug' ] }
> ##
> diff --git a/tests/qtest/migration/precopy-tests.c
> b/tests/qtest/migration/precopy-tests.c
> index bb38292550..57ca623de5 100644
> --- a/tests/qtest/migration/precopy-tests.c
> +++ b/tests/qtest/migration/precopy-tests.c
> @@ -1316,13 +1316,14 @@ void migration_test_add_precopy(MigrationTestEnv *env)
> }
>
> /* ensure new status don't go unnoticed */
> - assert(MIGRATION_STATUS__MAX == 15);
> + assert(MIGRATION_STATUS__MAX == 16);
>
> for (int i = MIGRATION_STATUS_NONE; i < MIGRATION_STATUS__MAX; i++) {
> switch (i) {
> case MIGRATION_STATUS_DEVICE: /* happens too fast */
> case MIGRATION_STATUS_WAIT_UNPLUG: /* no support in tests */
> case MIGRATION_STATUS_COLO: /* no support in tests */
> + case MIGRATION_STATUS_POSTCOPY_DEVICE: /* postcopy can't be
> cancelled */
> case MIGRATION_STATUS_POSTCOPY_ACTIVE: /* postcopy can't be
> cancelled */
> case MIGRATION_STATUS_POSTCOPY_PAUSED:
> case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
> --
> 2.51.0
>
--
Peter Xu