On Mon, 9 Feb 2026 at 18:55, Prasad Pandit <[email protected]> wrote: > When migration connection is broken, the QEMU and libvirtd(8) > process on the source side receive TCP connection reset > notification. QEMU sets the migration status to FAILED and > proceeds to migration_cleanup(). Meanwhile, Libvirtd(8) sends > a QMP command to migrate_set_capabilities(). > > The migration_cleanup() and qmp_migrate_set_capabilities() > calls race with each other. When the latter is invoked first, > since the migration is not running (FAILED), migration > capabilities are reset to false, so during migration_cleanup() > the QEMU process crashes with assertion failure. > > Introduce a new migration status FAILING and use it as an > interim status when an error occurs. Once migration_cleanup() > is done, it sets the migration status to FAILED. This helps > to avoid the above race condition and ensuing failure. > > Interim status FAILING is set wherever the execution moves > towards migration_cleanup(): > - postcopy_start() > - migration_thread() > - migration_cleanup() > - multifd_send_setup() > - bg_migration_thread() > - migration_completion() > - migration_detect_error() > - bg_migration_completion() > - multifd_send_error_propagate() > - migration_connect_error_propagate() > > The migration status finally moves to FAILED and reports an > appropriate error to the user. > > Interim status FAILING is _NOT_ set in the following routines > because they do not follow the migration_cleanup() path to the > FAILED state: > - cpr_exec_cb() > - qemu_savevm_state() > - postcopy_listen_thread() > - process_incoming_migration_co() > - multifd_recv_terminate_threads() > - migration_channel_process_incoming() > > Signed-off-by: Prasad Pandit <[email protected]> > --- > migration/migration.c | 34 +++++++++++++++++---------- > migration/multifd.c | 4 ++-- > qapi/migration.json | 8 ++++--- > tests/qtest/migration/migration-qmp.c | 3 ++- > tests/qtest/migration/precopy-tests.c | 3 ++- > 5 files changed, 32 insertions(+), 20 deletions(-) > > v1: > - Rebase patch on the latest upstream version > - Add list of functions where interim state FAILING is set > and where it is not changed. > > 67/67 qtest+qtest-x86_64 - qemu:qtest-x86_64/migration-test > OK 113.97s 84 subtests passed > 68/68 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test > OK 107.20s 84 subtests passed > > v0: > - > https://lore.kernel.org/qemu-devel/[email protected]/T/#u > > diff --git a/migration/migration.c b/migration/migration.c > index b103a82fc0..16c71e502a 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -1016,6 +1016,7 @@ bool migration_is_running(void) > case MIGRATION_STATUS_DEVICE: > case MIGRATION_STATUS_WAIT_UNPLUG: > case MIGRATION_STATUS_CANCELLING: > + case MIGRATION_STATUS_FAILING: > case MIGRATION_STATUS_COLO: > return true; > default: > @@ -1158,6 +1159,7 @@ static void fill_source_migration_info(MigrationInfo > *info) > case MIGRATION_STATUS_POSTCOPY_PAUSED: > case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: > case MIGRATION_STATUS_POSTCOPY_RECOVER: > + case MIGRATION_STATUS_FAILING: > /* TODO add some postcopy stats */ > populate_time_info(info, s); > populate_ram_info(info, s); > @@ -1210,6 +1212,7 @@ static void > fill_destination_migration_info(MigrationInfo *info) > case MIGRATION_STATUS_POSTCOPY_PAUSED: > case MIGRATION_STATUS_POSTCOPY_RECOVER: > case MIGRATION_STATUS_FAILED: > + case MIGRATION_STATUS_FAILING: > case MIGRATION_STATUS_COLO: > info->has_status = true; > break; > @@ -1331,6 +1334,9 @@ static void migration_cleanup(MigrationState *s) > if (s->state == MIGRATION_STATUS_CANCELLING) { > migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING, > MIGRATION_STATUS_CANCELLED); > + } else if (s->state == MIGRATION_STATUS_FAILING) { > + migrate_set_state(&s->state, MIGRATION_STATUS_FAILING, > + MIGRATION_STATUS_FAILED); > } > > type = migration_has_failed(s) ? MIG_EVENT_PRECOPY_FAILED : > @@ -1383,7 +1389,7 @@ void migration_connect_error_propagate(MigrationState > *s, Error *error) > > switch (current) { > case MIGRATION_STATUS_SETUP: > - next = MIGRATION_STATUS_FAILED; > + next = MIGRATION_STATUS_FAILING; > break; > > case MIGRATION_STATUS_POSTCOPY_PAUSED: > @@ -1397,9 +1403,10 @@ void migration_connect_error_propagate(MigrationState > *s, Error *error) > break; > > case MIGRATION_STATUS_CANCELLING: > + case MIGRATION_STATUS_FAILING: > /* > - * Don't move out of CANCELLING, the only valid transition is to > - * CANCELLED, at migration_cleanup(). > + * Keep the current state, next transition is to be done > + * in migration_cleanup(). > */ > break; > > @@ -1547,6 +1554,7 @@ bool migration_has_failed(MigrationState *s) > { > return (s->state == MIGRATION_STATUS_CANCELLING || > s->state == MIGRATION_STATUS_CANCELLED || > + s->state == MIGRATION_STATUS_FAILING || > s->state == MIGRATION_STATUS_FAILED); > } > > @@ -2472,7 +2480,7 @@ static int postcopy_start(MigrationState *ms, Error > **errp) > if (postcopy_preempt_establish_channel(ms)) { > if (ms->state != MIGRATION_STATUS_CANCELLING) { > migrate_set_state(&ms->state, ms->state, > - MIGRATION_STATUS_FAILED); > + MIGRATION_STATUS_FAILING); > } > error_setg(errp, "%s: Failed to establish preempt channel", > __func__); > @@ -2635,7 +2643,7 @@ fail_closefb: > qemu_fclose(fb); > fail: > if (ms->state != MIGRATION_STATUS_CANCELLING) { > - migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED); > + migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILING); > } > migration_block_activate(NULL); > migration_call_notifiers(MIG_EVENT_PRECOPY_FAILED, NULL); > @@ -2828,7 +2836,7 @@ fail: > } > > if (s->state != MIGRATION_STATUS_CANCELLING) { > - migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); > + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILING); > } > } > > @@ -2865,7 +2873,7 @@ static void bg_migration_completion(MigrationState *s) > > fail: > migrate_set_state(&s->state, current_active_state, > - MIGRATION_STATUS_FAILED); > + MIGRATION_STATUS_FAILING); > } > > typedef enum MigThrError { > @@ -3066,7 +3074,7 @@ static MigThrError > migration_detect_error(MigrationState *s) > * For precopy (or postcopy with error outside IO, or before dest > * starts), we fail with no time. > */ > - migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); > + migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILING); > trace_migration_thread_file_err(); > > /* Time to stop the migration, now. */ > @@ -3297,7 +3305,7 @@ static void migration_iteration_finish(MigrationState > *s) > migrate_start_colo_process(s); > s->vm_old_state = RUN_STATE_RUNNING; > /* Fallthrough */ > - case MIGRATION_STATUS_FAILED: > + case MIGRATION_STATUS_FAILING: > case MIGRATION_STATUS_CANCELLED: > case MIGRATION_STATUS_CANCELLING: > if (!migration_block_activate(&local_err)) { > @@ -3356,7 +3364,7 @@ static void > bg_migration_iteration_finish(MigrationState *s) > switch (s->state) { > case MIGRATION_STATUS_COMPLETED: > case MIGRATION_STATUS_ACTIVE: > - case MIGRATION_STATUS_FAILED: > + case MIGRATION_STATUS_FAILING: > case MIGRATION_STATUS_CANCELLED: > case MIGRATION_STATUS_CANCELLING: > break; > @@ -3541,7 +3549,7 @@ static void *migration_thread(void *opaque) > if (ret) { > migrate_error_propagate(s, local_err); > migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > - MIGRATION_STATUS_FAILED); > + MIGRATION_STATUS_FAILING); > goto out; > } > > @@ -3664,7 +3672,7 @@ static void *bg_migration_thread(void *opaque) > if (ret) { > migrate_error_propagate(s, local_err); > migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > - MIGRATION_STATUS_FAILED); > + MIGRATION_STATUS_FAILING); > goto fail_setup; > } > > @@ -3727,7 +3735,7 @@ static void *bg_migration_thread(void *opaque) > fail: > if (early_fail) { > migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > - MIGRATION_STATUS_FAILED); > + MIGRATION_STATUS_FAILING); > bql_unlock(); > } > > diff --git a/migration/multifd.c b/migration/multifd.c > index ad6261688f..178c6b3350 100644 > --- a/migration/multifd.c > +++ b/migration/multifd.c > @@ -431,7 +431,7 @@ static void multifd_send_error_propagate(Error *err) > s->state == MIGRATION_STATUS_DEVICE || > s->state == MIGRATION_STATUS_ACTIVE) { > migrate_set_state(&s->state, s->state, > - MIGRATION_STATUS_FAILED); > + MIGRATION_STATUS_FAILING); > } > } > } > @@ -986,7 +986,7 @@ bool multifd_send_setup(void) > > err: > migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, > - MIGRATION_STATUS_FAILED); > + MIGRATION_STATUS_FAILING); > return false; > } > > diff --git a/qapi/migration.json b/qapi/migration.json > index f925e5541b..903c1d9618 100644 > --- a/qapi/migration.json > +++ b/qapi/migration.json > @@ -158,7 +158,9 @@ > # > # @completed: migration is finished. > # > -# @failed: some error occurred during migration process. > +# @failing: error occurred during migration, clean-up underway. > +# > +# @failed: error occurred during migration, clean-up done. > # > # @colo: VM is in the process of fault tolerance, VM can not get into > # this state unless colo capability is enabled for migration. > @@ -181,8 +183,8 @@ > 'data': [ 'none', 'setup', 'cancelling', 'cancelled', > 'active', 'postcopy-device', 'postcopy-active', > 'postcopy-paused', 'postcopy-recover-setup', > - 'postcopy-recover', 'completed', 'failed', 'colo', > - 'pre-switchover', 'device', 'wait-unplug' ] } > + 'postcopy-recover', 'completed', 'failing', 'failed', > + 'colo', 'pre-switchover', 'device', 'wait-unplug' ] } > > ## > # @VfioStats: > diff --git a/tests/qtest/migration/migration-qmp.c > b/tests/qtest/migration/migration-qmp.c > index 5c46ceb3e6..8279504db1 100644 > --- a/tests/qtest/migration/migration-qmp.c > +++ b/tests/qtest/migration/migration-qmp.c > @@ -241,7 +241,8 @@ void wait_for_migration_fail(QTestState *from, bool > allow_active) > do { > status = migrate_query_status(from); > bool result = !strcmp(status, "setup") || !strcmp(status, "failed") > || > - (allow_active && !strcmp(status, "active")); > + (allow_active && !strcmp(status, "active")) || > + !strcmp(status, "failing"); > if (!result) { > fprintf(stderr, "%s: unexpected status status=%s > allow_active=%d\n", > __func__, status, allow_active); > diff --git a/tests/qtest/migration/precopy-tests.c > b/tests/qtest/migration/precopy-tests.c > index a5423ca33c..f17dc5176d 100644 > --- a/tests/qtest/migration/precopy-tests.c > +++ b/tests/qtest/migration/precopy-tests.c > @@ -1247,7 +1247,7 @@ void migration_test_add_precopy(MigrationTestEnv *env) > } > > /* ensure new status don't go unnoticed */ > - assert(MIGRATION_STATUS__MAX == 16); > + assert(MIGRATION_STATUS__MAX == 17); > > for (int i = MIGRATION_STATUS_NONE; i < MIGRATION_STATUS__MAX; i++) { > switch (i) { > @@ -1259,6 +1259,7 @@ void migration_test_add_precopy(MigrationTestEnv *env) > case MIGRATION_STATUS_POSTCOPY_PAUSED: > case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: > case MIGRATION_STATUS_POSTCOPY_RECOVER: > + case MIGRATION_STATUS_FAILING: > continue; > default: > migration_test_add_suffix("/migration/cancel/src/after/", > --
Ping..! --- - Prasad
