On Mon, 9 Feb 2026 at 18:55, Prasad Pandit <[email protected]> wrote:
> When migration connection is broken, the QEMU and libvirtd(8)
> process on the source side receive TCP connection reset
> notification. QEMU sets the migration status to FAILED and
> proceeds to migration_cleanup(). Meanwhile, Libvirtd(8) sends
> a QMP command to migrate_set_capabilities().
>
> The migration_cleanup() and qmp_migrate_set_capabilities()
> calls race with each other. When the latter is invoked first,
> since the migration is not running (FAILED), migration
> capabilities are reset to false, so during migration_cleanup()
> the QEMU process crashes with assertion failure.
>
> Introduce a new migration status FAILING and use it as an
> interim status when an error occurs. Once migration_cleanup()
> is done, it sets the migration status to FAILED. This helps
> to avoid the above race condition and ensuing failure.
>
> Interim status FAILING is set wherever the execution moves
> towards migration_cleanup():
>   - postcopy_start()
>   - migration_thread()
>   - migration_cleanup()
>   - multifd_send_setup()
>   - bg_migration_thread()
>   - migration_completion()
>   - migration_detect_error()
>   - bg_migration_completion()
>   - multifd_send_error_propagate()
>   - migration_connect_error_propagate()
>
> The migration status finally moves to FAILED and reports an
> appropriate error to the user.
>
> Interim status FAILING is _NOT_ set in the following routines
> because they do not follow the migration_cleanup() path to the
> FAILED state:
>   - cpr_exec_cb()
>   - qemu_savevm_state()
>   - postcopy_listen_thread()
>   - process_incoming_migration_co()
>   - multifd_recv_terminate_threads()
>   - migration_channel_process_incoming()
>
> Signed-off-by: Prasad Pandit <[email protected]>
> ---
>  migration/migration.c                 | 34 +++++++++++++++++----------
>  migration/multifd.c                   |  4 ++--
>  qapi/migration.json                   |  8 ++++---
>  tests/qtest/migration/migration-qmp.c |  3 ++-
>  tests/qtest/migration/precopy-tests.c |  3 ++-
>  5 files changed, 32 insertions(+), 20 deletions(-)
>
> v1:
>  - Rebase patch on the latest upstream version
>  - Add list of functions where interim state FAILING is set
>    and where it is not changed.
>
> 67/67 qtest+qtest-x86_64 - qemu:qtest-x86_64/migration-test                 
> OK             113.97s   84 subtests passed
> 68/68 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test                 
> OK             107.20s   84 subtests passed
>
> v0:
>  - 
> https://lore.kernel.org/qemu-devel/[email protected]/T/#u
>
> diff --git a/migration/migration.c b/migration/migration.c
> index b103a82fc0..16c71e502a 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1016,6 +1016,7 @@ bool migration_is_running(void)
>      case MIGRATION_STATUS_DEVICE:
>      case MIGRATION_STATUS_WAIT_UNPLUG:
>      case MIGRATION_STATUS_CANCELLING:
> +    case MIGRATION_STATUS_FAILING:
>      case MIGRATION_STATUS_COLO:
>          return true;
>      default:
> @@ -1158,6 +1159,7 @@ static void fill_source_migration_info(MigrationInfo 
> *info)
>      case MIGRATION_STATUS_POSTCOPY_PAUSED:
>      case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
>      case MIGRATION_STATUS_POSTCOPY_RECOVER:
> +    case MIGRATION_STATUS_FAILING:
>          /* TODO add some postcopy stats */
>          populate_time_info(info, s);
>          populate_ram_info(info, s);
> @@ -1210,6 +1212,7 @@ static void 
> fill_destination_migration_info(MigrationInfo *info)
>      case MIGRATION_STATUS_POSTCOPY_PAUSED:
>      case MIGRATION_STATUS_POSTCOPY_RECOVER:
>      case MIGRATION_STATUS_FAILED:
> +    case MIGRATION_STATUS_FAILING:
>      case MIGRATION_STATUS_COLO:
>          info->has_status = true;
>          break;
> @@ -1331,6 +1334,9 @@ static void migration_cleanup(MigrationState *s)
>      if (s->state == MIGRATION_STATUS_CANCELLING) {
>          migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
>                            MIGRATION_STATUS_CANCELLED);
> +    } else if (s->state == MIGRATION_STATUS_FAILING) {
> +        migrate_set_state(&s->state, MIGRATION_STATUS_FAILING,
> +                          MIGRATION_STATUS_FAILED);
>      }
>
>      type = migration_has_failed(s) ? MIG_EVENT_PRECOPY_FAILED :
> @@ -1383,7 +1389,7 @@ void migration_connect_error_propagate(MigrationState 
> *s, Error *error)
>
>      switch (current) {
>      case MIGRATION_STATUS_SETUP:
> -        next = MIGRATION_STATUS_FAILED;
> +        next = MIGRATION_STATUS_FAILING;
>          break;
>
>      case MIGRATION_STATUS_POSTCOPY_PAUSED:
> @@ -1397,9 +1403,10 @@ void migration_connect_error_propagate(MigrationState 
> *s, Error *error)
>          break;
>
>      case MIGRATION_STATUS_CANCELLING:
> +    case MIGRATION_STATUS_FAILING:
>          /*
> -         * Don't move out of CANCELLING, the only valid transition is to
> -         * CANCELLED, at migration_cleanup().
> +         * Keep the current state, next transition is to be done
> +         * in migration_cleanup().
>           */
>          break;
>
> @@ -1547,6 +1554,7 @@ bool migration_has_failed(MigrationState *s)
>  {
>      return (s->state == MIGRATION_STATUS_CANCELLING ||
>              s->state == MIGRATION_STATUS_CANCELLED ||
> +            s->state == MIGRATION_STATUS_FAILING ||
>              s->state == MIGRATION_STATUS_FAILED);
>  }
>
> @@ -2472,7 +2480,7 @@ static int postcopy_start(MigrationState *ms, Error 
> **errp)
>          if (postcopy_preempt_establish_channel(ms)) {
>              if (ms->state != MIGRATION_STATUS_CANCELLING) {
>                  migrate_set_state(&ms->state, ms->state,
> -                                  MIGRATION_STATUS_FAILED);
> +                                  MIGRATION_STATUS_FAILING);
>              }
>              error_setg(errp, "%s: Failed to establish preempt channel",
>                         __func__);
> @@ -2635,7 +2643,7 @@ fail_closefb:
>      qemu_fclose(fb);
>  fail:
>      if (ms->state != MIGRATION_STATUS_CANCELLING) {
> -        migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
> +        migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILING);
>      }
>      migration_block_activate(NULL);
>      migration_call_notifiers(MIG_EVENT_PRECOPY_FAILED, NULL);
> @@ -2828,7 +2836,7 @@ fail:
>      }
>
>      if (s->state != MIGRATION_STATUS_CANCELLING) {
> -        migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
> +        migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILING);
>      }
>  }
>
> @@ -2865,7 +2873,7 @@ static void bg_migration_completion(MigrationState *s)
>
>  fail:
>      migrate_set_state(&s->state, current_active_state,
> -                      MIGRATION_STATUS_FAILED);
> +                      MIGRATION_STATUS_FAILING);
>  }
>
>  typedef enum MigThrError {
> @@ -3066,7 +3074,7 @@ static MigThrError 
> migration_detect_error(MigrationState *s)
>           * For precopy (or postcopy with error outside IO, or before dest
>           * starts), we fail with no time.
>           */
> -        migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
> +        migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILING);
>          trace_migration_thread_file_err();
>
>          /* Time to stop the migration, now. */
> @@ -3297,7 +3305,7 @@ static void migration_iteration_finish(MigrationState 
> *s)
>          migrate_start_colo_process(s);
>          s->vm_old_state = RUN_STATE_RUNNING;
>          /* Fallthrough */
> -    case MIGRATION_STATUS_FAILED:
> +    case MIGRATION_STATUS_FAILING:
>      case MIGRATION_STATUS_CANCELLED:
>      case MIGRATION_STATUS_CANCELLING:
>          if (!migration_block_activate(&local_err)) {
> @@ -3356,7 +3364,7 @@ static void 
> bg_migration_iteration_finish(MigrationState *s)
>      switch (s->state) {
>      case MIGRATION_STATUS_COMPLETED:
>      case MIGRATION_STATUS_ACTIVE:
> -    case MIGRATION_STATUS_FAILED:
> +    case MIGRATION_STATUS_FAILING:
>      case MIGRATION_STATUS_CANCELLED:
>      case MIGRATION_STATUS_CANCELLING:
>          break;
> @@ -3541,7 +3549,7 @@ static void *migration_thread(void *opaque)
>      if (ret) {
>          migrate_error_propagate(s, local_err);
>          migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
> -                          MIGRATION_STATUS_FAILED);
> +                          MIGRATION_STATUS_FAILING);
>          goto out;
>      }
>
> @@ -3664,7 +3672,7 @@ static void *bg_migration_thread(void *opaque)
>      if (ret) {
>          migrate_error_propagate(s, local_err);
>          migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
> -                          MIGRATION_STATUS_FAILED);
> +                          MIGRATION_STATUS_FAILING);
>          goto fail_setup;
>      }
>
> @@ -3727,7 +3735,7 @@ static void *bg_migration_thread(void *opaque)
>  fail:
>      if (early_fail) {
>          migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
> -                MIGRATION_STATUS_FAILED);
> +                MIGRATION_STATUS_FAILING);
>          bql_unlock();
>      }
>
> diff --git a/migration/multifd.c b/migration/multifd.c
> index ad6261688f..178c6b3350 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -431,7 +431,7 @@ static void multifd_send_error_propagate(Error *err)
>              s->state == MIGRATION_STATUS_DEVICE ||
>              s->state == MIGRATION_STATUS_ACTIVE) {
>              migrate_set_state(&s->state, s->state,
> -                              MIGRATION_STATUS_FAILED);
> +                              MIGRATION_STATUS_FAILING);
>          }
>      }
>  }
> @@ -986,7 +986,7 @@ bool multifd_send_setup(void)
>
>  err:
>      migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
> -                      MIGRATION_STATUS_FAILED);
> +                      MIGRATION_STATUS_FAILING);
>      return false;
>  }
>
> diff --git a/qapi/migration.json b/qapi/migration.json
> index f925e5541b..903c1d9618 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -158,7 +158,9 @@
>  #
>  # @completed: migration is finished.
>  #
> -# @failed: some error occurred during migration process.
> +# @failing: error occurred during migration, clean-up underway.
> +#
> +# @failed: error occurred during migration, clean-up done.
>  #
>  # @colo: VM is in the process of fault tolerance, VM can not get into
>  #     this state unless colo capability is enabled for migration.
> @@ -181,8 +183,8 @@
>    'data': [ 'none', 'setup', 'cancelling', 'cancelled',
>              'active', 'postcopy-device', 'postcopy-active',
>              'postcopy-paused', 'postcopy-recover-setup',
> -            'postcopy-recover', 'completed', 'failed', 'colo',
> -            'pre-switchover', 'device', 'wait-unplug' ] }
> +            'postcopy-recover', 'completed', 'failing', 'failed',
> +            'colo', 'pre-switchover', 'device', 'wait-unplug' ] }
>
>  ##
>  # @VfioStats:
> diff --git a/tests/qtest/migration/migration-qmp.c 
> b/tests/qtest/migration/migration-qmp.c
> index 5c46ceb3e6..8279504db1 100644
> --- a/tests/qtest/migration/migration-qmp.c
> +++ b/tests/qtest/migration/migration-qmp.c
> @@ -241,7 +241,8 @@ void wait_for_migration_fail(QTestState *from, bool 
> allow_active)
>      do {
>          status = migrate_query_status(from);
>          bool result = !strcmp(status, "setup") || !strcmp(status, "failed") 
> ||
> -            (allow_active && !strcmp(status, "active"));
> +            (allow_active && !strcmp(status, "active")) ||
> +            !strcmp(status, "failing");
>          if (!result) {
>              fprintf(stderr, "%s: unexpected status status=%s 
> allow_active=%d\n",
>                      __func__, status, allow_active);
> diff --git a/tests/qtest/migration/precopy-tests.c 
> b/tests/qtest/migration/precopy-tests.c
> index a5423ca33c..f17dc5176d 100644
> --- a/tests/qtest/migration/precopy-tests.c
> +++ b/tests/qtest/migration/precopy-tests.c
> @@ -1247,7 +1247,7 @@ void migration_test_add_precopy(MigrationTestEnv *env)
>      }
>
>      /* ensure new status don't go unnoticed */
> -    assert(MIGRATION_STATUS__MAX == 16);
> +    assert(MIGRATION_STATUS__MAX == 17);
>
>      for (int i = MIGRATION_STATUS_NONE; i < MIGRATION_STATUS__MAX; i++) {
>          switch (i) {
> @@ -1259,6 +1259,7 @@ void migration_test_add_precopy(MigrationTestEnv *env)
>          case MIGRATION_STATUS_POSTCOPY_PAUSED:
>          case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
>          case MIGRATION_STATUS_POSTCOPY_RECOVER:
> +        case MIGRATION_STATUS_FAILING:
>              continue;
>          default:
>              migration_test_add_suffix("/migration/cancel/src/after/",
> --


Ping..!
---
  - Prasad


Reply via email to