On Fri, Feb 20, 2026 at 08:51:41PM +0100, Lukas Straub wrote:
> qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b)
> so always open the return path socket on the source. This allows
> us to reuse the return path in other parts like colo. Also take
> the proper locks in colo while we're at it.
> 
> This fixes a crash due to a race between migrate_cancel() and
> the colo thread shutting down.
> 
> Before, the rp socket is opened just before the rp thread is started
> and closed after it terminates and postcopy fast path is closed.
> Now it's the same, only the rp socket stays open until migration_cleanup().
> 
> If there is a rp thread, the rp socket is shut down at the end of migration,
> but the file is still open. COLO is not compatible with postcopy, so this is
> safe as no one else uses the rp socket after this point.
> 
> Signed-off-by: Lukas Straub <[email protected]>
> ---
>  migration/colo.c      | 29 ++++++-------------
>  migration/migration.c | 77 
> ++++++++++++++++++++++++---------------------------
>  2 files changed, 44 insertions(+), 62 deletions(-)
> 
> diff --git a/migration/colo.c b/migration/colo.c
> index 
> ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527
>  100644
> --- a/migration/colo.c
> +++ b/migration/colo.c
> @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
>       * The s->rp_state.from_dst_file and s->to_dst_file may use the
>       * same fd, but we still shutdown the fd for twice, it is harmless.
>       */
> -    if (s->to_dst_file) {
> -        qemu_file_shutdown(s->to_dst_file);
> -    }
> -    if (s->rp_state.from_dst_file) {
> -        qemu_file_shutdown(s->rp_state.from_dst_file);
> +    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> +        if (s->to_dst_file) {
> +            qemu_file_shutdown(s->to_dst_file);
> +        }
> +        if (s->rp_state.from_dst_file) {
> +            qemu_file_shutdown(s->rp_state.from_dst_file);
> +        }

Nit: these "start to take mutex for..." changes should belong to a separate
patch.

>      }
>  
>      old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s)
>      Error *local_err = NULL;
>      int ret;
>  
> +    assert(s->rp_state.from_dst_file);
>      if (get_colo_mode() != COLO_MODE_PRIMARY) {
>          error_report("COLO mode must be COLO_MODE_PRIMARY");
>          return;
> @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s)
>  
>      failover_init_state();
>  
> -    s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> -    if (!s->rp_state.from_dst_file) {
> -        error_report("Open QEMUFile from_dst_file failed");
> -        goto out;
> -    }
> -
>      packets_compare_notifier.notify = colo_compare_notify_checkpoint;
>      colo_compare_register_notifier(&packets_compare_notifier);
>  
> @@ -634,16 +631,6 @@ out:
>      colo_compare_unregister_notifier(&packets_compare_notifier);
>      timer_free(s->colo_delay_timer);
>      qemu_event_destroy(&s->colo_checkpoint_event);
> -
> -    /*
> -     * Must be called after failover BH is completed,
> -     * Or the failover BH may shutdown the wrong fd that
> -     * re-used by other threads after we release here.
> -     */
> -    if (s->rp_state.from_dst_file) {
> -        qemu_fclose(s->rp_state.from_dst_file);
> -        s->rp_state.from_dst_file = NULL;
> -    }
>  }
>  
>  void migrate_start_colo_process(MigrationState *s)
> diff --git a/migration/migration.c b/migration/migration.c
> index 
> f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c
>  100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX];
>  
>  static bool migration_object_check(MigrationState *ms, Error **errp);
>  static bool migration_switchover_start(MigrationState *s, Error **errp);
> -static bool close_return_path_on_source(MigrationState *s);
> +static bool stop_return_path_thread_on_source(MigrationState *s);
>  static void migration_completion_end(MigrationState *s);
>  
>  static void migration_downtime_start(MigrationState *s)
> @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s)
>      cpr_state_close();
>      cpr_transfer_source_destroy(s);
>  
> -    close_return_path_on_source(s);
> +    stop_return_path_thread_on_source(s);
>  
>      if (s->migration_thread_running) {
>          bql_unlock();
> @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s)
>          qemu_fclose(tmp);
>      }
>  
> +    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> +        tmp = s->rp_state.from_dst_file;
> +        s->rp_state.from_dst_file = NULL;
> +    }
> +    if (tmp) {
> +        qemu_fclose(tmp);
> +    }
> +
>      assert(!migration_is_active());
>  
>      if (s->state == MIGRATION_STATUS_CANCELLING) {
> @@ -2187,38 +2195,6 @@ static bool 
> migrate_handle_rp_resume_ack(MigrationState *s,
>      return true;
>  }
>  
> -/*
> - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
> - * existed) in a safe way.
> - */
> -static void migration_release_dst_files(MigrationState *ms)
> -{
> -    QEMUFile *file = NULL;
> -
> -    WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
> -        /*
> -         * Reset the from_dst_file pointer first before releasing it, as we
> -         * can't block within lock section
> -         */
> -        file = ms->rp_state.from_dst_file;
> -        ms->rp_state.from_dst_file = NULL;
> -    }
> -
> -    /*
> -     * Do the same to postcopy fast path socket too if there is.  No
> -     * locking needed because this qemufile should only be managed by
> -     * return path thread.
> -     */
> -    if (ms->postcopy_qemufile_src) {
> -        migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> -        qemu_file_shutdown(ms->postcopy_qemufile_src);
> -        qemu_fclose(ms->postcopy_qemufile_src);
> -        ms->postcopy_qemufile_src = NULL;
> -    }
> -
> -    qemu_fclose(file);
> -}
> -
>  /*
>   * Handles messages sent on the return path towards the source VM
>   *
> @@ -2388,9 +2364,9 @@ out:
>      return NULL;
>  }
>  
> -static void open_return_path_on_source(MigrationState *ms)
> +static void start_return_path_thread_on_source(MigrationState *ms)

Changing this seems OK, but I'm totally confused why you deleted
migration_release_dst_files().  That's the helper to properly close the two
possible qemufiles that rp thread uses.  IIUC you can keep it then use it
in either postcopy_pause() or migration_cleanup() directly.

Then you need to remove the chunk [1] below, making the function only do
"stop" but not close.

>  {
> -    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
> +    assert(ms->rp_state.from_dst_file);

I don't see why this change is a must, to open from_dst_file earlier.  Can
we keep it as before, or would you justify it in a separate patch?

>  
>      trace_open_return_path_on_source();
>  
> @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState 
> *ms)
>  }
>  
>  /* Return true if error detected, or false otherwise */
> -static bool close_return_path_on_source(MigrationState *ms)
> +static bool stop_return_path_thread_on_source(MigrationState *ms)
>  {
>      if (!ms->rp_state.rp_thread_created) {
>          return false;
> @@ -2424,7 +2400,17 @@ static bool close_return_path_on_source(MigrationState 
> *ms)
>  
>      qemu_thread_join(&ms->rp_state.rp_thread);
>      ms->rp_state.rp_thread_created = false;
> -    migration_release_dst_files(ms);
> +    /*
> +     * Close the postcopy fast path socket if there is one.
> +     * No locking needed because this qemufile should only be managed by
> +     * return path thread which we just stopped.
> +     */
> +    if (ms->postcopy_qemufile_src) {
> +        migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> +        qemu_file_shutdown(ms->postcopy_qemufile_src);
> +        qemu_fclose(ms->postcopy_qemufile_src);
> +        ms->postcopy_qemufile_src = NULL;
> +    }

[1]

>      trace_migration_return_path_end_after();
>  
>      /* Return path will persist the error in MigrationState when quit */
> @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s)
>          goto fail;
>      }
>  
> -    if (close_return_path_on_source(s)) {
> +    if (stop_return_path_thread_on_source(s)) {
>          goto fail;
>      }
>  
> @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s)
>           * path and just wait for the thread to finish. It will be
>           * re-created when we resume.
>           */
> -        close_return_path_on_source(s);
> +        stop_return_path_thread_on_source(s);
> +        QEMUFile *rp_file;
> +        WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> +            rp_file = s->rp_state.from_dst_file;
> +            s->rp_state.from_dst_file = NULL;
> +        }
> +        if (rp_file) {
> +            qemu_fclose(rp_file);
> +        }

Open-code this is going backwards.  Please see if we can reuse
migration_release_dst_files() at least.

Thanks,

>  
>          /*
>           * Current channel is possibly broken. Release it.  Note that this is
> @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s)
>      if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) {
>          goto fail;
>      }
> +    s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
>  
>      /*
>       * Open the return path. For postcopy, it is used exclusively. For
> @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s)
>       * QEMU uses the return path.
>       */
>      if (migrate_postcopy_ram() || migrate_return_path()) {
> -        open_return_path_on_source(s);
> +        start_return_path_thread_on_source(s);
>      }
>  
>      /*
> 
> -- 
> 2.39.5
> 

-- 
Peter Xu


Reply via email to