On Fri, Feb 20, 2026 at 08:51:41PM +0100, Lukas Straub wrote:
> qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b)
> so always open the return path socket on the source. This allows
> us to reuse the return path in other parts like colo. Also take
> the proper locks in colo while we're at it.
>
> This fixes a crash due to a race between migrate_cancel() and
> the colo thread shutting down.
>
> Before, the rp socket is opened just before the rp thread is started
> and closed after it terminates and postcopy fast path is closed.
> Now it's the same, only the rp socket stays open until migration_cleanup().
>
> If there is a rp thread, the rp socket is shut down at the end of migration,
> but the file is still open. COLO is not compatible with postcopy, so this is
> safe as no one else uses the rp socket after this point.
>
> Signed-off-by: Lukas Straub <[email protected]>
> ---
> migration/colo.c | 29 ++++++-------------
> migration/migration.c | 77
> ++++++++++++++++++++++++---------------------------
> 2 files changed, 44 insertions(+), 62 deletions(-)
>
> diff --git a/migration/colo.c b/migration/colo.c
> index
> ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527
> 100644
> --- a/migration/colo.c
> +++ b/migration/colo.c
> @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
> * The s->rp_state.from_dst_file and s->to_dst_file may use the
> * same fd, but we still shutdown the fd for twice, it is harmless.
> */
> - if (s->to_dst_file) {
> - qemu_file_shutdown(s->to_dst_file);
> - }
> - if (s->rp_state.from_dst_file) {
> - qemu_file_shutdown(s->rp_state.from_dst_file);
> + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> + if (s->to_dst_file) {
> + qemu_file_shutdown(s->to_dst_file);
> + }
> + if (s->rp_state.from_dst_file) {
> + qemu_file_shutdown(s->rp_state.from_dst_file);
> + }
Nit: these "start to take mutex for..." changes should belong to a separate
patch.
> }
>
> old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s)
> Error *local_err = NULL;
> int ret;
>
> + assert(s->rp_state.from_dst_file);
> if (get_colo_mode() != COLO_MODE_PRIMARY) {
> error_report("COLO mode must be COLO_MODE_PRIMARY");
> return;
> @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s)
>
> failover_init_state();
>
> - s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> - if (!s->rp_state.from_dst_file) {
> - error_report("Open QEMUFile from_dst_file failed");
> - goto out;
> - }
> -
> packets_compare_notifier.notify = colo_compare_notify_checkpoint;
> colo_compare_register_notifier(&packets_compare_notifier);
>
> @@ -634,16 +631,6 @@ out:
> colo_compare_unregister_notifier(&packets_compare_notifier);
> timer_free(s->colo_delay_timer);
> qemu_event_destroy(&s->colo_checkpoint_event);
> -
> - /*
> - * Must be called after failover BH is completed,
> - * Or the failover BH may shutdown the wrong fd that
> - * re-used by other threads after we release here.
> - */
> - if (s->rp_state.from_dst_file) {
> - qemu_fclose(s->rp_state.from_dst_file);
> - s->rp_state.from_dst_file = NULL;
> - }
> }
>
> void migrate_start_colo_process(MigrationState *s)
> diff --git a/migration/migration.c b/migration/migration.c
> index
> f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c
> 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX];
>
> static bool migration_object_check(MigrationState *ms, Error **errp);
> static bool migration_switchover_start(MigrationState *s, Error **errp);
> -static bool close_return_path_on_source(MigrationState *s);
> +static bool stop_return_path_thread_on_source(MigrationState *s);
> static void migration_completion_end(MigrationState *s);
>
> static void migration_downtime_start(MigrationState *s)
> @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s)
> cpr_state_close();
> cpr_transfer_source_destroy(s);
>
> - close_return_path_on_source(s);
> + stop_return_path_thread_on_source(s);
>
> if (s->migration_thread_running) {
> bql_unlock();
> @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s)
> qemu_fclose(tmp);
> }
>
> + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> + tmp = s->rp_state.from_dst_file;
> + s->rp_state.from_dst_file = NULL;
> + }
> + if (tmp) {
> + qemu_fclose(tmp);
> + }
> +
> assert(!migration_is_active());
>
> if (s->state == MIGRATION_STATUS_CANCELLING) {
> @@ -2187,38 +2195,6 @@ static bool
> migrate_handle_rp_resume_ack(MigrationState *s,
> return true;
> }
>
> -/*
> - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
> - * existed) in a safe way.
> - */
> -static void migration_release_dst_files(MigrationState *ms)
> -{
> - QEMUFile *file = NULL;
> -
> - WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
> - /*
> - * Reset the from_dst_file pointer first before releasing it, as we
> - * can't block within lock section
> - */
> - file = ms->rp_state.from_dst_file;
> - ms->rp_state.from_dst_file = NULL;
> - }
> -
> - /*
> - * Do the same to postcopy fast path socket too if there is. No
> - * locking needed because this qemufile should only be managed by
> - * return path thread.
> - */
> - if (ms->postcopy_qemufile_src) {
> - migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> - qemu_file_shutdown(ms->postcopy_qemufile_src);
> - qemu_fclose(ms->postcopy_qemufile_src);
> - ms->postcopy_qemufile_src = NULL;
> - }
> -
> - qemu_fclose(file);
> -}
> -
> /*
> * Handles messages sent on the return path towards the source VM
> *
> @@ -2388,9 +2364,9 @@ out:
> return NULL;
> }
>
> -static void open_return_path_on_source(MigrationState *ms)
> +static void start_return_path_thread_on_source(MigrationState *ms)
Changing this seems OK, but I'm totally confused why you deleted
migration_release_dst_files(). That's the helper to properly close the two
possible qemufiles that rp thread uses. IIUC you can keep it then use it
in either postcopy_pause() or migration_cleanup() directly.
Then you need to remove the chunk [1] below, making the function only do
"stop" but not close.
> {
> - ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
> + assert(ms->rp_state.from_dst_file);
I don't see why this change is a must, to open from_dst_file earlier. Can
we keep it as before, or would you justify it in a separate patch?
>
> trace_open_return_path_on_source();
>
> @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState
> *ms)
> }
>
> /* Return true if error detected, or false otherwise */
> -static bool close_return_path_on_source(MigrationState *ms)
> +static bool stop_return_path_thread_on_source(MigrationState *ms)
> {
> if (!ms->rp_state.rp_thread_created) {
> return false;
> @@ -2424,7 +2400,17 @@ static bool close_return_path_on_source(MigrationState
> *ms)
>
> qemu_thread_join(&ms->rp_state.rp_thread);
> ms->rp_state.rp_thread_created = false;
> - migration_release_dst_files(ms);
> + /*
> + * Close the postcopy fast path socket if there is one.
> + * No locking needed because this qemufile should only be managed by
> + * return path thread which we just stopped.
> + */
> + if (ms->postcopy_qemufile_src) {
> + migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> + qemu_file_shutdown(ms->postcopy_qemufile_src);
> + qemu_fclose(ms->postcopy_qemufile_src);
> + ms->postcopy_qemufile_src = NULL;
> + }
[1]
> trace_migration_return_path_end_after();
>
> /* Return path will persist the error in MigrationState when quit */
> @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s)
> goto fail;
> }
>
> - if (close_return_path_on_source(s)) {
> + if (stop_return_path_thread_on_source(s)) {
> goto fail;
> }
>
> @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s)
> * path and just wait for the thread to finish. It will be
> * re-created when we resume.
> */
> - close_return_path_on_source(s);
> + stop_return_path_thread_on_source(s);
> + QEMUFile *rp_file;
> + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> + rp_file = s->rp_state.from_dst_file;
> + s->rp_state.from_dst_file = NULL;
> + }
> + if (rp_file) {
> + qemu_fclose(rp_file);
> + }
Open-code this is going backwards. Please see if we can reuse
migration_release_dst_files() at least.
Thanks,
>
> /*
> * Current channel is possibly broken. Release it. Note that this is
> @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s)
> if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) {
> goto fail;
> }
> + s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
>
> /*
> * Open the return path. For postcopy, it is used exclusively. For
> @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s)
> * QEMU uses the return path.
> */
> if (migrate_postcopy_ram() || migrate_return_path()) {
> - open_return_path_on_source(s);
> + start_return_path_thread_on_source(s);
> }
>
> /*
>
> --
> 2.39.5
>
--
Peter Xu