On Thu, 26 Feb 2026 10:49:07 -0500
Peter Xu <[email protected]> wrote:

> On Fri, Feb 20, 2026 at 08:51:41PM +0100, Lukas Straub wrote:
> > qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b)
> > so always open the return path socket on the source. This allows
> > us to reuse the return path in other parts like colo. Also take
> > the proper locks in colo while we're at it.
> > 
> > This fixes a crash due to a race between migrate_cancel() and
> > the colo thread shutting down.
> > 
> > Before, the rp socket is opened just before the rp thread is started
> > and closed after it terminates and postcopy fast path is closed.
> > Now it's the same, only the rp socket stays open until migration_cleanup().
> > 
> > If there is a rp thread, the rp socket is shut down at the end of migration,
> > but the file is still open. COLO is not compatible with postcopy, so this is
> > safe as no one else uses the rp socket after this point.
> > 
> > Signed-off-by: Lukas Straub <[email protected]>
> > ---
> >  migration/colo.c      | 29 ++++++-------------
> >  migration/migration.c | 77 
> > ++++++++++++++++++++++++---------------------------
> >  2 files changed, 44 insertions(+), 62 deletions(-)
> > 
> > diff --git a/migration/colo.c b/migration/colo.c
> > index 
> > ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527
> >  100644
> > --- a/migration/colo.c
> > +++ b/migration/colo.c
> > @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void)
> >       * The s->rp_state.from_dst_file and s->to_dst_file may use the
> >       * same fd, but we still shutdown the fd for twice, it is harmless.
> >       */
> > -    if (s->to_dst_file) {
> > -        qemu_file_shutdown(s->to_dst_file);
> > -    }
> > -    if (s->rp_state.from_dst_file) {
> > -        qemu_file_shutdown(s->rp_state.from_dst_file);
> > +    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > +        if (s->to_dst_file) {
> > +            qemu_file_shutdown(s->to_dst_file);
> > +        }
> > +        if (s->rp_state.from_dst_file) {
> > +            qemu_file_shutdown(s->rp_state.from_dst_file);
> > +        }  
> 
> Nit: these "start to take mutex for..." changes should belong to a separate
> patch.
> 

Will do.

> >      }
> >  
> >      old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
> > @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s)
> >      Error *local_err = NULL;
> >      int ret;
> >  
> > +    assert(s->rp_state.from_dst_file);
> >      if (get_colo_mode() != COLO_MODE_PRIMARY) {
> >          error_report("COLO mode must be COLO_MODE_PRIMARY");
> >          return;
> > @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s)
> >  
> >      failover_init_state();
> >  
> > -    s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> > -    if (!s->rp_state.from_dst_file) {
> > -        error_report("Open QEMUFile from_dst_file failed");
> > -        goto out;
> > -    }
> > -
> >      packets_compare_notifier.notify = colo_compare_notify_checkpoint;
> >      colo_compare_register_notifier(&packets_compare_notifier);
> >  
> > @@ -634,16 +631,6 @@ out:
> >      colo_compare_unregister_notifier(&packets_compare_notifier);
> >      timer_free(s->colo_delay_timer);
> >      qemu_event_destroy(&s->colo_checkpoint_event);
> > -
> > -    /*
> > -     * Must be called after failover BH is completed,
> > -     * Or the failover BH may shutdown the wrong fd that
> > -     * re-used by other threads after we release here.
> > -     */
> > -    if (s->rp_state.from_dst_file) {
> > -        qemu_fclose(s->rp_state.from_dst_file);
> > -        s->rp_state.from_dst_file = NULL;
> > -    }
> >  }
> >  
> >  void migrate_start_colo_process(MigrationState *s)
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 
> > f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c
> >  100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX];
> >  
> >  static bool migration_object_check(MigrationState *ms, Error **errp);
> >  static bool migration_switchover_start(MigrationState *s, Error **errp);
> > -static bool close_return_path_on_source(MigrationState *s);
> > +static bool stop_return_path_thread_on_source(MigrationState *s);
> >  static void migration_completion_end(MigrationState *s);
> >  
> >  static void migration_downtime_start(MigrationState *s)
> > @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s)
> >      cpr_state_close();
> >      cpr_transfer_source_destroy(s);
> >  
> > -    close_return_path_on_source(s);
> > +    stop_return_path_thread_on_source(s);
> >  
> >      if (s->migration_thread_running) {
> >          bql_unlock();
> > @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s)
> >          qemu_fclose(tmp);
> >      }
> >  
> > +    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > +        tmp = s->rp_state.from_dst_file;
> > +        s->rp_state.from_dst_file = NULL;
> > +    }
> > +    if (tmp) {
> > +        qemu_fclose(tmp);
> > +    }
> > +
> >      assert(!migration_is_active());
> >  
> >      if (s->state == MIGRATION_STATUS_CANCELLING) {
> > @@ -2187,38 +2195,6 @@ static bool 
> > migrate_handle_rp_resume_ack(MigrationState *s,
> >      return true;
> >  }
> >  
> > -/*
> > - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
> > - * existed) in a safe way.
> > - */
> > -static void migration_release_dst_files(MigrationState *ms)
> > -{
> > -    QEMUFile *file = NULL;
> > -
> > -    WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
> > -        /*
> > -         * Reset the from_dst_file pointer first before releasing it, as we
> > -         * can't block within lock section
> > -         */
> > -        file = ms->rp_state.from_dst_file;
> > -        ms->rp_state.from_dst_file = NULL;
> > -    }
> > -
> > -    /*
> > -     * Do the same to postcopy fast path socket too if there is.  No
> > -     * locking needed because this qemufile should only be managed by
> > -     * return path thread.
> > -     */
> > -    if (ms->postcopy_qemufile_src) {
> > -        migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > -        qemu_file_shutdown(ms->postcopy_qemufile_src);
> > -        qemu_fclose(ms->postcopy_qemufile_src);
> > -        ms->postcopy_qemufile_src = NULL;
> > -    }
> > -
> > -    qemu_fclose(file);
> > -}
> > -
> >  /*
> >   * Handles messages sent on the return path towards the source VM
> >   *
> > @@ -2388,9 +2364,9 @@ out:
> >      return NULL;
> >  }
> >  
> > -static void open_return_path_on_source(MigrationState *ms)
> > +static void start_return_path_thread_on_source(MigrationState *ms)  
> 
> Changing this seems OK, but I'm totally confused why you deleted
> migration_release_dst_files().  That's the helper to properly close the two
> possible qemufiles that rp thread uses.

Okay, so my reasoning here is:
I think that closing ms->postcopy_qemufile_src is very closely
related to cleaning up the rp thread so I moved that to
stop_return_path_thread_on_source().

Meanwhile I think s->rp_state.from_dst_file is more closely related to
s->to_dst_file and in fact I think we can remove s->rp_state.from_dst_file
entirely in the future and use s->to_dst_file for send *and* receive
just like you would with a normal socket.

So I close s->rp_state.from_dst_file right besides s->to_dst_file in
this patch. And I do the same open-coded file lock dance like
s->to_dst_file already does.

> IIUC you can keep it then use it
> in either postcopy_pause() or migration_cleanup() directly.

I can do that if you wish.
Should I keep closing ms->postcopy_qemufile_src inside
migration_release_dst_files() or stop_return_path_thread_on_source() ?

> 
> Then you need to remove the chunk [1] below, making the function only do
> "stop" but not close.
> 
> >  {
> > -    ms->rp_state.from_dst_file = 
> > qemu_file_get_return_path(ms->to_dst_file);
> > +    assert(ms->rp_state.from_dst_file);  
> 
> I don't see why this change is a must, to open from_dst_file earlier.  Can
> we keep it as before, or would you justify it in a separate patch?

Well, the issue is that opening the return path file is behind this if:

 if (migrate_postcopy_ram() || migrate_return_path()) {
-        open_return_path_on_source(s);
+        start_return_path_thread_on_source(s);
 }

And I want to always open the return path file, without also starting
the return path thread.

> 
> >  
> >      trace_open_return_path_on_source();
> >  
> > @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState 
> > *ms)
> >  }
> >  
> >  /* Return true if error detected, or false otherwise */
> > -static bool close_return_path_on_source(MigrationState *ms)
> > +static bool stop_return_path_thread_on_source(MigrationState *ms)
> >  {
> >      if (!ms->rp_state.rp_thread_created) {
> >          return false;
> > @@ -2424,7 +2400,17 @@ static bool 
> > close_return_path_on_source(MigrationState *ms)
> >  
> >      qemu_thread_join(&ms->rp_state.rp_thread);
> >      ms->rp_state.rp_thread_created = false;
> > -    migration_release_dst_files(ms);
> > +    /*
> > +     * Close the postcopy fast path socket if there is one.
> > +     * No locking needed because this qemufile should only be managed by
> > +     * return path thread which we just stopped.
> > +     */
> > +    if (ms->postcopy_qemufile_src) {
> > +        migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
> > +        qemu_file_shutdown(ms->postcopy_qemufile_src);
> > +        qemu_fclose(ms->postcopy_qemufile_src);
> > +        ms->postcopy_qemufile_src = NULL;
> > +    }  
> 
> [1]
> 
> >      trace_migration_return_path_end_after();
> >  
> >      /* Return path will persist the error in MigrationState when quit */
> > @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s)
> >          goto fail;
> >      }
> >  
> > -    if (close_return_path_on_source(s)) {
> > +    if (stop_return_path_thread_on_source(s)) {
> >          goto fail;
> >      }
> >  
> > @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s)
> >           * path and just wait for the thread to finish. It will be
> >           * re-created when we resume.
> >           */
> > -        close_return_path_on_source(s);
> > +        stop_return_path_thread_on_source(s);
> > +        QEMUFile *rp_file;
> > +        WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> > +            rp_file = s->rp_state.from_dst_file;
> > +            s->rp_state.from_dst_file = NULL;
> > +        }
> > +        if (rp_file) {
> > +            qemu_fclose(rp_file);
> > +        }  
> 
> Open-code this is going backwards.  Please see if we can reuse
> migration_release_dst_files() at least.
> 
> Thanks,
> 
> >  
> >          /*
> >           * Current channel is possibly broken. Release it.  Note that this 
> > is
> > @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s)
> >      if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) {
> >          goto fail;
> >      }
> > +    s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
> >  
> >      /*
> >       * Open the return path. For postcopy, it is used exclusively. For
> > @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s)
> >       * QEMU uses the return path.
> >       */
> >      if (migrate_postcopy_ram() || migrate_return_path()) {
> > -        open_return_path_on_source(s);
> > +        start_return_path_thread_on_source(s);
> >      }
> >
> >
> >  
> >      /*
> > 
> > -- 
> > 2.39.5
> >   
> 

Attachment: pgpa1ha6Of5te.pgp
Description: OpenPGP digital signature

Reply via email to