On Thu, 26 Feb 2026 10:49:07 -0500 Peter Xu <[email protected]> wrote:
> On Fri, Feb 20, 2026 at 08:51:41PM +0100, Lukas Straub wrote: > > qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b) > > so always open the return path socket on the source. This allows > > us to reuse the return path in other parts like colo. Also take > > the proper locks in colo while we're at it. > > > > This fixes a crash due to a race between migrate_cancel() and > > the colo thread shutting down. > > > > Before, the rp socket is opened just before the rp thread is started > > and closed after it terminates and postcopy fast path is closed. > > Now it's the same, only the rp socket stays open until migration_cleanup(). > > > > If there is a rp thread, the rp socket is shut down at the end of migration, > > but the file is still open. COLO is not compatible with postcopy, so this is > > safe as no one else uses the rp socket after this point. > > > > Signed-off-by: Lukas Straub <[email protected]> > > --- > > migration/colo.c | 29 ++++++------------- > > migration/migration.c | 77 > > ++++++++++++++++++++++++--------------------------- > > 2 files changed, 44 insertions(+), 62 deletions(-) > > > > diff --git a/migration/colo.c b/migration/colo.c > > index > > ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527 > > 100644 > > --- a/migration/colo.c > > +++ b/migration/colo.c > > @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void) > > * The s->rp_state.from_dst_file and s->to_dst_file may use the > > * same fd, but we still shutdown the fd for twice, it is harmless. > > */ > > - if (s->to_dst_file) { > > - qemu_file_shutdown(s->to_dst_file); > > - } > > - if (s->rp_state.from_dst_file) { > > - qemu_file_shutdown(s->rp_state.from_dst_file); > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { > > + if (s->to_dst_file) { > > + qemu_file_shutdown(s->to_dst_file); > > + } > > + if (s->rp_state.from_dst_file) { > > + qemu_file_shutdown(s->rp_state.from_dst_file); > > + } > > Nit: these "start to take mutex for..." changes should belong to a separate > patch. > Will do. > > } > > > > old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, > > @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s) > > Error *local_err = NULL; > > int ret; > > > > + assert(s->rp_state.from_dst_file); > > if (get_colo_mode() != COLO_MODE_PRIMARY) { > > error_report("COLO mode must be COLO_MODE_PRIMARY"); > > return; > > @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s) > > > > failover_init_state(); > > > > - s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file); > > - if (!s->rp_state.from_dst_file) { > > - error_report("Open QEMUFile from_dst_file failed"); > > - goto out; > > - } > > - > > packets_compare_notifier.notify = colo_compare_notify_checkpoint; > > colo_compare_register_notifier(&packets_compare_notifier); > > > > @@ -634,16 +631,6 @@ out: > > colo_compare_unregister_notifier(&packets_compare_notifier); > > timer_free(s->colo_delay_timer); > > qemu_event_destroy(&s->colo_checkpoint_event); > > - > > - /* > > - * Must be called after failover BH is completed, > > - * Or the failover BH may shutdown the wrong fd that > > - * re-used by other threads after we release here. > > - */ > > - if (s->rp_state.from_dst_file) { > > - qemu_fclose(s->rp_state.from_dst_file); > > - s->rp_state.from_dst_file = NULL; > > - } > > } > > > > void migrate_start_colo_process(MigrationState *s) > > diff --git a/migration/migration.c b/migration/migration.c > > index > > f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c > > 100644 > > --- a/migration/migration.c > > +++ b/migration/migration.c > > @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX]; > > > > static bool migration_object_check(MigrationState *ms, Error **errp); > > static bool migration_switchover_start(MigrationState *s, Error **errp); > > -static bool close_return_path_on_source(MigrationState *s); > > +static bool stop_return_path_thread_on_source(MigrationState *s); > > static void migration_completion_end(MigrationState *s); > > > > static void migration_downtime_start(MigrationState *s) > > @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s) > > cpr_state_close(); > > cpr_transfer_source_destroy(s); > > > > - close_return_path_on_source(s); > > + stop_return_path_thread_on_source(s); > > > > if (s->migration_thread_running) { > > bql_unlock(); > > @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s) > > qemu_fclose(tmp); > > } > > > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { > > + tmp = s->rp_state.from_dst_file; > > + s->rp_state.from_dst_file = NULL; > > + } > > + if (tmp) { > > + qemu_fclose(tmp); > > + } > > + > > assert(!migration_is_active()); > > > > if (s->state == MIGRATION_STATUS_CANCELLING) { > > @@ -2187,38 +2195,6 @@ static bool > > migrate_handle_rp_resume_ack(MigrationState *s, > > return true; > > } > > > > -/* > > - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if > > - * existed) in a safe way. > > - */ > > -static void migration_release_dst_files(MigrationState *ms) > > -{ > > - QEMUFile *file = NULL; > > - > > - WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { > > - /* > > - * Reset the from_dst_file pointer first before releasing it, as we > > - * can't block within lock section > > - */ > > - file = ms->rp_state.from_dst_file; > > - ms->rp_state.from_dst_file = NULL; > > - } > > - > > - /* > > - * Do the same to postcopy fast path socket too if there is. No > > - * locking needed because this qemufile should only be managed by > > - * return path thread. > > - */ > > - if (ms->postcopy_qemufile_src) { > > - migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src); > > - qemu_file_shutdown(ms->postcopy_qemufile_src); > > - qemu_fclose(ms->postcopy_qemufile_src); > > - ms->postcopy_qemufile_src = NULL; > > - } > > - > > - qemu_fclose(file); > > -} > > - > > /* > > * Handles messages sent on the return path towards the source VM > > * > > @@ -2388,9 +2364,9 @@ out: > > return NULL; > > } > > > > -static void open_return_path_on_source(MigrationState *ms) > > +static void start_return_path_thread_on_source(MigrationState *ms) > > Changing this seems OK, but I'm totally confused why you deleted > migration_release_dst_files(). That's the helper to properly close the two > possible qemufiles that rp thread uses. Okay, so my reasoning here is: I think that closing ms->postcopy_qemufile_src is very closely related to cleaning up the rp thread so I moved that to stop_return_path_thread_on_source(). Meanwhile I think s->rp_state.from_dst_file is more closely related to s->to_dst_file and in fact I think we can remove s->rp_state.from_dst_file entirely in the future and use s->to_dst_file for send *and* receive just like you would with a normal socket. So I close s->rp_state.from_dst_file right besides s->to_dst_file in this patch. And I do the same open-coded file lock dance like s->to_dst_file already does. > IIUC you can keep it then use it > in either postcopy_pause() or migration_cleanup() directly. I can do that if you wish. Should I keep closing ms->postcopy_qemufile_src inside migration_release_dst_files() or stop_return_path_thread_on_source() ? > > Then you need to remove the chunk [1] below, making the function only do > "stop" but not close. > > > { > > - ms->rp_state.from_dst_file = > > qemu_file_get_return_path(ms->to_dst_file); > > + assert(ms->rp_state.from_dst_file); > > I don't see why this change is a must, to open from_dst_file earlier. Can > we keep it as before, or would you justify it in a separate patch? Well, the issue is that opening the return path file is behind this if: if (migrate_postcopy_ram() || migrate_return_path()) { - open_return_path_on_source(s); + start_return_path_thread_on_source(s); } And I want to always open the return path file, without also starting the return path thread. > > > > > trace_open_return_path_on_source(); > > > > @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState > > *ms) > > } > > > > /* Return true if error detected, or false otherwise */ > > -static bool close_return_path_on_source(MigrationState *ms) > > +static bool stop_return_path_thread_on_source(MigrationState *ms) > > { > > if (!ms->rp_state.rp_thread_created) { > > return false; > > @@ -2424,7 +2400,17 @@ static bool > > close_return_path_on_source(MigrationState *ms) > > > > qemu_thread_join(&ms->rp_state.rp_thread); > > ms->rp_state.rp_thread_created = false; > > - migration_release_dst_files(ms); > > + /* > > + * Close the postcopy fast path socket if there is one. > > + * No locking needed because this qemufile should only be managed by > > + * return path thread which we just stopped. > > + */ > > + if (ms->postcopy_qemufile_src) { > > + migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src); > > + qemu_file_shutdown(ms->postcopy_qemufile_src); > > + qemu_fclose(ms->postcopy_qemufile_src); > > + ms->postcopy_qemufile_src = NULL; > > + } > > [1] > > > trace_migration_return_path_end_after(); > > > > /* Return path will persist the error in MigrationState when quit */ > > @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s) > > goto fail; > > } > > > > - if (close_return_path_on_source(s)) { > > + if (stop_return_path_thread_on_source(s)) { > > goto fail; > > } > > > > @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s) > > * path and just wait for the thread to finish. It will be > > * re-created when we resume. > > */ > > - close_return_path_on_source(s); > > + stop_return_path_thread_on_source(s); > > + QEMUFile *rp_file; > > + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { > > + rp_file = s->rp_state.from_dst_file; > > + s->rp_state.from_dst_file = NULL; > > + } > > + if (rp_file) { > > + qemu_fclose(rp_file); > > + } > > Open-code this is going backwards. Please see if we can reuse > migration_release_dst_files() at least. > > Thanks, > > > > > /* > > * Current channel is possibly broken. Release it. Note that this > > is > > @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s) > > if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) { > > goto fail; > > } > > + s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file); > > > > /* > > * Open the return path. For postcopy, it is used exclusively. For > > @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s) > > * QEMU uses the return path. > > */ > > if (migrate_postcopy_ram() || migrate_return_path()) { > > - open_return_path_on_source(s); > > + start_return_path_thread_on_source(s); > > } > > > > > > > > /* > > > > -- > > 2.39.5 > > >
pgpa1ha6Of5te.pgp
Description: OpenPGP digital signature
