qemu_file_get_return_path() can not fail (See also commit 3f9d6e77b) so always open the return path socket on the source. This allows us to reuse the return path in other parts like colo. Also take the proper locks in colo while we're at it.
This fixes a crash due to a race between migrate_cancel() and the colo thread shutting down. Before, the rp socket is opened just before the rp thread is started and closed after it terminates and postcopy fast path is closed. Now it's the same, only the rp socket stays open until migration_cleanup(). If there is a rp thread, the rp socket is shut down at the end of migration, but the file is still open. COLO is not compatible with postcopy, so this is safe as no one else uses the rp socket after this point. Signed-off-by: Lukas Straub <[email protected]> --- migration/colo.c | 29 ++++++------------- migration/migration.c | 77 ++++++++++++++++++++++++--------------------------- 2 files changed, 44 insertions(+), 62 deletions(-) diff --git a/migration/colo.c b/migration/colo.c index ce02c71d8857d470be434bdf3a9cacad3baab0d5..0dee33f1145b81af276cf318e2984deae9ae0527 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -173,11 +173,13 @@ static void primary_vm_do_failover(void) * The s->rp_state.from_dst_file and s->to_dst_file may use the * same fd, but we still shutdown the fd for twice, it is harmless. */ - if (s->to_dst_file) { - qemu_file_shutdown(s->to_dst_file); - } - if (s->rp_state.from_dst_file) { - qemu_file_shutdown(s->rp_state.from_dst_file); + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { + if (s->to_dst_file) { + qemu_file_shutdown(s->to_dst_file); + } + if (s->rp_state.from_dst_file) { + qemu_file_shutdown(s->rp_state.from_dst_file); + } } old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, @@ -537,6 +539,7 @@ static void colo_process_checkpoint(MigrationState *s) Error *local_err = NULL; int ret; + assert(s->rp_state.from_dst_file); if (get_colo_mode() != COLO_MODE_PRIMARY) { error_report("COLO mode must be COLO_MODE_PRIMARY"); return; @@ -544,12 +547,6 @@ static void colo_process_checkpoint(MigrationState *s) failover_init_state(); - s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file); - if (!s->rp_state.from_dst_file) { - error_report("Open QEMUFile from_dst_file failed"); - goto out; - } - packets_compare_notifier.notify = colo_compare_notify_checkpoint; colo_compare_register_notifier(&packets_compare_notifier); @@ -634,16 +631,6 @@ out: colo_compare_unregister_notifier(&packets_compare_notifier); timer_free(s->colo_delay_timer); qemu_event_destroy(&s->colo_checkpoint_event); - - /* - * Must be called after failover BH is completed, - * Or the failover BH may shutdown the wrong fd that - * re-used by other threads after we release here. - */ - if (s->rp_state.from_dst_file) { - qemu_fclose(s->rp_state.from_dst_file); - s->rp_state.from_dst_file = NULL; - } } void migrate_start_colo_process(MigrationState *s) diff --git a/migration/migration.c b/migration/migration.c index f36d42ef657bdf26d78ca642d77a9b76e1c0c174..8caa56940beef12de33a799695cf486c8fbd471c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -97,7 +97,7 @@ static GSList *migration_blockers[MIG_MODE__MAX]; static bool migration_object_check(MigrationState *ms, Error **errp); static bool migration_switchover_start(MigrationState *s, Error **errp); -static bool close_return_path_on_source(MigrationState *s); +static bool stop_return_path_thread_on_source(MigrationState *s); static void migration_completion_end(MigrationState *s); static void migration_downtime_start(MigrationState *s) @@ -1278,7 +1278,7 @@ static void migration_cleanup(MigrationState *s) cpr_state_close(); cpr_transfer_source_destroy(s); - close_return_path_on_source(s); + stop_return_path_thread_on_source(s); if (s->migration_thread_running) { bql_unlock(); @@ -1307,6 +1307,14 @@ static void migration_cleanup(MigrationState *s) qemu_fclose(tmp); } + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { + tmp = s->rp_state.from_dst_file; + s->rp_state.from_dst_file = NULL; + } + if (tmp) { + qemu_fclose(tmp); + } + assert(!migration_is_active()); if (s->state == MIGRATION_STATUS_CANCELLING) { @@ -2187,38 +2195,6 @@ static bool migrate_handle_rp_resume_ack(MigrationState *s, return true; } -/* - * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if - * existed) in a safe way. - */ -static void migration_release_dst_files(MigrationState *ms) -{ - QEMUFile *file = NULL; - - WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { - /* - * Reset the from_dst_file pointer first before releasing it, as we - * can't block within lock section - */ - file = ms->rp_state.from_dst_file; - ms->rp_state.from_dst_file = NULL; - } - - /* - * Do the same to postcopy fast path socket too if there is. No - * locking needed because this qemufile should only be managed by - * return path thread. - */ - if (ms->postcopy_qemufile_src) { - migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src); - qemu_file_shutdown(ms->postcopy_qemufile_src); - qemu_fclose(ms->postcopy_qemufile_src); - ms->postcopy_qemufile_src = NULL; - } - - qemu_fclose(file); -} - /* * Handles messages sent on the return path towards the source VM * @@ -2388,9 +2364,9 @@ out: return NULL; } -static void open_return_path_on_source(MigrationState *ms) +static void start_return_path_thread_on_source(MigrationState *ms) { - ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file); + assert(ms->rp_state.from_dst_file); trace_open_return_path_on_source(); @@ -2402,7 +2378,7 @@ static void open_return_path_on_source(MigrationState *ms) } /* Return true if error detected, or false otherwise */ -static bool close_return_path_on_source(MigrationState *ms) +static bool stop_return_path_thread_on_source(MigrationState *ms) { if (!ms->rp_state.rp_thread_created) { return false; @@ -2424,7 +2400,17 @@ static bool close_return_path_on_source(MigrationState *ms) qemu_thread_join(&ms->rp_state.rp_thread); ms->rp_state.rp_thread_created = false; - migration_release_dst_files(ms); + /* + * Close the postcopy fast path socket if there is one. + * No locking needed because this qemufile should only be managed by + * return path thread which we just stopped. + */ + if (ms->postcopy_qemufile_src) { + migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src); + qemu_file_shutdown(ms->postcopy_qemufile_src); + qemu_fclose(ms->postcopy_qemufile_src); + ms->postcopy_qemufile_src = NULL; + } trace_migration_return_path_end_after(); /* Return path will persist the error in MigrationState when quit */ @@ -2787,7 +2773,7 @@ static void migration_completion(MigrationState *s) goto fail; } - if (close_return_path_on_source(s)) { + if (stop_return_path_thread_on_source(s)) { goto fail; } @@ -2941,7 +2927,15 @@ static MigThrError postcopy_pause(MigrationState *s) * path and just wait for the thread to finish. It will be * re-created when we resume. */ - close_return_path_on_source(s); + stop_return_path_thread_on_source(s); + QEMUFile *rp_file; + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { + rp_file = s->rp_state.from_dst_file; + s->rp_state.from_dst_file = NULL; + } + if (rp_file) { + qemu_fclose(rp_file); + } /* * Current channel is possibly broken. Release it. Note that this is @@ -3758,6 +3752,7 @@ void migration_start_outgoing(MigrationState *s) if (!qemu_file_set_blocking(s->to_dst_file, true, &local_err)) { goto fail; } + s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file); /* * Open the return path. For postcopy, it is used exclusively. For @@ -3765,7 +3760,7 @@ void migration_start_outgoing(MigrationState *s) * QEMU uses the return path. */ if (migrate_postcopy_ram() || migrate_return_path()) { - open_return_path_on_source(s); + start_return_path_thread_on_source(s); } /* -- 2.39.5
