On 2/20/2024 2:12 AM, Peter Xu wrote: > On Thu, Feb 08, 2024 at 10:54:02AM -0800, Steve Sistare wrote: >> Check the status returned by migration notifiers and report errors. >> If notifiers fail, call the notifiers again so they can clean up. >> None of the notifiers return an error status at this time. >> >> Signed-off-by: Steve Sistare <steven.sist...@oracle.com> >> --- >> include/migration/misc.h | 3 ++- >> migration/migration.c | 40 +++++++++++++++++++++++++++++----------- >> 2 files changed, 31 insertions(+), 12 deletions(-) >> >> diff --git a/include/migration/misc.h b/include/migration/misc.h >> index 0ea1902..6dc234b 100644 >> --- a/include/migration/misc.h >> +++ b/include/migration/misc.h >> @@ -82,7 +82,8 @@ void migration_add_notifier(NotifierWithReturn *notify, >> void migration_add_notifier_mode(NotifierWithReturn *notify, >> MigrationNotifyFunc func, MigMode mode); >> void migration_remove_notifier(NotifierWithReturn *notify); >> -void migration_call_notifiers(MigrationState *s, MigrationEventType type); >> +int migration_call_notifiers(MigrationState *s, MigrationEventType type, >> + Error **errp); >> bool migration_in_setup(MigrationState *); >> bool migration_has_finished(MigrationState *); >> bool migration_has_failed(MigrationState *); >> diff --git a/migration/migration.c b/migration/migration.c >> index 01d8867..d1fce9e 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -1318,6 +1318,8 @@ void migrate_set_state(int *state, int old_state, int >> new_state) >> >> static void migrate_fd_cleanup(MigrationState *s) >> { >> + Error *local_err = NULL; >> + >> g_free(s->hostname); >> s->hostname = NULL; >> json_writer_free(s->vmdesc); >> @@ -1362,13 +1364,23 @@ static void migrate_fd_cleanup(MigrationState *s) >> MIGRATION_STATUS_CANCELLED); >> } >> >> + if (!migration_has_failed(s) && >> + migration_call_notifiers(s, MIG_EVENT_PRECOPY_DONE, &local_err)) { >> + >> + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); >> + migrate_set_error(s, local_err); >> + error_free(local_err); >> + } >> + >> if (s->error) { >> /* It is used on info migrate. We can't free it */ >> error_report_err(error_copy(s->error)); >> } >> - migration_call_notifiers(s, s->state == MIGRATION_STATUS_COMPLETED ? >> - MIG_EVENT_PRECOPY_DONE : >> - MIG_EVENT_PRECOPY_FAILED); >> + >> + if (migration_has_failed(s)) { >> + migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL); >> + } > > AFAIU, the whole point of such split is, allowing DONE notifies to fail too > and then if that happens we can invoke FAIL notifiers again.
Correct. > > Perhaps we can avoid that complexity, but rather document only SETUP > notifiers can fail? > > The problem is that failing a notifier at this stage (if migration already > finished) can already be too late; dest QEMU can already have started > running, so no way to roll back. We can document that, check and assert > for !SETUP cases to make sure error is never hit? Makes sense. I will modify the patch as you suggest. - Steve >> + >> block_cleanup_parameters(); >> yank_unregister_instance(MIGRATION_YANK_INSTANCE); >> } >> @@ -1481,13 +1493,15 @@ void migration_remove_notifier(NotifierWithReturn >> *notify) >> } >> } >> >> -void migration_call_notifiers(MigrationState *s, MigrationEventType type) >> +int migration_call_notifiers(MigrationState *s, MigrationEventType type, >> + Error **errp) >> { >> MigMode mode = s->parameters.mode; >> MigrationEvent e; >> >> e.type = type; >> - notifier_with_return_list_notify(&migration_state_notifiers[mode], &e, >> 0); >> + return >> notifier_with_return_list_notify(&migration_state_notifiers[mode], >> + &e, errp); >> } >> >> bool migration_in_setup(MigrationState *s) >> @@ -2535,7 +2549,9 @@ static int postcopy_start(MigrationState *ms, Error >> **errp) >> * at the transition to postcopy and after the device state; in >> particular >> * spice needs to trigger a transition now >> */ >> - migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE); >> + if (migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE, errp)) { >> + goto fail; >> + } >> >> migration_downtime_end(ms); >> >> @@ -2555,11 +2571,10 @@ static int postcopy_start(MigrationState *ms, Error >> **errp) >> >> ret = qemu_file_get_error(ms->to_dst_file); >> if (ret) { >> - error_setg(errp, "postcopy_start: Migration stream errored"); >> - migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, >> - MIGRATION_STATUS_FAILED); >> + error_setg_errno(errp, -ret, "postcopy_start: Migration stream >> error"); >> + bql_lock(); >> + goto fail; >> } >> - >> trace_postcopy_preempt_enabled(migrate_postcopy_preempt()); >> >> return ret; >> @@ -2580,6 +2595,7 @@ fail: >> error_report_err(local_err); >> } >> } >> + migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL); >> bql_unlock(); >> return -1; >> } >> @@ -3594,7 +3610,9 @@ void migrate_fd_connect(MigrationState *s, Error >> *error_in) >> rate_limit = migrate_max_bandwidth(); >> >> /* Notify before starting migration thread */ >> - migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP); >> + if (migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP, >> &local_err)) { >> + goto fail; >> + } >> } >> >> migration_rate_set(rate_limit); >> -- >> 1.8.3.1 >> >