On 2/20/2024 2:12 AM, Peter Xu wrote:
> On Thu, Feb 08, 2024 at 10:54:02AM -0800, Steve Sistare wrote:
>> Check the status returned by migration notifiers and report errors.
>> If notifiers fail, call the notifiers again so they can clean up.
>> None of the notifiers return an error status at this time.
>>
>> Signed-off-by: Steve Sistare <steven.sist...@oracle.com>
>> ---
>>  include/migration/misc.h |  3 ++-
>>  migration/migration.c    | 40 +++++++++++++++++++++++++++++-----------
>>  2 files changed, 31 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/migration/misc.h b/include/migration/misc.h
>> index 0ea1902..6dc234b 100644
>> --- a/include/migration/misc.h
>> +++ b/include/migration/misc.h
>> @@ -82,7 +82,8 @@ void migration_add_notifier(NotifierWithReturn *notify,
>>  void migration_add_notifier_mode(NotifierWithReturn *notify,
>>                                   MigrationNotifyFunc func, MigMode mode);
>>  void migration_remove_notifier(NotifierWithReturn *notify);
>> -void migration_call_notifiers(MigrationState *s, MigrationEventType type);
>> +int migration_call_notifiers(MigrationState *s, MigrationEventType type,
>> +                             Error **errp);
>>  bool migration_in_setup(MigrationState *);
>>  bool migration_has_finished(MigrationState *);
>>  bool migration_has_failed(MigrationState *);
>> diff --git a/migration/migration.c b/migration/migration.c
>> index 01d8867..d1fce9e 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -1318,6 +1318,8 @@ void migrate_set_state(int *state, int old_state, int 
>> new_state)
>>  
>>  static void migrate_fd_cleanup(MigrationState *s)
>>  {
>> +    Error *local_err = NULL;
>> +
>>      g_free(s->hostname);
>>      s->hostname = NULL;
>>      json_writer_free(s->vmdesc);
>> @@ -1362,13 +1364,23 @@ static void migrate_fd_cleanup(MigrationState *s)
>>                            MIGRATION_STATUS_CANCELLED);
>>      }
>>  
>> +    if (!migration_has_failed(s) &&
>> +        migration_call_notifiers(s, MIG_EVENT_PRECOPY_DONE, &local_err)) {
>> +
>> +        migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
>> +        migrate_set_error(s, local_err);
>> +        error_free(local_err);
>> +    }
>> +
>>      if (s->error) {
>>          /* It is used on info migrate.  We can't free it */
>>          error_report_err(error_copy(s->error));
>>      }
>> -    migration_call_notifiers(s, s->state == MIGRATION_STATUS_COMPLETED ?
>> -                             MIG_EVENT_PRECOPY_DONE :
>> -                             MIG_EVENT_PRECOPY_FAILED);
>> +
>> +    if (migration_has_failed(s)) {
>> +        migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
>> +    }
> 
> AFAIU, the whole point of such split is, allowing DONE notifies to fail too
> and then if that happens we can invoke FAIL notifiers again.

Correct.

> 
> Perhaps we can avoid that complexity, but rather document only SETUP
> notifiers can fail?
> 
> The problem is that failing a notifier at this stage (if migration already
> finished) can already be too late; dest QEMU can already have started
> running, so no way to roll back.  We can document that, check and assert
> for !SETUP cases to make sure error is never hit?

Makes sense. I will modify the patch as you suggest.

- Steve

>> +
>>      block_cleanup_parameters();
>>      yank_unregister_instance(MIGRATION_YANK_INSTANCE);
>>  }
>> @@ -1481,13 +1493,15 @@ void migration_remove_notifier(NotifierWithReturn 
>> *notify)
>>      }
>>  }
>>  
>> -void migration_call_notifiers(MigrationState *s, MigrationEventType type)
>> +int migration_call_notifiers(MigrationState *s, MigrationEventType type,
>> +                             Error **errp)
>>  {
>>      MigMode mode = s->parameters.mode;
>>      MigrationEvent e;
>>  
>>      e.type = type;
>> -    notifier_with_return_list_notify(&migration_state_notifiers[mode], &e, 
>> 0);
>> +    return 
>> notifier_with_return_list_notify(&migration_state_notifiers[mode],
>> +                                            &e, errp);
>>  }
>>  
>>  bool migration_in_setup(MigrationState *s)
>> @@ -2535,7 +2549,9 @@ static int postcopy_start(MigrationState *ms, Error 
>> **errp)
>>       * at the transition to postcopy and after the device state; in 
>> particular
>>       * spice needs to trigger a transition now
>>       */
>> -    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE);
>> +    if (migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE, errp)) {
>> +        goto fail;
>> +    }
>>  
>>      migration_downtime_end(ms);
>>  
>> @@ -2555,11 +2571,10 @@ static int postcopy_start(MigrationState *ms, Error 
>> **errp)
>>  
>>      ret = qemu_file_get_error(ms->to_dst_file);
>>      if (ret) {
>> -        error_setg(errp, "postcopy_start: Migration stream errored");
>> -        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
>> -                              MIGRATION_STATUS_FAILED);
>> +        error_setg_errno(errp, -ret, "postcopy_start: Migration stream 
>> error");
>> +        bql_lock();
>> +        goto fail;
>>      }
>> -
>>      trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
>>  
>>      return ret;
>> @@ -2580,6 +2595,7 @@ fail:
>>              error_report_err(local_err);
>>          }
>>      }
>> +    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
>>      bql_unlock();
>>      return -1;
>>  }
>> @@ -3594,7 +3610,9 @@ void migrate_fd_connect(MigrationState *s, Error 
>> *error_in)
>>          rate_limit = migrate_max_bandwidth();
>>  
>>          /* Notify before starting migration thread */
>> -        migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP);
>> +        if (migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP, 
>> &local_err)) {
>> +            goto fail;
>> +        }
>>      }
>>  
>>      migration_rate_set(rate_limit);
>> -- 
>> 1.8.3.1
>>
> 

Reply via email to