* Peter Xu (pet...@redhat.com) wrote: > Introduce a new migration state "postcopy-recover". If a migration > procedure is paused and the connection is rebuilt afterward > successfully, we'll switch the source VM state from "postcopy-paused" to > the new state "postcopy-recover", then we'll do the resume logic in the > migration thread (along with the return path thread). > > This patch only does the state switch on the source side. A follow-up > patch will handle the state switching on the destination side using the same > status bit. > > Signed-off-by: Peter Xu <pet...@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com> (Although it's a little large, so you may want to split it) > --- > migration/migration.c | 76 > ++++++++++++++++++++++++++++++++++++++------------- > qapi-schema.json | 4 ++- > 2 files changed, 60 insertions(+), 20 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index deb947b..30dd566 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -495,6 +495,7 @@ static bool migration_is_setup_or_active(int state) > case MIGRATION_STATUS_ACTIVE: > case MIGRATION_STATUS_POSTCOPY_ACTIVE: > case MIGRATION_STATUS_POSTCOPY_PAUSED: > + case MIGRATION_STATUS_POSTCOPY_RECOVER: > case MIGRATION_STATUS_SETUP: > return true; > > @@ -571,6 +572,7 @@ MigrationInfo *qmp_query_migrate(Error **errp) > case MIGRATION_STATUS_CANCELLING: > case MIGRATION_STATUS_POSTCOPY_ACTIVE: > case MIGRATION_STATUS_POSTCOPY_PAUSED: > + case MIGRATION_STATUS_POSTCOPY_RECOVER: > /* TODO add some postcopy stats */ > info->has_status = true; > info->has_total_time = true; > @@ -2035,6 +2037,13 @@ typedef enum MigThrError { > MIG_THR_ERR_FATAL = 2, > } MigThrError; > > +/* Return zero if success, or <0 for error */ > +static int postcopy_do_resume(MigrationState *s) > +{ > + /* TODO: do the resume logic */ > + return 0; > +} > + > /* > * We don't return until we are in a safe state to continue current > * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or > @@ -2043,29 +2052,55 @@ typedef enum MigThrError { > static MigThrError postcopy_pause(MigrationState *s) > { > assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > - migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, > - MIGRATION_STATUS_POSTCOPY_PAUSED); > > - /* Current channel is possibly broken. Release it. 
*/ > - assert(s->to_dst_file); > - qemu_file_shutdown(s->to_dst_file); > - qemu_fclose(s->to_dst_file); > - s->to_dst_file = NULL; > + while (true) { > + migrate_set_state(&s->state, s->state, > + MIGRATION_STATUS_POSTCOPY_PAUSED); > > - error_report("Detected IO failure for postcopy. " > - "Migration paused."); > + /* Current channel is possibly broken. Release it. */ > + assert(s->to_dst_file); > + qemu_file_shutdown(s->to_dst_file); > + qemu_fclose(s->to_dst_file); > + s->to_dst_file = NULL; > > - /* > - * We wait until things fixed up. Then someone will setup the > - * status back for us. > - */ > - while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { > - qemu_sem_wait(&s->postcopy_pause_sem); > - } > + error_report("Detected IO failure for postcopy. " > + "Migration paused."); > + > + /* > + * We wait until things fixed up. Then someone will setup the > + * status back for us. > + */ > + while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { > + qemu_sem_wait(&s->postcopy_pause_sem); > + } > > - trace_postcopy_pause_continued(); > + if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) { > + /* Woken up by a recover procedure. Give it a shot */ > + > + /* > + * Firstly, let's wake up the return path now, with a new > + * return path channel. > + */ > + qemu_sem_post(&s->postcopy_pause_rp_sem); > > - return MIG_THR_ERR_RECOVERED; > + /* Do the resume logic */ > + if (postcopy_do_resume(s) == 0) { > + /* Let's continue! */ > + trace_postcopy_pause_continued(); > + return MIG_THR_ERR_RECOVERED; > + } else { > + /* > + * Something wrong happened during the recovery, let's > + * pause again. Pause is always better than throwing > + * data away. > + */ > + continue; > + } > + } else { > + /* This is not right... Time to quit. 
*/ > + return MIG_THR_ERR_FATAL; > + } > + } > } > > static MigThrError migration_detect_error(MigrationState *s) > @@ -2330,7 +2365,10 @@ void migrate_fd_connect(MigrationState *s) > } > > if (resume) { > - /* TODO: do the resume logic */ > + /* Wakeup the main migration thread to do the recovery */ > + migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED, > + MIGRATION_STATUS_POSTCOPY_RECOVER); > + qemu_sem_post(&s->postcopy_pause_sem); > return; > } > > diff --git a/qapi-schema.json b/qapi-schema.json > index ba41f2c..989f95a 100644 > --- a/qapi-schema.json > +++ b/qapi-schema.json > @@ -669,6 +669,8 @@ > # > # @postcopy-paused: during postcopy but paused. (since 2.11) > # > +# @postcopy-recover: trying to recover from a paused postcopy. (since 2.11) > +# > # @completed: migration is finished. > # > # @failed: some error occurred during migration process. > @@ -682,7 +684,7 @@ > { 'enum': 'MigrationStatus', > 'data': [ 'none', 'setup', 'cancelling', 'cancelled', > 'active', 'postcopy-active', 'postcopy-paused', > - 'completed', 'failed', 'colo' ] } > + 'postcopy-recover', 'completed', 'failed', 'colo' ] } > > ## > # @MigrationInfo: > -- > 2.7.4 > > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK