Signed-off-by: Md Haris Iqbal <haris.p...@gmail.com> --- hmp-commands.hx | 20 +++--- hmp.c | 4 +- include/migration/migration.h | 4 ++ migration/migration.c | 153 +++++++++++++++++++++++++++++++++++++----- qapi-schema.json | 16 ++++- qmp-commands.hx | 3 +- vl.c | 4 ++ 7 files changed, 174 insertions(+), 30 deletions(-)
diff --git a/hmp-commands.hx b/hmp-commands.hx index 848efee..8f765fd 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -894,23 +894,25 @@ ETEXI { .name = "migrate", - .args_type = "detach:-d,blk:-b,inc:-i,uri:s", - .params = "[-d] [-b] [-i] uri", + .args_type = "detach:-d,recover:-r,blk:-b,inc:-i,uri:s", + .params = "[-d] [-r] [-b] [-i] uri", .help = "migrate to URI (using -d to not wait for completion)" - "\n\t\t\t -b for migration without shared storage with" - " full copy of disk\n\t\t\t -i for migration without " - "shared storage with incremental copy of disk " - "(base image shared between src and destination)", + "\n\t\t\t -r to recover from a broken migration\n\t\t\t" + " -b for migration without shared storage with" + " full copy of disk\n\t\t\t -i for migration without " + "shared storage with incremental copy of disk " + "(base image shared between src and destination)", .mhandler.cmd = hmp_migrate, }, STEXI -@item migrate [-d] [-b] [-i] @var{uri} +@item migrate [-d] [-r] [-b] [-i] @var{uri} @findex migrate Migrate to @var{uri} (using -d to not wait for completion). 
- -b for migration with full copy of disk - -i for migration with incremental copy of disk (base image is shared) + -r to recover from a broken migration + -b for migration with full copy of disk + -i for migration with incremental copy of disk (base image is shared) ETEXI { diff --git a/hmp.c b/hmp.c index cc2056e..02ed457 100644 --- a/hmp.c +++ b/hmp.c @@ -1563,12 +1563,14 @@ static void hmp_migrate_status_cb(void *opaque) void hmp_migrate(Monitor *mon, const QDict *qdict) { bool detach = qdict_get_try_bool(qdict, "detach", false); + bool recover = qdict_get_try_bool(qdict, "recover", false); bool blk = qdict_get_try_bool(qdict, "blk", false); bool inc = qdict_get_try_bool(qdict, "inc", false); const char *uri = qdict_get_str(qdict, "uri"); Error *err = NULL; - qmp_migrate(uri, !!blk, blk, !!inc, inc, false, false, &err); + qmp_migrate(uri, !!recover, recover, !!blk, blk, !!inc, inc, false, false, + &err); if (err) { error_report_err(err); return; diff --git a/include/migration/migration.h b/include/migration/migration.h index 3c96623..bcaf55d 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -142,6 +142,7 @@ struct MigrationState int state; /* Old style params from 'migrate' command */ MigrationParams params; + bool in_recovery; /* State related to return path */ struct { @@ -351,6 +352,9 @@ void flush_page_queue(MigrationState *ms); int ram_save_queue_pages(MigrationState *ms, const char *rbname, ram_addr_t start, ram_addr_t len); +int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms); +int qemu_migrate_postcopy_incoming_recovery(QEMUFile **f,MigrationIncomingState* mis); + PostcopyState postcopy_state_get(void); /* Set the state and return the old state */ PostcopyState postcopy_state_set(PostcopyState new_state); diff --git a/migration/migration.c b/migration/migration.c index 955d5ee..6ed2e82 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -709,6 +709,33 @@ MigrationInfo *qmp_query_migrate(Error 
**errp) case MIGRATION_STATUS_CANCELLED: info->has_status = true; break; + case MIGRATION_STATUS_POSTCOPY_RECOVERY: + info->has_status = true; + info->has_total_time = true; + info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + info->has_ram = true; + info->ram = g_malloc0(sizeof(*info->ram)); + info->ram->transferred = ram_bytes_transferred(); + info->ram->remaining = ram_bytes_remaining(); + info->ram->total = ram_bytes_total(); + info->ram->duplicate = dup_mig_pages_transferred(); + info->ram->skipped = skipped_mig_pages_transferred(); + info->ram->normal = norm_mig_pages_transferred(); + info->ram->normal_bytes = norm_mig_bytes_transferred(); + info->ram->dirty_pages_rate = s->dirty_pages_rate; + info->ram->mbps = s->mbps; + info->ram->dirty_sync_count = s->dirty_sync_count; + + if (blk_mig_active()) { + info->has_disk = true; + info->disk = g_malloc0(sizeof(*info->disk)); + info->disk->transferred = blk_mig_bytes_transferred(); + info->disk->remaining = blk_mig_bytes_remaining(); + info->disk->total = blk_mig_bytes_total(); + } + + get_xbzrle_cache_stats(info); } info->status = s->state; @@ -993,6 +1020,7 @@ MigrationState *migrate_init(const MigrationParams *params) s->xfer_limit = 0; s->cleanup_bh = 0; s->to_dst_file = NULL; + s->in_recovery = false; s->state = MIGRATION_STATUS_NONE; s->params = *params; s->rp_state.from_dst_file = NULL; @@ -1069,13 +1097,14 @@ bool migration_is_blocked(Error **errp) return false; } -void qmp_migrate(const char *uri, bool has_blk, bool blk, - bool has_inc, bool inc, bool has_detach, bool detach, +void qmp_migrate(const char *uri, bool in_recover, bool recover, bool has_blk, + bool blk, bool has_inc, bool inc, bool has_detach, bool detach, Error **errp) { Error *local_err = NULL; MigrationState *s = migrate_get_current(); MigrationParams params; + bool recovery = in_recover && recover; const char *p; params.blk = has_blk && blk; @@ -1095,7 +1124,39 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, 
return; } - s = migrate_init(&params); + if (recovery ^ atomic_mb_read(&s->in_recovery)) { + if (recovery) { + /* No VM is waiting for recovery and + * recovery option was set + */ + + error_setg(errp, "No VM to recover"); + return; + } else { + /* A VM is waiting for recovery and + * no recovery option is set + */ + + error_setg(errp, "A migration is in recovery state"); + return; + } + } else { + if (!recovery) { + /* No VM is waiting for recovery and + * no recovery option is set + */ + s = migrate_init(&params); + } else { + /* A VM is waiting for recovery and + * recovery option was set + */ + s->to_dst_file = NULL; + if (s->rp_state.from_dst_file) { + /* shutdown the rp socket, so causing the rp thread to shutdown */ + qemu_file_shutdown(s->rp_state.from_dst_file); + } + } + } if (strstart(uri, "tcp:", &p)) { tcp_start_outgoing_migration(s, p, &local_err); @@ -1336,6 +1397,8 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, */ static void *source_return_path_thread(void *opaque) { + fprintf(stderr, "Return path started on source\n"); + MigrationState *ms = opaque; QEMUFile *rp = ms->rp_state.from_dst_file; uint16_t header_len, header_type; @@ -1439,8 +1502,8 @@ static void *source_return_path_thread(void *opaque) trace_source_return_path_thread_end(); out: - ms->rp_state.from_dst_file = NULL; qemu_fclose(rp); + fprintf(stderr, "Return path failed on source\n"); return NULL; } @@ -1714,6 +1777,7 @@ static void *migration_thread(void *opaque) bool entered_postcopy = false; /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; + int ret; rcu_register_thread(); @@ -1781,7 +1845,26 @@ static void *migration_thread(void *opaque) } } - if (qemu_file_get_error(s->to_dst_file)) { + if ((ret = qemu_file_get_error(s->to_dst_file))) { + /* This check is based on how the error is set during the network + * recv(). When recv() returns 0 (i.e. 
no data to read), the error + * is set to -EIO. For all other network errors, it is set + * according to the return value received. + */ + if (ret == -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { + /* Network Failure during postcopy */ + + current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY; + runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY); + ret = qemu_migrate_postcopy_outgoing_recovery(s); + if(ret == 0) { + current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE; + runstate_set(RUN_STATE_FINISH_MIGRATE); + qemu_file_clear_error(s->to_dst_file); + continue; + } + + } migrate_set_state(&s->state, current_active_state, MIGRATION_STATUS_FAILED); trace_migration_thread_file_err(); @@ -1852,17 +1935,6 @@ static void *migration_thread(void *opaque) void migrate_fd_connect(MigrationState *s) { - /* This is a best 1st approximation. ns to ms */ - s->expected_downtime = max_downtime/1000000; - s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s); - - qemu_file_set_blocking(s->to_dst_file, true); - qemu_file_set_rate_limit(s->to_dst_file, - s->bandwidth_limit / XFER_LIMIT_RATIO); - - /* Notify before starting migration thread */ - notifier_list_notify(&migration_state_notifiers, s); - /* * Open the return path; currently for postcopy but other things might * also want it. @@ -1877,12 +1949,61 @@ void migrate_fd_connect(MigrationState *s) } } + qemu_file_set_blocking(s->to_dst_file, true); + qemu_file_set_rate_limit(s->to_dst_file, + s->bandwidth_limit / XFER_LIMIT_RATIO); + + if (atomic_mb_read(&s->in_recovery)) { + qemu_mutex_lock(&migration_recovery_mutex); + atomic_mb_set(&s->in_recovery, false); + qemu_cond_signal(&migration_recovery_cond); + qemu_mutex_unlock(&migration_recovery_mutex); + + fprintf(stderr, "recovered\n"); + return; + } + + /* This is a best 1st approximation. 
ns to ms */ + s->expected_downtime = max_downtime/1000000; + s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s); + + + /* Notify before starting migration thread */ + notifier_list_notify(&migration_state_notifiers, s); + migrate_compress_threads_create(); qemu_thread_create(&s->thread, "migration", migration_thread, s, QEMU_THREAD_JOINABLE); s->migration_thread_running = true; } +int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms) +{ + migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, + MIGRATION_STATUS_POSTCOPY_RECOVERY); + + atomic_mb_set(&ms->in_recovery, true); + /* Code for network recovery to be added here */ + qemu_mutex_lock(&migration_recovery_mutex); + while(atomic_mb_read(&ms->in_recovery) == true) { + fprintf(stderr, "Under recovery, not letting it fail %p\n", ms->to_dst_file); + qemu_cond_wait(&migration_recovery_cond, &migration_recovery_mutex); + } + qemu_mutex_unlock(&migration_recovery_mutex); + + if(ms->to_dst_file != NULL) { + /* Recovery successful */ + migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_RECOVERY, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + + qemu_savevm_send_open_return_path(ms->to_dst_file); + return 0; + } + + return -1; + +} + PostcopyState postcopy_state_get(void) { return atomic_mb_read(&incoming_postcopy_state); diff --git a/qapi-schema.json b/qapi-schema.json index 5658723..a658462 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -154,12 +154,15 @@ # @watchdog: the watchdog action is configured to pause and has been triggered # # @guest-panicked: guest has been panicked as a result of guest OS panic +# +# @postmigrate-recovery: guest is paused for recovery after a network failure +# (since 2.7) ## { 'enum': 'RunState', 'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused', 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm', 'running', 'save-vm', 'shutdown', 'suspended', 'watchdog', - 'guest-panicked', 'postmigrate-recovery' ] } ## # 
@StatusInfo: @@ -438,12 +441,15 @@ # # @failed: some error occurred during migration process. # +# @postcopy-recovery: in recovery mode, after a network failure. (since 2.7) +# # Since: 2.3 # ## { 'enum': 'MigrationStatus', 'data': [ 'none', 'setup', 'cancelling', 'cancelled', - 'active', 'postcopy-active', 'completed', 'failed' ] } + 'active', 'postcopy-active', 'completed', 'failed', + 'postcopy-recovery' ] } ## # @MigrationInfo @@ -2119,6 +2125,8 @@ # # @uri: the Uniform Resource Identifier of the destination VM # +# @recover: #optional recover from a broken migration (since 2.7) +# # @blk: #optional do block migration (full disk copy) # # @inc: #optional incremental disk copy migration @@ -2131,7 +2139,7 @@ # Since: 0.14.0 ## { 'command': 'migrate', - 'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } } + 'data': {'uri': 'str', '*recover': 'bool', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } } ## # @migrate-incoming @@ -2142,6 +2150,8 @@ # @uri: The Uniform Resource Identifier identifying the source or # address to listen on # +# @recover: #optional recover from a broken migration (since 2.7) +# # Returns: nothing on success # # Since: 2.3 diff --git a/qmp-commands.hx b/qmp-commands.hx index 6866264..dd727bf 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -639,7 +639,7 @@ EQMP { .name = "migrate", - .args_type = "detach:-d,blk:-b,inc:-i,uri:s", + .args_type = "detach:-d,recover:-r,blk:-b,inc:-i,uri:s", .mhandler.cmd_new = qmp_marshal_migrate, }, @@ -651,6 +651,7 @@ Migrate to URI. 
Arguments: +- "recover": recover migration (json-bool, optional) - "blk": block migration, full disk copy (json-bool, optional) - "inc": incremental disk copy (json-bool, optional) - "uri": Destination URI (json-string) diff --git a/vl.c b/vl.c index b3c80d5..f702886 100644 --- a/vl.c +++ b/vl.c @@ -597,6 +597,10 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY }, + + { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN }, { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING }, { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH }, -- 2.7.4