On 11/20/2023 1:13 PM, Fabiano Rosas wrote: > Steve Sistare <steven.sist...@oracle.com> writes: > >> Restoring a snapshot can break a suspended guest. Snapshots suffer from >> the same suspended-state issues that affect live migration, plus they must >> handle an additional problematic scenario, which is that a running vm must >> remain running if it loads a suspended snapshot. >> >> To save, call vm_stop_force_state to completely stop a vm in the suspended >> state, and restore the suspended state using runstate_restore. This >> produces a correct vmstate file and leaves the vm in the state it had prior >> to the save. >> >> To load, if the snapshot is not suspended, then vm_stop_force_state + >> runstate_restore correctly handles all states, and leaves the vm in the >> state it had prior to the load. However, if the snapshot is suspended, >> restoration is trickier. First restore the state to suspended so the >> current state matches the saved state. Then, if the pre-load state is >> running, wakeup to resume running. >> >> Prior to these changes, the vm_stop to RUN_STATE_SAVE_VM and >> RUN_STATE_RESTORE_VM did not change runstate if the current state was >> paused, suspended, or prelaunch, but now vm_stop_force_state forces these >> transitions, so allow them. >> >> Signed-off-by: Steve Sistare <steven.sist...@oracle.com> >> --- >> include/migration/snapshot.h | 7 +++++++ >> migration/migration-hmp-cmds.c | 12 ++++++++---- >> migration/savevm.c | 33 +++++++++++++++++++++------------ >> system/runstate.c | 10 ++++++++++ >> system/vl.c | 2 ++ >> 5 files changed, 48 insertions(+), 16 deletions(-) >> >> diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h >> index e72083b..9e4dcaa 100644 >> --- a/include/migration/snapshot.h >> +++ b/include/migration/snapshot.h >> @@ -16,6 +16,7 @@ >> #define QEMU_MIGRATION_SNAPSHOT_H >> >> #include "qapi/qapi-builtin-types.h" >> +#include "qapi/qapi-types-run-state.h" >> >> /** >> * save_snapshot: Save an internal snapshot. >> @@ -61,4 +62,10 @@ bool delete_snapshot(const char *name, >> bool has_devices, strList *devices, >> Error **errp); >> >> +/** >> + * load_snapshot_resume: Restore runstate after loading snapshot. >> + * @state: state to restore >> + */ >> +void load_snapshot_resume(RunState state); >> + >> #endif >> diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c >> index 86ae832..c31cdc7 100644 >> --- a/migration/migration-hmp-cmds.c >> +++ b/migration/migration-hmp-cmds.c >> @@ -399,15 +399,19 @@ void hmp_info_migrate_parameters(Monitor *mon, const >> QDict *qdict) >> >> void hmp_loadvm(Monitor *mon, const QDict *qdict) >> { >> - int saved_vm_running = runstate_is_running(); >> + RunState saved_state = runstate_get(); >> + >> const char *name = qdict_get_str(qdict, "name"); >> Error *err = NULL; >> >> - vm_stop(RUN_STATE_RESTORE_VM); >> + vm_stop_force_state(RUN_STATE_RESTORE_VM); >> >> - if (load_snapshot(name, NULL, false, NULL, &err) && saved_vm_running) { >> - vm_start(); >> + if (load_snapshot(name, NULL, false, NULL, &err)) { >> + load_snapshot_resume(saved_state); >> + } else { >> + vm_resume(saved_state); > > Here we're starting the VM if load_snapshot() fails. Is that > intentional?
My bad, good catch, I will delete the else clause. >> } >> + >> hmp_handle_error(mon, err); >> } >> >> diff --git a/migration/savevm.c b/migration/savevm.c >> index 78ac2bd..b4b49bb 100644 >> --- a/migration/savevm.c >> +++ b/migration/savevm.c >> @@ -3040,7 +3040,7 @@ bool save_snapshot(const char *name, bool overwrite, >> const char *vmstate, >> QEMUSnapshotInfo sn1, *sn = &sn1; >> int ret = -1, ret2; >> QEMUFile *f; >> - int saved_vm_running; >> + RunState saved_state = runstate_get(); >> uint64_t vm_state_size; >> g_autoptr(GDateTime) now = g_date_time_new_now_local(); >> AioContext *aio_context; >> @@ -3088,10 +3088,8 @@ bool save_snapshot(const char *name, bool overwrite, >> const char *vmstate, >> } >> aio_context = bdrv_get_aio_context(bs); >> >> - saved_vm_running = runstate_is_running(); >> - >> global_state_store(); >> - vm_stop(RUN_STATE_SAVE_VM); >> + vm_stop_force_state(RUN_STATE_SAVE_VM); >> >> bdrv_drain_all_begin(); >> >> @@ -3157,9 +3155,7 @@ bool save_snapshot(const char *name, bool overwrite, >> const char *vmstate, >> >> bdrv_drain_all_end(); >> >> - if (saved_vm_running) { >> - vm_start(); >> - } >> + vm_resume(saved_state); >> return ret == 0; >> } >> >> @@ -3333,6 +3329,20 @@ err_drain: >> return false; >> } >> >> +void load_snapshot_resume(RunState state) >> +{ >> + if (global_state_received() && >> + global_state_get_runstate() == RUN_STATE_SUSPENDED) { >> + >> + vm_resume(RUN_STATE_SUSPENDED); >> + if (state == RUN_STATE_RUNNING) { >> + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); >> + } >> + } else { >> + vm_resume(state); >> + } >> +} >> + >> bool delete_snapshot(const char *name, bool has_devices, >> strList *devices, Error **errp) >> { >> @@ -3397,16 +3407,15 @@ static void snapshot_load_job_bh(void *opaque) >> { >> Job *job = opaque; >> SnapshotJob *s = container_of(job, SnapshotJob, common); >> - int orig_vm_running; >> + RunState orig_state = runstate_get(); >> >> job_progress_set_remaining(&s->common, 1); >> >> - orig_vm_running = runstate_is_running(); >> - vm_stop(RUN_STATE_RESTORE_VM); >> + vm_stop_force_state(RUN_STATE_RESTORE_VM); >> >> s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp); >> - if (s->ret && orig_vm_running) { >> - vm_start(); >> + if (s->ret) { >> + load_snapshot_resume(orig_state); > > Same here, we used to not start the VM if load_snapshot() failed. Here the behavior is the same as the old code. ret=1 means success. That inverted return code has misled us both :) >> } >> >> job_progress_update(&s->common, 1); >> diff --git a/system/runstate.c b/system/runstate.c >> index ea9d6c2..f1d4bc7 100644 >> --- a/system/runstate.c >> +++ b/system/runstate.c >> @@ -77,6 +77,8 @@ typedef struct { >> >> static const RunStateTransition runstate_transitions_def[] = { >> { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE }, >> + { RUN_STATE_PRELAUNCH, RUN_STATE_PAUSED }, >> + { RUN_STATE_PRELAUNCH, RUN_STATE_SUSPENDED }, >> >> { RUN_STATE_DEBUG, RUN_STATE_RUNNING }, >> { RUN_STATE_DEBUG, RUN_STATE_FINISH_MIGRATE }, >> @@ -108,6 +110,8 @@ static const RunStateTransition >> runstate_transitions_def[] = { >> { RUN_STATE_PAUSED, RUN_STATE_POSTMIGRATE }, >> { RUN_STATE_PAUSED, RUN_STATE_PRELAUNCH }, >> { RUN_STATE_PAUSED, RUN_STATE_COLO}, >> + { RUN_STATE_PAUSED, RUN_STATE_SAVE_VM}, >> + { RUN_STATE_PAUSED, RUN_STATE_RESTORE_VM}, >> >> { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING }, >> { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE }, >> @@ -131,6 +135,8 @@ static const RunStateTransition >> runstate_transitions_def[] = { >> >> { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING }, >> { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH }, >> + { RUN_STATE_RESTORE_VM, RUN_STATE_PAUSED }, >> + { RUN_STATE_RESTORE_VM, RUN_STATE_SUSPENDED }, >> >> { RUN_STATE_COLO, RUN_STATE_RUNNING }, >> { RUN_STATE_COLO, RUN_STATE_PRELAUNCH }, >> @@ -149,6 +155,8 @@ static const RunStateTransition >> runstate_transitions_def[] = { >> { RUN_STATE_RUNNING, RUN_STATE_COLO}, >> >> { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING }, >> + { RUN_STATE_SAVE_VM, RUN_STATE_PAUSED }, >> + { RUN_STATE_SAVE_VM, RUN_STATE_SUSPENDED }, >> >> { RUN_STATE_SHUTDOWN, RUN_STATE_PAUSED }, >> { RUN_STATE_SHUTDOWN, RUN_STATE_FINISH_MIGRATE }, >> @@ -161,6 +169,8 @@ static const RunStateTransition >> runstate_transitions_def[] = { >> { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, >> { RUN_STATE_SUSPENDED, RUN_STATE_PRELAUNCH }, >> { RUN_STATE_SUSPENDED, RUN_STATE_COLO}, >> + { RUN_STATE_SUSPENDED, RUN_STATE_SAVE_VM }, >> + { RUN_STATE_SUSPENDED, RUN_STATE_RESTORE_VM }, >> >> { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING }, >> { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE }, >> diff --git a/system/vl.c b/system/vl.c >> index bd7fad7..082a45a 100644 >> --- a/system/vl.c >> +++ b/system/vl.c >> @@ -2702,7 +2702,9 @@ void qmp_x_exit_preconfig(Error **errp) >> qemu_machine_creation_done(); >> >> if (loadvm) { >> + RunState state = autostart ? RUN_STATE_RUNNING : runstate_get(); >> load_snapshot(loadvm, NULL, false, NULL, &error_fatal); >> + load_snapshot_resume(state); > > Here it's using error_fatal, so it won't start the VM. Yes, same as the old code. - Steve