Hi Michael,
  No surprise here, I did see some of the same failure messages and they
prompted me to submit the fix.  They are all symptoms of "the possibility of
ram and device state being out of sync" as mentioned in the commit.

I am not familiar with the process for maintaining old releases for qemu.
Perhaps someone on this list can comment on 8.2.3.

- Steve

On 5/13/2024 2:22 PM, Michael Galaxy wrote:
Hi Steve,

We found that this specific change in particular ("migration: stop vm for cpr") fixes a bug that we've identified in testing back-to-back live updates in a lab environment.

More specifically, *without* this change (which is not available in 8.2.2, but *is* available in 9.0.0) causes the metadata save file to be corrupted when doing live updates one after another. Typically we see a corrupted save file somewhere in between 20 and 30 live updates and while doing a git bisect, we found that this change makes the problem go away.

Were you aware? Is there any plan in place to cherry pick this for 8.2.3, perhaps or a plan to release 8.2.3 at some point?

Here are some examples of how the bug manifests in different locations of the QEMU metadata save file:

2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load mtrr_var:base
2024-04-26T13:28:53Z qemu-system-x86_64: Failed to load cpu:env.mtrr_var
2024-04-26T13:28:53Z qemu-system-x86_64: error while loading state for instance 
0x1b of device 'cpu'
2024-04-26T13:28:53Z qemu-system-x86_64: load of migration failed: Input/output 
error

And another:

2024-04-17T16:09:47Z qemu-system-x86_64: check_section_footer: Read section 
footer failed: -5
2024-04-17T16:09:47Z qemu-system-x86_64: load of migration failed: Invalid 
argument

And another:

2024-04-30T21:53:29Z qemu-system-x86_64: Unable to read ID string for section 
163
2024-04-30T21:53:29Z qemu-system-x86_64: load of migration failed: Invalid 
argument

And another:

2024-05-01T16:01:44Z qemu-system-x86_64: Unable to read ID string for section 
164
2024-05-01T16:01:44Z qemu-system-x86_64: load of migration failed: Invalid 
argument
As you can see, they occur quite randomly, but generally it takes at least 20-30+ live updates before the problem occurs.

- Michael

On 2/27/24 23:13, pet...@redhat.com wrote:
From: Steve Sistare<steven.sist...@oracle.com>

When migration for cpr is initiated, stop the vm and set state
RUN_STATE_FINISH_MIGRATE before ram is saved.  This eliminates the
possibility of ram and device state being out of sync, and guarantees
that a guest in the suspended state remains suspended, because qmp_cont
rejects a cont command in the RUN_STATE_FINISH_MIGRATE state.

Signed-off-by: Steve Sistare<steven.sist...@oracle.com>
Reviewed-by: Peter Xu<pet...@redhat.com>
Link:https://urldefense.com/v3/__https://lore.kernel.org/r/1708622920-68779-11-git-send-email-steven.sistare@oracle.com__;!!GjvTz_vk!QLsFOCX-x2U9bzAo98SdidKlomHrmf_t0UmQKtgudoIcaDVoAJOPm39ZqaNP_nT5I8QqVfSgwhDZmg$ Signed-off-by: Peter Xu<pet...@redhat.com>
---
  include/migration/misc.h |  1 +
  migration/migration.h    |  2 --
  migration/migration.c    | 51 ++++++++++++++++++++++++----------------
  3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index e4933b815b..5d1aa593ed 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -60,6 +60,7 @@ void migration_object_init(void);
  void migration_shutdown(void);
  bool migration_is_idle(void);
  bool migration_is_active(MigrationState *);
+bool migrate_mode_is_cpr(MigrationState *);
typedef enum MigrationEventType {
      MIG_EVENT_PRECOPY_SETUP,
diff --git a/migration/migration.h b/migration/migration.h
index aef8afbe1f..65c0b61cbd 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s);
   */
  void migration_rp_kick(MigrationState *s);
-int migration_stop_vm(RunState state);
-
  #endif
diff --git a/migration/migration.c b/migration/migration.c
index 37c836b0b0..90a90947fb 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -167,11 +167,19 @@ static gint page_request_addr_cmp(gconstpointer ap, 
gconstpointer bp)
      return (a > b) - (a < b);
  }
-int migration_stop_vm(RunState state)
+static int migration_stop_vm(MigrationState *s, RunState state)
  {
-    int ret = vm_stop_force_state(state);
+    int ret;
+
+    migration_downtime_start(s);
+
+    s->vm_old_state = runstate_get();
+    global_state_store();
+
+    ret = vm_stop_force_state(state);
trace_vmstate_downtime_checkpoint("src-vm-stopped");
+    trace_migration_completion_vm_stop(ret);
return ret;
  }
@@ -1602,6 +1610,11 @@ bool migration_is_active(MigrationState *s)
              s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
  }
+bool migrate_mode_is_cpr(MigrationState *s)
+{
+    return s->parameters.mode == MIG_MODE_CPR_REBOOT;
+}
+
  int migrate_init(MigrationState *s, Error **errp)
  {
      int ret;
@@ -2454,10 +2467,7 @@ static int postcopy_start(MigrationState *ms, Error 
**errp)
      bql_lock();
      trace_postcopy_start_set_run();
- migration_downtime_start(ms);
-
-    global_state_store();
-    ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
+    ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE);
      if (ret < 0) {
          goto fail;
      }
@@ -2652,15 +2662,12 @@ static int migration_completion_precopy(MigrationState 
*s,
      int ret;
bql_lock();
-    migration_downtime_start(s);
-
-    s->vm_old_state = runstate_get();
-    global_state_store();
- ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
-    trace_migration_completion_vm_stop(ret);
-    if (ret < 0) {
-        goto out_unlock;
+    if (!migrate_mode_is_cpr(s)) {
+        ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
+        if (ret < 0) {
+            goto out_unlock;
+        }
      }
ret = migration_maybe_pause(s, current_active_state,
@@ -3500,15 +3507,10 @@ static void *bg_migration_thread(void *opaque)
      s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
trace_migration_thread_setup_complete();
-    migration_downtime_start(s);
bql_lock(); - s->vm_old_state = runstate_get();
-
-    global_state_store();
-    /* Forcibly stop VM before saving state of vCPUs and devices */
-    if (migration_stop_vm(RUN_STATE_PAUSED)) {
+    if (migration_stop_vm(s, RUN_STATE_PAUSED)) {
          goto fail;
      }
      /*
@@ -3584,6 +3586,7 @@ void migrate_fd_connect(MigrationState *s, Error 
*error_in)
      Error *local_err = NULL;
      uint64_t rate_limit;
      bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
+    int ret;
/*
       * If there's a previous error, free it and prepare for another one.
@@ -3655,6 +3658,14 @@ void migrate_fd_connect(MigrationState *s, Error 
*error_in)
          return;
      }
+ if (migrate_mode_is_cpr(s)) {
+        ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
+        if (ret < 0) {
+            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
+            goto fail;
+        }
+    }
+
      if (migrate_background_snapshot()) {
          qemu_thread_create(&s->thread, "bg_snapshot",
                  bg_migration_thread, s, QEMU_THREAD_JOINABLE);

Reply via email to