On 6/4/2026 8:18 PM, Peter Xu wrote:
External email: Use caution opening links or attachments


On Thu, Jun 04, 2026 at 06:29:01PM +0300, Avihai Horon wrote:
On 6/4/2026 12:04 AM, Peter Xu wrote:
External email: Use caution opening links or attachments


On Wed, Jun 03, 2026 at 04:05:49PM -0400, Peter Xu wrote:
On Tue, Jun 02, 2026 at 12:26:10PM +0300, Avihai Horon wrote:
Before switchover, the source needs one last exact pending query so
modules can flush dirty state. This is currently done ad hoc in modules
handlers. For example, RAM syncs its dirty bitmap in its save_complete
handler.

This should be a general concept relevant for any module, so extract it
to migration core instead by running a final save_query_pending before
switchover.

The final query requires special handling by modules (e.g., it's called
with BQL locked, during VM stop), so extend save_query_pending
SaveVMHandlers callback and qemu_savevm_query_pending() with a "final"
flag so migration modules can tell the last pending query during
switchover from periodic iteration queries.

Not all modules need the final query; skip it for those who don't need.

Signed-off-by: Avihai Horon <[email protected]>
---
   include/migration/register.h   | 41 ++++++++++++++++++----------------
   migration/savevm.h             |  3 ++-
   hw/s390x/s390-stattrib.c       |  9 ++++++--
   hw/vfio/migration.c            |  6 ++++-
   migration/block-dirty-bitmap.c |  6 ++++-
   migration/migration.c          |  7 ++++--
   migration/ram.c                | 37 ++++++++++++++++++------------
   migration/savevm.c             | 27 +++++++++++++++++-----
   migration/trace-events         |  2 +-
   9 files changed, 91 insertions(+), 47 deletions(-)

diff --git a/include/migration/register.h b/include/migration/register.h
index 5e5e0ee432..6f632123f1 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -171,6 +171,28 @@ typedef struct SaveVMHandlers {
        */
       bool (*is_active_iterate)(void *opaque);

+    /**
+     * @save_query_pending
+     *
+     * This estimates the remaining data to transfer on the source side.
+     *
+     * When @exact is true, a module must report accurate results.  When
+     * @exact is false, a module may report estimates.
+     *
+     * It's highly recommended that modules implement a faster version of
+     * the query path (for example, by proper caching on the counters) if
+     * an accurate query will be time-consuming.
+     *
+     * @opaque: data pointer passed to register_savevm_live()
+     * @pending: pointer to a MigPendingData struct
+     * @exact: set to true for an accurate (slow) query
+     * @final: set to true for the final query during switchover. When final is
+     * true, the query is called with BQL locked. Otherwise, it's called with
+     * BQL unlocked.
+     */
+    void (*save_query_pending)(void *opaque, MigPendingData *pending,
+                               bool exact, bool final);
+
       /* This runs outside the BQL in the migration case, and
        * within the lock in the savevm case.  The callback had better only
        * use data that is local to the migration thread or protected
@@ -210,25 +232,6 @@ typedef struct SaveVMHandlers {
        */
       bool (*save_postcopy_prepare)(QEMUFile *f, void *opaque, Error **errp);

-    /**
-     * @save_query_pending
-     *
-     * This estimates the remaining data to transfer on the source side.
-     *
-     * When @exact is true, a module must report accurate results.  When
-     * @exact is false, a module may report estimates.
-     *
-     * It's highly recommended that modules implement a faster version of
-     * the query path (for example, by proper caching on the counters) if
-     * an accurate query will be time-consuming.
-     *
-     * @opaque: data pointer passed to register_savevm_live()
-     * @pending: pointer to a MigPendingData struct
-     * @exact: set to true for an accurate (slow) query
-     */
-    void (*save_query_pending)(void *opaque, MigPendingData *pending,
-                               bool exact);
-
       /**
        * @load_state
        *
diff --git a/migration/savevm.h b/migration/savevm.h
index 96fdf96d4e..10b3d78a5f 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -45,7 +45,8 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
   void qemu_savevm_state_cleanup(void);
   void qemu_savevm_state_complete_postcopy(QEMUFile *f);
   int qemu_savevm_state_complete_precopy(MigrationState *s);
-void qemu_savevm_query_pending(MigPendingData *pending, bool exact);
+void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact);
+void qemu_savevm_query_pending_final(MigPendingData *pending);
   int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool 
in_postcopy);
   bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
   void qemu_savevm_state_end(QEMUFile *f);
diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
index c334714b31..aed0c6844d 100644
--- a/hw/s390x/s390-stattrib.c
+++ b/hw/s390x/s390-stattrib.c
@@ -190,12 +190,17 @@ static int cmma_save_setup(QEMUFile *f, void *opaque, 
Error **errp)
   }

   static void cmma_state_pending(void *opaque, MigPendingData *pending,
-                               bool exact)
+                               bool exact, bool final)
   {
       S390StAttribState *sas = S390_STATTRIB(opaque);
       S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
-    long long res = sac->get_dirtycount(sas);
+    long long res;

+    if (final) {
+        return;
+    }
+
+    res = sac->get_dirtycount(sas);
       if (res >= 0) {
           pending->precopy_bytes += res;
       }
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index fb12b9717f..95072d6664 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -622,12 +622,16 @@ static void vfio_state_pending_sync(VFIODevice *vbasedev)
   }

   static void vfio_state_pending(void *opaque, MigPendingData *pending,
-                               bool exact)
+                               bool exact, bool final)
   {
       VFIODevice *vbasedev = opaque;
       VFIOMigration *migration = vbasedev->migration;
       uint64_t precopy_size, stopcopy_size;

+    if (final) {
+        return;
+    }
+
       if (exact) {
           vfio_state_pending_sync(vbasedev);
       }
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 7ef3759e53..66ed91de03 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -767,12 +767,16 @@ static int dirty_bitmap_save_complete(QEMUFile *f, void 
*opaque)
   }

   static void dirty_bitmap_state_pending(void *opaque, MigPendingData *data,
-                                       bool exact)
+                                       bool exact, bool final)
   {
       DBMSaveState *s = &((DBMState *)opaque)->save;
       SaveBitmapState *dbms;
       uint64_t pending = 0;

+    if (final) {
+        return;
+    }
+
       bql_lock();

       QSIMPLEQ_FOREACH(dbms, &s->dbms_list, entry) {
diff --git a/migration/migration.c b/migration/migration.c
index 7488a94206..df8ed3a9fd 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2787,12 +2787,15 @@ static bool migration_switchover_prepare(MigrationState 
*s)
   static bool migration_switchover_start(MigrationState *s, Error **errp)
   {
       ERRP_GUARD();
+    MigPendingData pending = {};

       if (!migration_switchover_prepare(s)) {
           error_setg(errp, "Switchover is interrupted");
           return false;
       }

+    qemu_savevm_query_pending_final(&pending);
Not needed for postcopy, we can move it to migration_completion_precopy().
While currently not needed (because VFIO isn't compatible with postcopy),
AFAIU, generally it is needed because we have here a switchover similar to
precopy case -- the switchover decision and VM stop at stopcopy_start() are
not atomic, so theoretically a device may want to request an ACK in this
gap.

But that's all theoretical and not necessary currently.
True, then it makes more sense to keep it here.  You can enhance the
comment below with VFIO implications for the future DMA faults when it's
ready.

A better idea is.. we can drop migration_bitmap_sync() in
ram_postcopy_send_discard_bitmap() too, then we can keep this one.

But then, let's properly document this, both for precopy and postcopy
cases.  One example:

      /*
       * The final query to the whole system on dirty data to make sure we
       * collect the latest status of the VM.  For precopy, source QEMU will
       * dump all the dirty data during switchover.  For postcopy, this will
       * properly update all the dirty bitmaps to finally generate the
       * correct discard bitmaps; see ram_postcopy_send_discard_bitmap().
       */
The last dirty sync is done a bit differently for postcopy case:
   postcopy_start()
     ram_postcopy_send_discard_bitmap()
       migration_bitmap_sync(rs, false);

1. The last_stage parameter of migration_bitmap_sync() is false (shouldn't
it be true here as well?).
Yes, I think it should.

The only thing uses that last_stage so far should be ARM64 where some dirty
data can be generated to bitmap not ring (only matters when dirty ring
enabled).  In that case AFAIU we need that too for a postcopy switchover.

Ack.


2. It calls migration_bitmap_sync and not migration_bitmap_sync_precopy,
i.e., without precopy_notify calls.
This is another thing I feel like got overlooked in the free page hint
feature.  The only notifiers registered is that balloon device, logically I
think we need these notifiers in postcopy too to make sure we properly stop
the free page hints seeing BEFORE_BITMAP_SYNC, then don't start it
(vm_running=false) in AFTER_BITMAP_SYNC:

virtio_balloon_free_page_hint_notify():

     case PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC:
         virtio_balloon_free_page_stop(dev);
         break;
     case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC:
         if (vdev->vm_running) {
             virtio_balloon_free_page_start(dev);
             break;
         }

Actually, looking more closely in the code, I see that it's not used if postcopy is enabled [1]. So calling the precopy notifiers in postcopy switchover is basically a no-op.

Then it seems fine to call migration_bitmap_sync_precopy from postcopy switchover flow.

[1] See commit fd51e54fa102 ("virtio-balloon: don't start free page hinting if postcopy is possible")

If we need to keep the above differences for postcopy then I guess we'd need
to introduce different final query flow for it. It may be cumbersome.
Do you think we should do this extra effort to support it, or, as you
suggested, we can drop the final switchover ack check for postcopy for now?
IMHO you can add a small patch for replacing the postcopy sync first with
migration_bitmap_sync_precopy(last_stage=true), copy Gavin and Wei in that
patch only:

Gavin Shan <[email protected]>
Wei Wang <[email protected]>

Then this patch can be on top.

Sure, will do.


I'll loop both of them in while replying, to see if we can collect early
comments.

I'd also appreciate a short comment explaining why the final query is
needed.

+
       /* Inactivate disks except in COLO */
       if (!migrate_colo()) {
           /*
@@ -3285,7 +3288,7 @@ static void migration_iteration_go_next(MigPendingData 
*pending)
       /*
        * Do a slow sync first before boosting the iteration count.
        */
-    qemu_savevm_query_pending(pending, true);
+    qemu_savevm_query_pending_iter(pending, true);

       /*
        * Update the dirty information for the whole system for this
@@ -3336,7 +3339,7 @@ static MigIterateState 
migration_iteration_run(MigrationState *s)
       bool complete_ready;

       /* Fast path - get the estimated amount of pending data */
-    qemu_savevm_query_pending(&pending, false);
+    qemu_savevm_query_pending_iter(&pending, false);

       if (in_postcopy) {
           /*
diff --git a/migration/ram.c b/migration/ram.c
index fc38ffbf8a..4e3f442593 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3376,10 +3376,6 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
       rs->last_stage = !migration_in_colo_state();

       WITH_RCU_READ_LOCK_GUARD() {
-        if (!migration_in_postcopy()) {
-            migration_bitmap_sync_precopy(true);
-        }
-
           ret = rdma_registration_start(f, RAM_CONTROL_FINISH);
           if (ret < 0) {
               qemu_file_set_error(f, ret);
@@ -3442,25 +3438,38 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
       return qemu_fflush(f);
   }

-static void ram_state_pending(void *opaque, MigPendingData *pending,
-                              bool exact)
+static void ram_state_pending_sync(bool exact, bool final)
   {
-    RAMState **temp = opaque;
-    RAMState *rs = *temp;
-    uint64_t remaining_size;
-
       /*
        * Sync is not needed either with: (1) a fast query, or (2) after
        * postcopy has started (no new dirty will generate anymore).
        */
-    if (exact && !migration_in_postcopy()) {
+    if (!exact || migration_in_postcopy()) {
+        return;
+    }
+
+    /* Final pending query is called with BQL locked */
+    if (!final) {
           bql_lock();
-        WITH_RCU_READ_LOCK_GUARD() {
-            migration_bitmap_sync_precopy(false);
-        }
+    }
+
+    WITH_RCU_READ_LOCK_GUARD() {
+        migration_bitmap_sync_precopy(final);
+    }
+
+    if (!final) {
           bql_unlock();
       }
+}
+
+static void ram_state_pending(void *opaque, MigPendingData *pending,
+                              bool exact, bool final)
+{
+    RAMState **temp = opaque;
+    RAMState *rs = *temp;
+    uint64_t remaining_size;

+    ram_state_pending_sync(exact, final);
       remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

       if (migrate_postcopy_ram()) {
diff --git a/migration/savevm.c b/migration/savevm.c
index 23adaf9dd9..22cb6a15c6 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1795,7 +1795,8 @@ int qemu_savevm_state_complete_precopy(MigrationState *s)
       return qemu_fflush(f);
   }

-void qemu_savevm_query_pending(MigPendingData *pending, bool exact)
+static void qemu_savevm_query_pending(MigPendingData *pending, bool exact,
+                                      bool final)
   {
       SaveStateEntry *se;

@@ -1808,7 +1809,7 @@ void qemu_savevm_query_pending(MigPendingData *pending, 
bool exact)
           if (!qemu_savevm_state_active(se)) {
               continue;
           }
-        se->ops->save_query_pending(se->opaque, pending, exact);
+        se->ops->save_query_pending(se->opaque, pending, exact, final);
       }

       pending->total_bytes = pending->precopy_bytes +
@@ -1817,16 +1818,30 @@ void qemu_savevm_query_pending(MigPendingData *pending, 
bool exact)
       /*
        * Update system remaining dirty bytes whenever QEMU queries.  It will
        * make the value to be not as accurate, but should still be pretty
-     * close to reality when this got invoked frequently while iterating.
+     * close to reality when this got invoked frequently while iterating.  
Don't
+     * update in final query as some modules may skip it if not needed.
IMHO this specialty isn't needed.  We can just make all modules report and
making sure we don't deadlock on bql.  AFAIU, CMMA never takes BQL, while
dirty-bitmap should conditionally take it now.
I didn't want to add unnecessary work for the critical downtime path, i.e.,
unnecessary exact queries.
If there's any real slow sync, then likely it needs to happen.. because of
the same reason why RAM do the sync, to make sure the data is latest in
QEMU migration core (say, bitmaps).  If it's a fast sync then it doesn't
matter.

Yes, but what I meant is that currently cmma and block dirty bitmaps don't issue a final slow sync during switchover, only RAM does.

But it does seem harmless to add this final sync for them.


query_pending() so far should achieve both functions: (1) synchronize dirty
info with whatever remote modules like KVM, enforced when exact=true, (2)
report.

VFIO is just special because VFIO dirty info accuracy never rely on
query_pending or whatever it collects, so it's purely a way to estimate for
size of dirty, hence (2) only.

I am not familiar with the other devices, but if you think it's negligible
for them then we can drop this specialty (in VFIO I'd still like to skip the
stopcopy size query; using the cached value is good enough IMHO).
Yes VFIO is fine but it'll be good to comment.  Let's try to drop them, I
want to avoid below "if (!final)" if possible.

Alright, will change that.

Thanks.


Thanks,

Thanks.

        */
-    mig_stats.dirty_bytes_total = pending->total_bytes;
-
-    trace_qemu_savevm_query_pending(exact, pending->precopy_bytes,
+    if (!final) {
+        mig_stats.dirty_bytes_total = pending->total_bytes;
+    }
[a]

Then we remove this too which is counter-intuitive..

The rest looks all reasonable.

Thanks,

+    trace_qemu_savevm_query_pending(exact, final, pending->precopy_bytes,
                                       pending->stopcopy_bytes,
                                       pending->postcopy_bytes,
                                       pending->total_bytes);
   }

+void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact)
+{
+    qemu_savevm_query_pending(pending, exact, false);
+}
+
+void qemu_savevm_query_pending_final(MigPendingData *pending)
+{
+    g_assert(bql_locked());
+
+    qemu_savevm_query_pending(pending, true, true);
+}
+
   void qemu_savevm_state_cleanup(void)
   {
       SaveStateEntry *se;
diff --git a/migration/trace-events b/migration/trace-events
index de99d976ab..1c9212d3e2 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -7,7 +7,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
   qemu_loadvm_state_post_main(int ret) "%d"
   qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t 
instance_id, uint32_t version_id) "%u(%s) %u %u"
   qemu_savevm_send_packaged(void) ""
-qemu_savevm_query_pending(bool exact, uint64_t precopy, uint64_t stopcopy, uint64_t postcopy, uint64_t total) 
"exact=%d, precopy=%"PRIu64", stopcopy=%"PRIu64", postcopy=%"PRIu64", 
total=%"PRIu64
+qemu_savevm_query_pending(bool exact, bool final, uint64_t precopy, uint64_t stopcopy, uint64_t postcopy, uint64_t 
total) "exact=%d, final=%d, precopy=%"PRIu64", stopcopy=%"PRIu64", 
postcopy=%"PRIu64", total=%"PRIu64
   loadvm_state_switchover_ack_needed(unsigned int switchover_ack_pending_num) 
"Switchover ack pending num=%u"
   loadvm_state_setup(void) ""
   loadvm_state_cleanup(void) ""
--
2.40.1

--
Peter Xu
--
Peter Xu

--
Peter Xu


Reply via email to