On Wed, Jun 03, 2026 at 04:05:49PM -0400, Peter Xu wrote:
> On Tue, Jun 02, 2026 at 12:26:10PM +0300, Avihai Horon wrote:
> > Before switchover, the source needs one last exact pending query so
> > modules can flush dirty state. This is currently done ad hoc in modules
> > handlers. For example, RAM syncs its dirty bitmap in its save_complete
> > handler.
> >
> > This should be a general concept relevant for any module, so extract it
> > to migration core instead by running a final save_query_pending before
> > switchover.
> >
> > The final query requires special handling by modules (e.g., it's called
> > with BQL locked, during VM stop), so extend save_query_pending
> > SaveVMHandlers callback and qemu_savevm_query_pending() with a "final"
> > flag so migration modules can tell the last pending query during
> > switchover from periodic iteration queries.
> >
> > Not all modules need the final query; skip it for those who don't need.
> >
> > Signed-off-by: Avihai Horon <[email protected]>
> > ---
> > include/migration/register.h | 41 ++++++++++++++++++----------------
> > migration/savevm.h | 3 ++-
> > hw/s390x/s390-stattrib.c | 9 ++++++--
> > hw/vfio/migration.c | 6 ++++-
> > migration/block-dirty-bitmap.c | 6 ++++-
> > migration/migration.c | 7 ++++--
> > migration/ram.c | 37 ++++++++++++++++++------------
> > migration/savevm.c | 27 +++++++++++++++++-----
> > migration/trace-events | 2 +-
> > 9 files changed, 91 insertions(+), 47 deletions(-)
> >
> > diff --git a/include/migration/register.h b/include/migration/register.h
> > index 5e5e0ee432..6f632123f1 100644
> > --- a/include/migration/register.h
> > +++ b/include/migration/register.h
> > @@ -171,6 +171,28 @@ typedef struct SaveVMHandlers {
> > */
> > bool (*is_active_iterate)(void *opaque);
> >
> > + /**
> > + * @save_query_pending
> > + *
> > + * This estimates the remaining data to transfer on the source side.
> > + *
> > + * When @exact is true, a module must report accurate results. When
> > + * @exact is false, a module may report estimates.
> > + *
> > + * It's highly recommended that modules implement a faster version of
> > + * the query path (for example, by proper caching on the counters) if
> > + * an accurate query will be time-consuming.
> > + *
> > + * @opaque: data pointer passed to register_savevm_live()
> > + * @pending: pointer to a MigPendingData struct
> > + * @exact: set to true for an accurate (slow) query
> > + * @final: set to true for the final query during switchover. When
> > final is
> > + * true, the query is called with BQL locked. Otherwise, it's called
> > with
> > + * BQL unlocked.
> > + */
> > + void (*save_query_pending)(void *opaque, MigPendingData *pending,
> > + bool exact, bool final);
> > +
> > /* This runs outside the BQL in the migration case, and
> > * within the lock in the savevm case. The callback had better only
> > * use data that is local to the migration thread or protected
> > @@ -210,25 +232,6 @@ typedef struct SaveVMHandlers {
> > */
> > bool (*save_postcopy_prepare)(QEMUFile *f, void *opaque, Error **errp);
> >
> > - /**
> > - * @save_query_pending
> > - *
> > - * This estimates the remaining data to transfer on the source side.
> > - *
> > - * When @exact is true, a module must report accurate results. When
> > - * @exact is false, a module may report estimates.
> > - *
> > - * It's highly recommended that modules implement a faster version of
> > - * the query path (for example, by proper caching on the counters) if
> > - * an accurate query will be time-consuming.
> > - *
> > - * @opaque: data pointer passed to register_savevm_live()
> > - * @pending: pointer to a MigPendingData struct
> > - * @exact: set to true for an accurate (slow) query
> > - */
> > - void (*save_query_pending)(void *opaque, MigPendingData *pending,
> > - bool exact);
> > -
> > /**
> > * @load_state
> > *
> > diff --git a/migration/savevm.h b/migration/savevm.h
> > index 96fdf96d4e..10b3d78a5f 100644
> > --- a/migration/savevm.h
> > +++ b/migration/savevm.h
> > @@ -45,7 +45,8 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
> > void qemu_savevm_state_cleanup(void);
> > void qemu_savevm_state_complete_postcopy(QEMUFile *f);
> > int qemu_savevm_state_complete_precopy(MigrationState *s);
> > -void qemu_savevm_query_pending(MigPendingData *pending, bool exact);
> > +void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact);
> > +void qemu_savevm_query_pending_final(MigPendingData *pending);
> > int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool
> > in_postcopy);
> > bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
> > void qemu_savevm_state_end(QEMUFile *f);
> > diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
> > index c334714b31..aed0c6844d 100644
> > --- a/hw/s390x/s390-stattrib.c
> > +++ b/hw/s390x/s390-stattrib.c
> > @@ -190,12 +190,17 @@ static int cmma_save_setup(QEMUFile *f, void *opaque,
> > Error **errp)
> > }
> >
> > static void cmma_state_pending(void *opaque, MigPendingData *pending,
> > - bool exact)
> > + bool exact, bool final)
> > {
> > S390StAttribState *sas = S390_STATTRIB(opaque);
> > S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
> > - long long res = sac->get_dirtycount(sas);
> > + long long res;
> >
> > + if (final) {
> > + return;
> > + }
> > +
> > + res = sac->get_dirtycount(sas);
> > if (res >= 0) {
> > pending->precopy_bytes += res;
> > }
> > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> > index fb12b9717f..95072d6664 100644
> > --- a/hw/vfio/migration.c
> > +++ b/hw/vfio/migration.c
> > @@ -622,12 +622,16 @@ static void vfio_state_pending_sync(VFIODevice
> > *vbasedev)
> > }
> >
> > static void vfio_state_pending(void *opaque, MigPendingData *pending,
> > - bool exact)
> > + bool exact, bool final)
> > {
> > VFIODevice *vbasedev = opaque;
> > VFIOMigration *migration = vbasedev->migration;
> > uint64_t precopy_size, stopcopy_size;
> >
> > + if (final) {
> > + return;
> > + }
> > +
> > if (exact) {
> > vfio_state_pending_sync(vbasedev);
> > }
> > diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
> > index 7ef3759e53..66ed91de03 100644
> > --- a/migration/block-dirty-bitmap.c
> > +++ b/migration/block-dirty-bitmap.c
> > @@ -767,12 +767,16 @@ static int dirty_bitmap_save_complete(QEMUFile *f,
> > void *opaque)
> > }
> >
> > static void dirty_bitmap_state_pending(void *opaque, MigPendingData *data,
> > - bool exact)
> > + bool exact, bool final)
> > {
> > DBMSaveState *s = &((DBMState *)opaque)->save;
> > SaveBitmapState *dbms;
> > uint64_t pending = 0;
> >
> > + if (final) {
> > + return;
> > + }
> > +
> > bql_lock();
> >
> > QSIMPLEQ_FOREACH(dbms, &s->dbms_list, entry) {
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 7488a94206..df8ed3a9fd 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -2787,12 +2787,15 @@ static bool
> > migration_switchover_prepare(MigrationState *s)
> > static bool migration_switchover_start(MigrationState *s, Error **errp)
> > {
> > ERRP_GUARD();
> > + MigPendingData pending = {};
> >
> > if (!migration_switchover_prepare(s)) {
> > error_setg(errp, "Switchover is interrupted");
> > return false;
> > }
> >
> > + qemu_savevm_query_pending_final(&pending);
>
> Not needed for postcopy, we can move it to migration_completion_precopy().
A better idea is.. we can drop migration_bitmap_sync() in
ram_postcopy_send_discard_bitmap() too, then we can keep this one.
But then, let's properly document this, both for precopy and postcopy
cases. One example:
/*
* The final query to the whole system on dirty data to make sure we
* collect the latest status of the VM. For precopy, source QEMU will
* dump all the dirty data during switchover. For postcopy, this will
* properly update all the dirty bitmaps to finally generate the
* correct discard bitmaps; see ram_postcopy_send_discard_bitmap().
*/
>
> I'd also appreciate a short comment explaining why the final query is
> needed.
>
> > +
> > /* Inactivate disks except in COLO */
> > if (!migrate_colo()) {
> > /*
> > @@ -3285,7 +3288,7 @@ static void
> > migration_iteration_go_next(MigPendingData *pending)
> > /*
> > * Do a slow sync first before boosting the iteration count.
> > */
> > - qemu_savevm_query_pending(pending, true);
> > + qemu_savevm_query_pending_iter(pending, true);
> >
> > /*
> > * Update the dirty information for the whole system for this
> > @@ -3336,7 +3339,7 @@ static MigIterateState
> > migration_iteration_run(MigrationState *s)
> > bool complete_ready;
> >
> > /* Fast path - get the estimated amount of pending data */
> > - qemu_savevm_query_pending(&pending, false);
> > + qemu_savevm_query_pending_iter(&pending, false);
> >
> > if (in_postcopy) {
> > /*
> > diff --git a/migration/ram.c b/migration/ram.c
> > index fc38ffbf8a..4e3f442593 100644
> > --- a/migration/ram.c
> > +++ b/migration/ram.c
> > @@ -3376,10 +3376,6 @@ static int ram_save_complete(QEMUFile *f, void
> > *opaque)
> > rs->last_stage = !migration_in_colo_state();
> >
> > WITH_RCU_READ_LOCK_GUARD() {
> > - if (!migration_in_postcopy()) {
> > - migration_bitmap_sync_precopy(true);
> > - }
> > -
> > ret = rdma_registration_start(f, RAM_CONTROL_FINISH);
> > if (ret < 0) {
> > qemu_file_set_error(f, ret);
> > @@ -3442,25 +3438,38 @@ static int ram_save_complete(QEMUFile *f, void
> > *opaque)
> > return qemu_fflush(f);
> > }
> >
> > -static void ram_state_pending(void *opaque, MigPendingData *pending,
> > - bool exact)
> > +static void ram_state_pending_sync(bool exact, bool final)
> > {
> > - RAMState **temp = opaque;
> > - RAMState *rs = *temp;
> > - uint64_t remaining_size;
> > -
> > /*
> > * Sync is not needed either with: (1) a fast query, or (2) after
> > * postcopy has started (no new dirty will generate anymore).
> > */
> > - if (exact && !migration_in_postcopy()) {
> > + if (!exact || migration_in_postcopy()) {
> > + return;
> > + }
> > +
> > + /* Final pending query is called with BQL locked */
> > + if (!final) {
> > bql_lock();
> > - WITH_RCU_READ_LOCK_GUARD() {
> > - migration_bitmap_sync_precopy(false);
> > - }
> > + }
> > +
> > + WITH_RCU_READ_LOCK_GUARD() {
> > + migration_bitmap_sync_precopy(final);
> > + }
> > +
> > + if (!final) {
> > bql_unlock();
> > }
> > +}
> > +
> > +static void ram_state_pending(void *opaque, MigPendingData *pending,
> > + bool exact, bool final)
> > +{
> > + RAMState **temp = opaque;
> > + RAMState *rs = *temp;
> > + uint64_t remaining_size;
> >
> > + ram_state_pending_sync(exact, final);
> > remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
> >
> > if (migrate_postcopy_ram()) {
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index 23adaf9dd9..22cb6a15c6 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -1795,7 +1795,8 @@ int qemu_savevm_state_complete_precopy(MigrationState
> > *s)
> > return qemu_fflush(f);
> > }
> >
> > -void qemu_savevm_query_pending(MigPendingData *pending, bool exact)
> > +static void qemu_savevm_query_pending(MigPendingData *pending, bool exact,
> > + bool final)
> > {
> > SaveStateEntry *se;
> >
> > @@ -1808,7 +1809,7 @@ void qemu_savevm_query_pending(MigPendingData
> > *pending, bool exact)
> > if (!qemu_savevm_state_active(se)) {
> > continue;
> > }
> > - se->ops->save_query_pending(se->opaque, pending, exact);
> > + se->ops->save_query_pending(se->opaque, pending, exact, final);
> > }
> >
> > pending->total_bytes = pending->precopy_bytes +
> > @@ -1817,16 +1818,30 @@ void qemu_savevm_query_pending(MigPendingData
> > *pending, bool exact)
> > /*
> > * Update system remaining dirty bytes whenever QEMU queries. It will
> > * make the value to be not as accurate, but should still be pretty
> > - * close to reality when this got invoked frequently while iterating.
> > + * close to reality when this got invoked frequently while iterating.
> > Don't
> > + * update in final query as some modules may skip it if not needed.
>
> IMHO this specialty isn't needed. We can just make all modules report and
> making sure we don't deadlock on bql. AFAIU, CMMA never takes BQL, while
> dirty-bitmap should conditionally take it now.
>
> > */
> > - mig_stats.dirty_bytes_total = pending->total_bytes;
> > -
> > - trace_qemu_savevm_query_pending(exact, pending->precopy_bytes,
> > + if (!final) {
> > + mig_stats.dirty_bytes_total = pending->total_bytes;
> > + }
>
> Then we remove this too which is counter-intuitive..
>
> The rest looks all reasonable.
>
> Thanks,
>
> > + trace_qemu_savevm_query_pending(exact, final, pending->precopy_bytes,
> > pending->stopcopy_bytes,
> > pending->postcopy_bytes,
> > pending->total_bytes);
> > }
> >
> > +void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact)
> > +{
> > + qemu_savevm_query_pending(pending, exact, false);
> > +}
> > +
> > +void qemu_savevm_query_pending_final(MigPendingData *pending)
> > +{
> > + g_assert(bql_locked());
> > +
> > + qemu_savevm_query_pending(pending, true, true);
> > +}
> > +
> > void qemu_savevm_state_cleanup(void)
> > {
> > SaveStateEntry *se;
> > diff --git a/migration/trace-events b/migration/trace-events
> > index de99d976ab..1c9212d3e2 100644
> > --- a/migration/trace-events
> > +++ b/migration/trace-events
> > @@ -7,7 +7,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
> > qemu_loadvm_state_post_main(int ret) "%d"
> > qemu_loadvm_state_section_startfull(uint32_t section_id, const char
> > *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
> > qemu_savevm_send_packaged(void) ""
> > -qemu_savevm_query_pending(bool exact, uint64_t precopy, uint64_t stopcopy,
> > uint64_t postcopy, uint64_t total) "exact=%d, precopy=%"PRIu64",
> > stopcopy=%"PRIu64", postcopy=%"PRIu64", total=%"PRIu64
> > +qemu_savevm_query_pending(bool exact, bool final, uint64_t precopy,
> > uint64_t stopcopy, uint64_t postcopy, uint64_t total) "exact=%d, final=%d,
> > precopy=%"PRIu64", stopcopy=%"PRIu64", postcopy=%"PRIu64", total=%"PRIu64
> > loadvm_state_switchover_ack_needed(unsigned int
> > switchover_ack_pending_num) "Switchover ack pending num=%u"
> > loadvm_state_setup(void) ""
> > loadvm_state_cleanup(void) ""
> > --
> > 2.40.1
> >
>
> --
> Peter Xu
--
Peter Xu