On Wed, Jun 03, 2026 at 04:05:49PM -0400, Peter Xu wrote:
> On Tue, Jun 02, 2026 at 12:26:10PM +0300, Avihai Horon wrote:
> > Before switchover, the source needs one last exact pending query so
> > modules can flush dirty state. This is currently done ad hoc in modules
> > handlers. For example, RAM syncs its dirty bitmap in its save_complete
> > handler.
> > 
> > This should be a general concept relevant for any module, so extract it
> > to migration core instead by running a final save_query_pending before
> > switchover.
> > 
> > The final query requires special handling by modules (e.g., it's called
> > with BQL locked, during VM stop), so extend save_query_pending
> > SaveVMHandlers callback and qemu_savevm_query_pending() with a "final"
> > flag so migration modules can tell the last pending query during
> > switchover from periodic iteration queries.
> > 
> > Not all modules need the final query; skip it for those who don't need.
> > 
> > Signed-off-by: Avihai Horon <[email protected]>
> > ---
> >  include/migration/register.h   | 41 ++++++++++++++++++----------------
> >  migration/savevm.h             |  3 ++-
> >  hw/s390x/s390-stattrib.c       |  9 ++++++--
> >  hw/vfio/migration.c            |  6 ++++-
> >  migration/block-dirty-bitmap.c |  6 ++++-
> >  migration/migration.c          |  7 ++++--
> >  migration/ram.c                | 37 ++++++++++++++++++------------
> >  migration/savevm.c             | 27 +++++++++++++++++-----
> >  migration/trace-events         |  2 +-
> >  9 files changed, 91 insertions(+), 47 deletions(-)
> > 
> > diff --git a/include/migration/register.h b/include/migration/register.h
> > index 5e5e0ee432..6f632123f1 100644
> > --- a/include/migration/register.h
> > +++ b/include/migration/register.h
> > @@ -171,6 +171,28 @@ typedef struct SaveVMHandlers {
> >       */
> >      bool (*is_active_iterate)(void *opaque);
> >  
> > +    /**
> > +     * @save_query_pending
> > +     *
> > +     * This estimates the remaining data to transfer on the source side.
> > +     *
> > +     * When @exact is true, a module must report accurate results.  When
> > +     * @exact is false, a module may report estimates.
> > +     *
> > +     * It's highly recommended that modules implement a faster version of
> > +     * the query path (for example, by proper caching on the counters) if
> > +     * an accurate query will be time-consuming.
> > +     *
> > +     * @opaque: data pointer passed to register_savevm_live()
> > +     * @pending: pointer to a MigPendingData struct
> > +     * @exact: set to true for an accurate (slow) query
> > +     * @final: set to true for the final query during switchover. When 
> > final is
> > +     * true, the query is called with BQL locked. Otherwise, it's called 
> > with
> > +     * BQL unlocked.
> > +     */
> > +    void (*save_query_pending)(void *opaque, MigPendingData *pending,
> > +                               bool exact, bool final);
> > +
> >      /* This runs outside the BQL in the migration case, and
> >       * within the lock in the savevm case.  The callback had better only
> >       * use data that is local to the migration thread or protected
> > @@ -210,25 +232,6 @@ typedef struct SaveVMHandlers {
> >       */
> >      bool (*save_postcopy_prepare)(QEMUFile *f, void *opaque, Error **errp);
> >  
> > -    /**
> > -     * @save_query_pending
> > -     *
> > -     * This estimates the remaining data to transfer on the source side.
> > -     *
> > -     * When @exact is true, a module must report accurate results.  When
> > -     * @exact is false, a module may report estimates.
> > -     *
> > -     * It's highly recommended that modules implement a faster version of
> > -     * the query path (for example, by proper caching on the counters) if
> > -     * an accurate query will be time-consuming.
> > -     *
> > -     * @opaque: data pointer passed to register_savevm_live()
> > -     * @pending: pointer to a MigPendingData struct
> > -     * @exact: set to true for an accurate (slow) query
> > -     */
> > -    void (*save_query_pending)(void *opaque, MigPendingData *pending,
> > -                               bool exact);
> > -
> >      /**
> >       * @load_state
> >       *
> > diff --git a/migration/savevm.h b/migration/savevm.h
> > index 96fdf96d4e..10b3d78a5f 100644
> > --- a/migration/savevm.h
> > +++ b/migration/savevm.h
> > @@ -45,7 +45,8 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
> >  void qemu_savevm_state_cleanup(void);
> >  void qemu_savevm_state_complete_postcopy(QEMUFile *f);
> >  int qemu_savevm_state_complete_precopy(MigrationState *s);
> > -void qemu_savevm_query_pending(MigPendingData *pending, bool exact);
> > +void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact);
> > +void qemu_savevm_query_pending_final(MigPendingData *pending);
> >  int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool 
> > in_postcopy);
> >  bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
> >  void qemu_savevm_state_end(QEMUFile *f);
> > diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
> > index c334714b31..aed0c6844d 100644
> > --- a/hw/s390x/s390-stattrib.c
> > +++ b/hw/s390x/s390-stattrib.c
> > @@ -190,12 +190,17 @@ static int cmma_save_setup(QEMUFile *f, void *opaque, 
> > Error **errp)
> >  }
> >  
> >  static void cmma_state_pending(void *opaque, MigPendingData *pending,
> > -                               bool exact)
> > +                               bool exact, bool final)
> >  {
> >      S390StAttribState *sas = S390_STATTRIB(opaque);
> >      S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
> > -    long long res = sac->get_dirtycount(sas);
> > +    long long res;
> >  
> > +    if (final) {
> > +        return;
> > +    }
> > +
> > +    res = sac->get_dirtycount(sas);
> >      if (res >= 0) {
> >          pending->precopy_bytes += res;
> >      }
> > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> > index fb12b9717f..95072d6664 100644
> > --- a/hw/vfio/migration.c
> > +++ b/hw/vfio/migration.c
> > @@ -622,12 +622,16 @@ static void vfio_state_pending_sync(VFIODevice 
> > *vbasedev)
> >  }
> >  
> >  static void vfio_state_pending(void *opaque, MigPendingData *pending,
> > -                               bool exact)
> > +                               bool exact, bool final)
> >  {
> >      VFIODevice *vbasedev = opaque;
> >      VFIOMigration *migration = vbasedev->migration;
> >      uint64_t precopy_size, stopcopy_size;
> >  
> > +    if (final) {
> > +        return;
> > +    }
> > +
> >      if (exact) {
> >          vfio_state_pending_sync(vbasedev);
> >      }
> > diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
> > index 7ef3759e53..66ed91de03 100644
> > --- a/migration/block-dirty-bitmap.c
> > +++ b/migration/block-dirty-bitmap.c
> > @@ -767,12 +767,16 @@ static int dirty_bitmap_save_complete(QEMUFile *f, 
> > void *opaque)
> >  }
> >  
> >  static void dirty_bitmap_state_pending(void *opaque, MigPendingData *data,
> > -                                       bool exact)
> > +                                       bool exact, bool final)
> >  {
> >      DBMSaveState *s = &((DBMState *)opaque)->save;
> >      SaveBitmapState *dbms;
> >      uint64_t pending = 0;
> >  
> > +    if (final) {
> > +        return;
> > +    }
> > +
> >      bql_lock();
> >  
> >      QSIMPLEQ_FOREACH(dbms, &s->dbms_list, entry) {
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 7488a94206..df8ed3a9fd 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -2787,12 +2787,15 @@ static bool 
> > migration_switchover_prepare(MigrationState *s)
> >  static bool migration_switchover_start(MigrationState *s, Error **errp)
> >  {
> >      ERRP_GUARD();
> > +    MigPendingData pending = {};
> >  
> >      if (!migration_switchover_prepare(s)) {
> >          error_setg(errp, "Switchover is interrupted");
> >          return false;
> >      }
> >  
> > +    qemu_savevm_query_pending_final(&pending);
> 
> Not needed for postcopy, we can move it to migration_completion_precopy().

A better idea is.. we can drop migration_bitmap_sync() in
ram_postcopy_send_discard_bitmap() too, then we can keep this one.

But then, let's properly document this, both for precopy and postcopy
cases.  One example:

    /*
     * The final query to the whole system on dirty data to make sure we
     * collect the latest status of the VM.  For precopy, source QEMU will
     * dump all the dirty data during switchover.  For postcopy, this will
     * properly update all the dirty bitmaps to finally generate the
     * correct discard bitmaps; see ram_postcopy_send_discard_bitmap().
     */

> 
> I'd also appreciate a short comment explaining why the final query is
> needed.
> 
> > +
> >      /* Inactivate disks except in COLO */
> >      if (!migrate_colo()) {
> >          /*
> > @@ -3285,7 +3288,7 @@ static void 
> > migration_iteration_go_next(MigPendingData *pending)
> >      /*
> >       * Do a slow sync first before boosting the iteration count.
> >       */
> > -    qemu_savevm_query_pending(pending, true);
> > +    qemu_savevm_query_pending_iter(pending, true);
> >  
> >      /*
> >       * Update the dirty information for the whole system for this
> > @@ -3336,7 +3339,7 @@ static MigIterateState 
> > migration_iteration_run(MigrationState *s)
> >      bool complete_ready;
> >  
> >      /* Fast path - get the estimated amount of pending data */
> > -    qemu_savevm_query_pending(&pending, false);
> > +    qemu_savevm_query_pending_iter(&pending, false);
> >  
> >      if (in_postcopy) {
> >          /*
> > diff --git a/migration/ram.c b/migration/ram.c
> > index fc38ffbf8a..4e3f442593 100644
> > --- a/migration/ram.c
> > +++ b/migration/ram.c
> > @@ -3376,10 +3376,6 @@ static int ram_save_complete(QEMUFile *f, void 
> > *opaque)
> >      rs->last_stage = !migration_in_colo_state();
> >  
> >      WITH_RCU_READ_LOCK_GUARD() {
> > -        if (!migration_in_postcopy()) {
> > -            migration_bitmap_sync_precopy(true);
> > -        }
> > -
> >          ret = rdma_registration_start(f, RAM_CONTROL_FINISH);
> >          if (ret < 0) {
> >              qemu_file_set_error(f, ret);
> > @@ -3442,25 +3438,38 @@ static int ram_save_complete(QEMUFile *f, void 
> > *opaque)
> >      return qemu_fflush(f);
> >  }
> >  
> > -static void ram_state_pending(void *opaque, MigPendingData *pending,
> > -                              bool exact)
> > +static void ram_state_pending_sync(bool exact, bool final)
> >  {
> > -    RAMState **temp = opaque;
> > -    RAMState *rs = *temp;
> > -    uint64_t remaining_size;
> > -
> >      /*
> >       * Sync is not needed either with: (1) a fast query, or (2) after
> >       * postcopy has started (no new dirty will generate anymore).
> >       */
> > -    if (exact && !migration_in_postcopy()) {
> > +    if (!exact || migration_in_postcopy()) {
> > +        return;
> > +    }
> > +
> > +    /* Final pending query is called with BQL locked */
> > +    if (!final) {
> >          bql_lock();
> > -        WITH_RCU_READ_LOCK_GUARD() {
> > -            migration_bitmap_sync_precopy(false);
> > -        }
> > +    }
> > +
> > +    WITH_RCU_READ_LOCK_GUARD() {
> > +        migration_bitmap_sync_precopy(final);
> > +    }
> > +
> > +    if (!final) {
> >          bql_unlock();
> >      }
> > +}
> > +
> > +static void ram_state_pending(void *opaque, MigPendingData *pending,
> > +                              bool exact, bool final)
> > +{
> > +    RAMState **temp = opaque;
> > +    RAMState *rs = *temp;
> > +    uint64_t remaining_size;
> >  
> > +    ram_state_pending_sync(exact, final);
> >      remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
> >  
> >      if (migrate_postcopy_ram()) {
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index 23adaf9dd9..22cb6a15c6 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -1795,7 +1795,8 @@ int qemu_savevm_state_complete_precopy(MigrationState 
> > *s)
> >      return qemu_fflush(f);
> >  }
> >  
> > -void qemu_savevm_query_pending(MigPendingData *pending, bool exact)
> > +static void qemu_savevm_query_pending(MigPendingData *pending, bool exact,
> > +                                      bool final)
> >  {
> >      SaveStateEntry *se;
> >  
> > @@ -1808,7 +1809,7 @@ void qemu_savevm_query_pending(MigPendingData 
> > *pending, bool exact)
> >          if (!qemu_savevm_state_active(se)) {
> >              continue;
> >          }
> > -        se->ops->save_query_pending(se->opaque, pending, exact);
> > +        se->ops->save_query_pending(se->opaque, pending, exact, final);
> >      }
> >  
> >      pending->total_bytes = pending->precopy_bytes +
> > @@ -1817,16 +1818,30 @@ void qemu_savevm_query_pending(MigPendingData 
> > *pending, bool exact)
> >      /*
> >       * Update system remaining dirty bytes whenever QEMU queries.  It will
> >       * make the value to be not as accurate, but should still be pretty
> > -     * close to reality when this got invoked frequently while iterating.
> > +     * close to reality when this got invoked frequently while iterating.  
> > Don't
> > +     * update in final query as some modules may skip it if not needed.
> 
> IMHO this specialty isn't needed.  We can just make all modules report and
> making sure we don't deadlock on bql.  AFAIU, CMMA never takes BQL, while
> dirty-bitmap should conditionally take it now.
> 
> >       */
> > -    mig_stats.dirty_bytes_total = pending->total_bytes;
> > -
> > -    trace_qemu_savevm_query_pending(exact, pending->precopy_bytes,
> > +    if (!final) {
> > +        mig_stats.dirty_bytes_total = pending->total_bytes;
> > +    }
> 
> Then we remove this too which is counter-intuitive..
> 
> The rest looks all reasonable.
> 
> Thanks,
> 
> > +    trace_qemu_savevm_query_pending(exact, final, pending->precopy_bytes,
> >                                      pending->stopcopy_bytes,
> >                                      pending->postcopy_bytes,
> >                                      pending->total_bytes);
> >  }
> >  
> > +void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact)
> > +{
> > +    qemu_savevm_query_pending(pending, exact, false);
> > +}
> > +
> > +void qemu_savevm_query_pending_final(MigPendingData *pending)
> > +{
> > +    g_assert(bql_locked());
> > +
> > +    qemu_savevm_query_pending(pending, true, true);
> > +}
> > +
> >  void qemu_savevm_state_cleanup(void)
> >  {
> >      SaveStateEntry *se;
> > diff --git a/migration/trace-events b/migration/trace-events
> > index de99d976ab..1c9212d3e2 100644
> > --- a/migration/trace-events
> > +++ b/migration/trace-events
> > @@ -7,7 +7,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
> >  qemu_loadvm_state_post_main(int ret) "%d"
> >  qemu_loadvm_state_section_startfull(uint32_t section_id, const char 
> > *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
> >  qemu_savevm_send_packaged(void) ""
> > -qemu_savevm_query_pending(bool exact, uint64_t precopy, uint64_t stopcopy, 
> > uint64_t postcopy, uint64_t total) "exact=%d, precopy=%"PRIu64", 
> > stopcopy=%"PRIu64", postcopy=%"PRIu64", total=%"PRIu64
> > +qemu_savevm_query_pending(bool exact, bool final, uint64_t precopy, 
> > uint64_t stopcopy, uint64_t postcopy, uint64_t total) "exact=%d, final=%d, 
> > precopy=%"PRIu64", stopcopy=%"PRIu64", postcopy=%"PRIu64", total=%"PRIu64
> >  loadvm_state_switchover_ack_needed(unsigned int 
> > switchover_ack_pending_num) "Switchover ack pending num=%u"
> >  loadvm_state_setup(void) ""
> >  loadvm_state_cleanup(void) ""
> > -- 
> > 2.40.1
> > 
> 
> -- 
> Peter Xu

-- 
Peter Xu


Reply via email to