On Tue, May 10, 2022 at 17:21:08 +0200, Jiri Denemark wrote:
> The check can reveal a serious bug in our migration code and we should
> not silently ignore it.
>
> Signed-off-by: Jiri Denemark <[email protected]>
> ---
> src/qemu/qemu_migration.c | 58 ++++++++++++++++++++++++---------------
> 1 file changed, 36 insertions(+), 22 deletions(-)
>
> diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
> index 5b6073b963..1c5dd9b391 100644
> --- a/src/qemu/qemu_migration.c
> +++ b/src/qemu/qemu_migration.c
> @@ -147,9 +147,10 @@ qemuMigrationCheckPhase(virDomainObj *vm,
>
> if (phase < QEMU_MIGRATION_PHASE_POSTCOPY_FAILED &&
> phase < priv->job.phase) {
> - VIR_ERROR(_("migration protocol going backwards %s => %s"),
> - qemuMigrationJobPhaseTypeToString(priv->job.phase),
> - qemuMigrationJobPhaseTypeToString(phase));
> + virReportError(VIR_ERR_INTERNAL_ERROR,
> + _("migration protocol going backwards %s => %s"),
> + qemuMigrationJobPhaseTypeToString(priv->job.phase),
> + qemuMigrationJobPhaseTypeToString(phase));
This bit seems to belong to the previous commit actually, but since
it's not used anywhere else ...
> return -1;
> }
>
[...]
> @@ -4920,7 +4928,7 @@ qemuMigrationSrcPerformPeer2Peer2(virQEMUDriver *driver,
> * until the migration is complete.
> */
> VIR_DEBUG("Perform %p", sconn);
> - qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_PERFORM2);
> + ignore_value(qemuMigrationJobSetPhase(vm,
> QEMU_MIGRATION_PHASE_PERFORM2));
> if (flags & VIR_MIGRATE_TUNNELLED)
> ret = qemuMigrationSrcPerformTunnel(driver, vm, st, NULL,
> NULL, 0, NULL, NULL,
> @@ -5164,7 +5172,7 @@ qemuMigrationSrcPerformPeer2Peer3(virQEMUDriver *driver,
> * confirm migration completion.
> */
> VIR_DEBUG("Perform3 %p uri=%s", sconn, NULLSTR(uri));
> - qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_PERFORM3);
> + ignore_value(qemuMigrationJobSetPhase(vm,
> QEMU_MIGRATION_PHASE_PERFORM3));
> VIR_FREE(cookiein);
> cookiein = g_steal_pointer(&cookieout);
> cookieinlen = cookieoutlen;
Any reason why you want to ignore this before the migration was
performed?
> @@ -5189,7 +5197,7 @@ qemuMigrationSrcPerformPeer2Peer3(virQEMUDriver *driver,
> if (ret < 0) {
> virErrorPreserveLast(&orig_err);
> } else {
> - qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_PERFORM3_DONE);
> + ignore_value(qemuMigrationJobSetPhase(vm,
> QEMU_MIGRATION_PHASE_PERFORM3_DONE));
> }
>
> /* If Perform returns < 0, then we need to cancel the VM
I could somewhat understand it here after the migration is done, but a
bug could also be in this code.
> @@ -5657,7 +5667,9 @@ qemuMigrationSrcPerformPhase(virQEMUDriver *driver,
> return ret;
> }
>
> - qemuMigrationJobStartPhase(vm, QEMU_MIGRATION_PHASE_PERFORM3);
> + if (qemuMigrationJobStartPhase(vm, QEMU_MIGRATION_PHASE_PERFORM3) < 0)
> + goto endjob;
> +
> virCloseCallbacksUnset(driver->closeCallbacks, vm,
> qemuMigrationSrcCleanup);
>
> @@ -5671,7 +5683,7 @@ qemuMigrationSrcPerformPhase(virQEMUDriver *driver,
> goto endjob;
> }
>
> - qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_PERFORM3_DONE);
> + ignore_value(qemuMigrationJobSetPhase(vm,
> QEMU_MIGRATION_PHASE_PERFORM3_DONE));
>
> if (virCloseCallbacksSet(driver->closeCallbacks, vm, conn,
> qemuMigrationSrcCleanup) < 0)
Same here.