Re: [libvirt] [PATCH 04/10] qemu: Recover from interrupted migration

2011-07-26 Thread Eric Blake

On 07/22/2011 03:06 PM, Eric Blake wrote:


+ case QEMU_MIGRATION_PHASE_PERFORM2:
+ case QEMU_MIGRATION_PHASE_PERFORM3:
+ /* migration is still in progress, let's cancel it and resume the
+ * domain */
+ VIR_DEBUG("Canceling unfinished outgoing migration of domain %s",
+ vm->def->name);
+ /* TODO cancel possibly running migrate operation */


As in issue qemuMonitorMigrateCancel, but ignoring if it fails? Might be
reasonable, but probably as a separate patch.


Should have read the series first - you did this in patch 10/10.

--
Eric Blake   ebl...@redhat.com+1-801-349-2682
Libvirt virtualization library http://libvirt.org

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list


Re: [libvirt] [PATCH 04/10] qemu: Recover from interrupted migration

2011-07-22 Thread Eric Blake

On 07/18/2011 06:27 PM, Jiri Denemark wrote:

---
  src/qemu/qemu_process.c |  110 ++-
  1 files changed, 109 insertions(+), 1 deletions(-)




  static int
+qemuProcessRecoverMigration(struct qemud_driver *driver,
+virDomainObjPtr vm,
+virConnectPtr conn,
+enum qemuDomainAsyncJob job,
+enum qemuMigrationJobPhase phase,
+virDomainState state,
+int reason)
+{
+if (job == QEMU_ASYNC_JOB_MIGRATION_IN) {
+switch (phase) {
+case QEMU_MIGRATION_PHASE_NONE:
+case QEMU_MIGRATION_PHASE_PERFORM2:
+case QEMU_MIGRATION_PHASE_BEGIN3:


Should we reject as impossible the phases that should never be 
encountered on MIGRATION_IN?  For example, QEMU_MIGRATION_PHASE_BEGIN3 
belongs to MIGRATION_OUT, so if our job is MIGRATION_IN but we see that 
phase, we should probably fail rather than return 0.



+case QEMU_MIGRATION_PHASE_PERFORM2:
+case QEMU_MIGRATION_PHASE_PERFORM3:
+/* migration is still in progress, let's cancel it and resume the
+ * domain */
+VIR_DEBUG("Canceling unfinished outgoing migration of domain %s",
+  vm->def->name);
+/* TODO cancel possibly running migrate operation */


As in issue qemuMonitorMigrateCancel, but ignoring if it fails?  Might 
be reasonable, but probably as a separate patch.



+/* resume the domain but only if it was paused as a result of
+ * migration */
+if (state == VIR_DOMAIN_PAUSED&&
+(reason == VIR_DOMAIN_PAUSED_MIGRATION ||
+ reason == VIR_DOMAIN_PAUSED_UNKNOWN)) {
+if (qemuProcessStartCPUs(driver, vm, conn,
+ VIR_DOMAIN_RUNNING_UNPAUSED)<  0) {


On the other hand, will the monitor command to restart cpus even work if 
a pending migration is underway?  So we may have to do the 
qemuMonitorMigrateCancel no matter what, to ensure the monitor will let 
us resume.


I think what you have works (strict improvement over what we have now), 
even if it can be further improved with later patches, so:


ACK.

--
Eric Blake   ebl...@redhat.com+1-801-349-2682
Libvirt virtualization library http://libvirt.org

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list


[libvirt] [PATCH 04/10] qemu: Recover from interrupted migration

2011-07-18 Thread Jiri Denemark
---
 src/qemu/qemu_process.c |  110 ++-
 1 files changed, 109 insertions(+), 1 deletions(-)

diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 448b06e..48bd435 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -37,6 +37,7 @@
 #include "qemu_hostdev.h"
 #include "qemu_hotplug.h"
 #include "qemu_bridge_filter.h"
+#include "qemu_migration.h"
 
 #if HAVE_NUMACTL
 # define NUMA_VERSION1_COMPATIBILITY 1
@@ -2236,6 +2237,111 @@ qemuProcessUpdateState(struct qemud_driver *driver, 
virDomainObjPtr vm)
 }
 
 static int
+qemuProcessRecoverMigration(struct qemud_driver *driver,
+virDomainObjPtr vm,
+virConnectPtr conn,
+enum qemuDomainAsyncJob job,
+enum qemuMigrationJobPhase phase,
+virDomainState state,
+int reason)
+{
+if (job == QEMU_ASYNC_JOB_MIGRATION_IN) {
+switch (phase) {
+case QEMU_MIGRATION_PHASE_NONE:
+case QEMU_MIGRATION_PHASE_PERFORM2:
+case QEMU_MIGRATION_PHASE_BEGIN3:
+case QEMU_MIGRATION_PHASE_PERFORM3:
+case QEMU_MIGRATION_PHASE_PERFORM3_DONE:
+case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED:
+case QEMU_MIGRATION_PHASE_CONFIRM3:
+case QEMU_MIGRATION_PHASE_LAST:
+break;
+
+case QEMU_MIGRATION_PHASE_PREPARE:
+VIR_DEBUG("Killing unfinished incoming migration for domain %s",
+  vm->def->name);
+return -1;
+
+case QEMU_MIGRATION_PHASE_FINISH2:
+/* source domain is already killed so let's just resume the domain
+ * and hope we are all set */
+VIR_DEBUG("Incoming migration finished, resuming domain %s",
+  vm->def->name);
+if (qemuProcessStartCPUs(driver, vm, conn,
+ VIR_DOMAIN_RUNNING_UNPAUSED) < 0) {
+VIR_WARN("Could not resume domain %s", vm->def->name);
+}
+break;
+
+case QEMU_MIGRATION_PHASE_FINISH3:
+/* migration finished, we started resuming the domain but didn't
+ * confirm success or failure yet; killing it seems safest */
+VIR_DEBUG("Killing migrated domain %s", vm->def->name);
+return -1;
+}
+} else if (job == QEMU_ASYNC_JOB_MIGRATION_OUT) {
+switch (phase) {
+case QEMU_MIGRATION_PHASE_NONE:
+case QEMU_MIGRATION_PHASE_PREPARE:
+case QEMU_MIGRATION_PHASE_FINISH2:
+case QEMU_MIGRATION_PHASE_FINISH3:
+case QEMU_MIGRATION_PHASE_LAST:
+break;
+
+case QEMU_MIGRATION_PHASE_BEGIN3:
+/* nothing happen so far, just forget we were about to migrate the
+ * domain */
+break;
+
+case QEMU_MIGRATION_PHASE_PERFORM2:
+case QEMU_MIGRATION_PHASE_PERFORM3:
+/* migration is still in progress, let's cancel it and resume the
+ * domain */
+VIR_DEBUG("Canceling unfinished outgoing migration of domain %s",
+  vm->def->name);
+/* TODO cancel possibly running migrate operation */
+/* resume the domain but only if it was paused as a result of
+ * migration */
+if (state == VIR_DOMAIN_PAUSED &&
+(reason == VIR_DOMAIN_PAUSED_MIGRATION ||
+ reason == VIR_DOMAIN_PAUSED_UNKNOWN)) {
+if (qemuProcessStartCPUs(driver, vm, conn,
+ VIR_DOMAIN_RUNNING_UNPAUSED) < 0) {
+VIR_WARN("Could not resume domain %s", vm->def->name);
+}
+}
+break;
+
+case QEMU_MIGRATION_PHASE_PERFORM3_DONE:
+/* migration finished but we didn't have a chance to get the result
+ * of Finish3 step; third party needs to check what to do next
+ */
+break;
+
+case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED:
+/* Finish3 failed, we need to resume the domain */
+VIR_DEBUG("Resuming domain %s after failed migration",
+  vm->def->name);
+if (state == VIR_DOMAIN_PAUSED &&
+(reason == VIR_DOMAIN_PAUSED_MIGRATION ||
+ reason == VIR_DOMAIN_PAUSED_UNKNOWN)) {
+if (qemuProcessStartCPUs(driver, vm, conn,
+ VIR_DOMAIN_RUNNING_UNPAUSED) < 0) {
+VIR_WARN("Could not resume domain %s", vm->def->name);
+}
+}
+break;
+
+case QEMU_MIGRATION_PHASE_CONFIRM3:
+/* migration completed, we need to kill the domain here */
+return -1;
+}
+}
+
+return 0;
+}
+
+static int
 qemuProcessRecoverJob(struct qemud_driver *driver,