Re: [libvirt] [PATCH 06/19] qemu: Recover from interrupted jobs

2011-07-11 Thread Daniel P. Berrange
On Fri, Jul 08, 2011 at 01:34:11AM +0200, Jiri Denemark wrote:
> Detect and react to situations where libvirtd was restarted or killed
> while a job was active.
> ---
>  src/qemu/qemu_domain.c  |   14 +++++++
>  src/qemu/qemu_domain.h  |    2 +
>  src/qemu/qemu_process.c |   80 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 96 insertions(+), 0 deletions(-)

ACK

Daniel
-- 
|: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org              -o-             http://virt-manager.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|



[libvirt] [PATCH 06/19] qemu: Recover from interrupted jobs

2011-07-07 Thread Jiri Denemark
Detect and react to situations where libvirtd was restarted or killed
while a job was active.
---
 src/qemu/qemu_domain.c  |   14 +++++++
 src/qemu/qemu_domain.h  |    2 +
 src/qemu/qemu_process.c |   80 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 0 deletions(-)
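
The policy the patch implements is easiest to see in isolation: after a
libvirtd restart, a guest is resumed only if it is currently paused and the
pause was caused by the interrupted save/dump job itself (or the pause
reason is unknown). Below is a minimal standalone sketch of that decision;
the enums and should_resume() are simplified stand-ins for the libvirt
types used in the diff, not the real API, and the VIR_DOMAIN_PAUSED state
check is folded into the reason argument here.

#include <stdio.h>

/* Simplified stand-ins for libvirt's async job and paused-reason values
 * (hypothetical, for illustration only). */
enum async_job { JOB_NONE, JOB_MIGRATION, JOB_SAVE, JOB_DUMP };
enum pause_reason { PAUSED_USER, PAUSED_SAVE, PAUSED_DUMP, PAUSED_UNKNOWN };

/* Return 1 if a paused domain should be resumed after libvirtd comes back
 * up while 'job' was active; mirrors the save/dump branch of
 * qemuProcessRecoverJob() in the diff below. */
static int
should_resume(enum async_job job, enum pause_reason reason)
{
    if (job != JOB_SAVE && job != JOB_DUMP)
        return 0; /* only save/dump pause the guest on libvirt's behalf */
    return (job == JOB_SAVE && reason == PAUSED_SAVE) ||
           (job == JOB_DUMP && reason == PAUSED_DUMP) ||
           reason == PAUSED_UNKNOWN;
}

int
main(void)
{
    printf("save, paused for save:  %d\n", should_resume(JOB_SAVE, PAUSED_SAVE));
    printf("save, paused by user:   %d\n", should_resume(JOB_SAVE, PAUSED_USER));
    printf("dump, reason unknown:   %d\n", should_resume(JOB_DUMP, PAUSED_UNKNOWN));
    return 0;
}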

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 062ecc7..b26308e 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -142,6 +142,20 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv)
     memset(&job->signalsData, 0, sizeof(job->signalsData));
 }
 
+void
+qemuDomainObjRestoreJob(virDomainObjPtr obj,
+                        struct qemuDomainJobObj *job)
+{
+    qemuDomainObjPrivatePtr priv = obj->privateData;
+
+    memset(job, 0, sizeof(*job));
+    job->active = priv->job.active;
+    job->asyncJob = priv->job.asyncJob;
+
+    qemuDomainObjResetJob(priv);
+    qemuDomainObjResetAsyncJob(priv);
+}
+
 static void
 qemuDomainObjFreeJob(qemuDomainObjPrivatePtr priv)
 {
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 17d1356..49be3d2 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -177,6 +177,8 @@ void qemuDomainObjEndNestedJob(struct qemud_driver *driver,
 void qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj);
 void qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj,
                                   unsigned long long allowedJobs);
+void qemuDomainObjRestoreJob(virDomainObjPtr obj,
+                             struct qemuDomainJobObj *job);
 void qemuDomainObjDiscardAsyncJob(struct qemud_driver *driver,
                                   virDomainObjPtr obj);
 
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 3ffde51..49625b5 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -2223,6 +2223,80 @@ qemuProcessUpdateState(struct qemud_driver *driver, virDomainObjPtr vm)
     return 0;
 }
 
+static int
+qemuProcessRecoverJob(struct qemud_driver *driver,
+                      virDomainObjPtr vm,
+                      virConnectPtr conn,
+                      const struct qemuDomainJobObj *job)
+{
+    virDomainState state;
+    int reason;
+
+    state = virDomainObjGetState(vm, &reason);
+
+    switch (job->asyncJob) {
+    case QEMU_ASYNC_JOB_MIGRATION_OUT:
+    case QEMU_ASYNC_JOB_MIGRATION_IN:
+        /* we don't know what to do yet */
+        break;
+
+    case QEMU_ASYNC_JOB_SAVE:
+    case QEMU_ASYNC_JOB_DUMP:
+        /* TODO cancel possibly running migrate operation */
+        /* resume the domain but only if it was paused as a result of
+         * running save/dump operation */
+        if (state == VIR_DOMAIN_PAUSED &&
+            ((job->asyncJob == QEMU_ASYNC_JOB_DUMP &&
+              reason == VIR_DOMAIN_PAUSED_DUMP) ||
+             (job->asyncJob == QEMU_ASYNC_JOB_SAVE &&
+              reason == VIR_DOMAIN_PAUSED_SAVE) ||
+             reason == VIR_DOMAIN_PAUSED_UNKNOWN)) {
+            if (qemuProcessStartCPUs(driver, vm, conn,
+                                     VIR_DOMAIN_RUNNING_UNPAUSED) < 0) {
+                VIR_WARN("Could not resume domain %s after", vm->def->name);
+            }
+        }
+        break;
+
+    case QEMU_ASYNC_JOB_NONE:
+    case QEMU_ASYNC_JOB_LAST:
+        break;
+    }
+
+    if (!virDomainObjIsActive(vm))
+        return -1;
+
+    switch (job->active) {
+    case QEMU_JOB_QUERY:
+        /* harmless */
+        break;
+
+    case QEMU_JOB_DESTROY:
+        VIR_DEBUG("Domain %s should have already been destroyed",
+                  vm->def->name);
+        return -1;
+
+    case QEMU_JOB_SUSPEND:
+        /* mostly harmless */
+        break;
+
+    case QEMU_JOB_MODIFY:
+        /* XXX depending on the command we may be in an inconsistent state and
+         * we should probably fall back to "monitor error" state and refuse to
+         */
+        break;
+
+    case QEMU_JOB_ASYNC:
+    case QEMU_JOB_ASYNC_NESTED:
+        /* async job was already handled above */
+    case QEMU_JOB_NONE:
+    case QEMU_JOB_LAST:
+        break;
+    }
+
+    return 0;
+}
+
 struct qemuProcessReconnectData {
     virConnectPtr conn;
     struct qemud_driver *driver;
@@ -2239,9 +2313,12 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
     struct qemud_driver *driver = data->driver;
     qemuDomainObjPrivatePtr priv;
     virConnectPtr conn = data->conn;
+    struct qemuDomainJobObj oldjob;
 
     virDomainObjLock(obj);
 
+    qemuDomainObjRestoreJob(obj, &oldjob);
+
     VIR_DEBUG("Reconnect monitor to %p '%s'", obj, obj->def->name);
 
     priv = obj->privateData;
@@ -2287,6 +2364,9 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
     if (qemuProcessFiltersInstantiate(conn, obj->def))
         goto error;
 
+    if (qemuProcessRecoverJob(driver, obj, conn, &oldjob) < 0)
+        goto error;
+
     priv->job.active = QEMU_JOB_NONE;
 
     /* update domain state XML with possibly updated state in virDomainObj */
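
To summarize the hand-off the reconnect path performs: qemuDomainObjRestoreJob()
snapshots the interrupted job into a caller-owned copy and clears the live
job state, and qemuProcessRecoverJob() later acts on that copy once the
monitor is reconnected. A standalone sketch of that snapshot-and-reset step,
using a hypothetical job_state struct rather than libvirt's qemuDomainJobObj:

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for qemuDomainJobObj: only the two fields the
 * recovery path reads. */
struct job_state {
    int active;    /* sync job in progress, 0 = none */
    int asyncJob;  /* async job in progress, 0 = none */
};

/* Same idea as qemuDomainObjRestoreJob(): copy the interrupted job out
 * for later recovery, then reset the live state so normal job tracking
 * can start fresh. */
static void
restore_job(struct job_state *live, struct job_state *saved)
{
    memset(saved, 0, sizeof(*saved));
    saved->active = live->active;
    saved->asyncJob = live->asyncJob;
    memset(live, 0, sizeof(*live)); /* stands in for the two Reset calls */
}

int
main(void)
{
    struct job_state live = { 1 /* e.g. async */, 2 /* e.g. save */ };
    struct job_state oldjob;

    restore_job(&live, &oldjob);   /* reconnect does this before anything else */
    printf("to recover: active=%d asyncJob=%d; live cleared to %d/%d\n",
           oldjob.active, oldjob.asyncJob, live.active, live.asyncJob);
    return 0;
}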