From e6704af1fcc39a8daabf97e78378c3c45f99515d Mon Sep 17 00:00:00 2001
From: Jiri Denemark
Date: Mon, 4 Jul 2011 23:33:39 +0200
Subject: [PATCH] qemu: Recover from interrupted jobs

Detect and react on situations when libvirtd was restarted or killed
when a job was active.
---
 src/qemu/qemu_domain.c  | 24 +++++++++++
 src/qemu/qemu_domain.h  |  2 +
 src/qemu/qemu_process.c | 96 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 122 insertions(+)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 062ecc776f..b26308e2ae 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -142,6 +142,30 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv)
     memset(&job->signalsData, 0, sizeof(job->signalsData));
 }
 
+/**
+ * qemuDomainObjRestoreJob:
+ * @obj: domain object whose recorded job state is taken over
+ * @job: caller-provided structure; zeroed, then filled with the active
+ *       and asyncJob values found in @obj's private data
+ *
+ * Copies the recorded job state to @job and then calls
+ * qemuDomainObjResetJob() and qemuDomainObjResetAsyncJob() on the
+ * private data of @obj.
+ */
+void
+qemuDomainObjRestoreJob(virDomainObjPtr obj,
+                        struct qemuDomainJobObj *job)
+{
+    qemuDomainObjPrivatePtr priv = obj->privateData;
+
+    memset(job, 0, sizeof(*job));
+    job->active = priv->job.active;
+    job->asyncJob = priv->job.asyncJob;
+
+    qemuDomainObjResetJob(priv);
+    qemuDomainObjResetAsyncJob(priv);
+}
+
 static void
 qemuDomainObjFreeJob(qemuDomainObjPrivatePtr priv)
 {
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 8766393baf..d721e247ef 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -184,6 +184,8 @@ void qemuDomainObjEndNestedJob(struct qemud_driver *driver,
 void qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj);
 void qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj,
                                   unsigned long long allowedJobs);
+void qemuDomainObjRestoreJob(virDomainObjPtr obj,
+                             struct qemuDomainJobObj *job);
 void qemuDomainObjDiscardAsyncJob(struct qemud_driver *driver,
                                   virDomainObjPtr obj);
 
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index e2e1388124..52a73b8377 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -2231,6 +2231,96 @@ qemuProcessUpdateState(struct qemud_driver *driver, virDomainObjPtr vm)
     return 0;
 }
 
+/**
+ * qemuProcessRecoverJob:
+ * @driver: qemu driver, passed on to qemuProcessStartCPUs()
+ * @vm: domain object being reconnected
+ * @conn: connection, passed on to qemuProcessStartCPUs()
+ * @job: job state that was recorded for @vm before it got reset
+ *
+ * Reacts to a job which was still recorded for @vm.  A paused domain whose
+ * pause reason matches the interrupted save or dump job (or is unknown) is
+ * resumed; a failure to resume it is only logged as a warning.
+ *
+ * Returns 0 on success, -1 if the domain is not active or the recorded
+ * job was QEMU_JOB_DESTROY.
+ */
+static int
+qemuProcessRecoverJob(struct qemud_driver *driver,
+                      virDomainObjPtr vm,
+                      virConnectPtr conn,
+                      const struct qemuDomainJobObj *job)
+{
+    virDomainState state;
+    int reason;
+
+    state = virDomainObjGetState(vm, &reason);
+
+    switch (job->asyncJob) {
+    case QEMU_ASYNC_JOB_MIGRATION_OUT:
+    case QEMU_ASYNC_JOB_MIGRATION_IN:
+        /* we don't know what to do yet */
+        break;
+
+    case QEMU_ASYNC_JOB_SAVE:
+    case QEMU_ASYNC_JOB_DUMP:
+        /* TODO cancel possibly running migrate operation */
+        /* resume the domain but only if it was paused as a result of
+         * running save/dump operation (or for an unknown reason) */
+        if (state == VIR_DOMAIN_PAUSED &&
+            ((job->asyncJob == QEMU_ASYNC_JOB_DUMP &&
+              reason == VIR_DOMAIN_PAUSED_DUMP) ||
+             (job->asyncJob == QEMU_ASYNC_JOB_SAVE &&
+              reason == VIR_DOMAIN_PAUSED_SAVE) ||
+             reason == VIR_DOMAIN_PAUSED_UNKNOWN)) {
+            if (qemuProcessStartCPUs(driver, vm, conn,
+                                     VIR_DOMAIN_RUNNING_UNPAUSED) < 0) {
+                VIR_WARN("Could not resume domain %s after interrupted job",
+                         vm->def->name);
+            }
+        }
+        break;
+
+    case QEMU_ASYNC_JOB_NONE:
+    case QEMU_ASYNC_JOB_LAST:
+        break;
+    }
+
+    if (!virDomainObjIsActive(vm))
+        return -1;
+
+    switch (job->active) {
+    case QEMU_JOB_QUERY:
+        /* harmless */
+        break;
+
+    case QEMU_JOB_DESTROY:
+        VIR_DEBUG("Domain %s should have already been destroyed",
+                  vm->def->name);
+        return -1;
+
+    case QEMU_JOB_SUSPEND:
+        /* mostly harmless */
+        break;
+
+    case QEMU_JOB_MODIFY:
+        /* XXX depending on the command we may be in an inconsistent state and
+         * we should probably fall back to "monitor error" state and refuse to
+         * operate on the domain any further
+         */
+        break;
+
+    case QEMU_JOB_ASYNC:
+    case QEMU_JOB_ASYNC_NESTED:
+        /* async job was already handled above */
+    case QEMU_JOB_NONE:
+    case QEMU_JOB_LAST:
+        break;
+    }
+
+    return 0;
+}
+
 struct qemuProcessReconnectData {
     virConnectPtr conn;
     struct qemud_driver *driver;
@@ -2247,9 +2337,12 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
     struct qemud_driver *driver = data->driver;
     qemuDomainObjPrivatePtr priv;
     virConnectPtr conn = data->conn;
+    struct qemuDomainJobObj oldjob;
 
     virDomainObjLock(obj);
 
+    qemuDomainObjRestoreJob(obj, &oldjob);
+
     VIR_DEBUG("Reconnect monitor to %p '%s'", obj, obj->def->name);
 
     priv = obj->privateData;
@@ -2295,6 +2388,9 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
     if (qemuProcessFiltersInstantiate(conn, obj->def))
         goto error;
 
+    if (qemuProcessRecoverJob(driver, obj, conn, &oldjob) < 0)
+        goto error;
+
     priv->job.active = QEMU_JOB_NONE;
 
     /* update domain state XML with possibly updated state in virDomainObj */