Signed-off-by: Vineeth Pillai <virem...@linux.microsoft.com> Signed-off-by: Praveen K Paladugu <pra...@linux.microsoft.com> --- po/POTFILES.in | 1 + src/ch/ch_cgroup.c | 457 ++++++++++++++++++++++++++++++++++++++++++++ src/ch/ch_cgroup.h | 45 +++++ src/ch/ch_conf.c | 2 + src/ch/ch_conf.h | 4 +- src/ch/ch_domain.c | 33 ++++ src/ch/ch_domain.h | 3 +- src/ch/ch_monitor.c | 125 ++++++++++-- src/ch/ch_monitor.h | 54 +++++- src/ch/ch_process.c | 288 +++++++++++++++++++++++++++- src/ch/ch_process.h | 3 + src/ch/meson.build | 2 + 12 files changed, 991 insertions(+), 26 deletions(-) create mode 100644 src/ch/ch_cgroup.c create mode 100644 src/ch/ch_cgroup.h
diff --git a/po/POTFILES.in b/po/POTFILES.in index b554cf08ca..3a8db501bc 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -19,6 +19,7 @@ @SRCDIR@src/bhyve/bhyve_parse_command.c @SRCDIR@src/bhyve/bhyve_process.c @SRCDIR@src/ch/ch_conf.c +@SRCDIR@src/ch/ch_cgroup.c @SRCDIR@src/ch/ch_domain.c @SRCDIR@src/ch/ch_driver.c @SRCDIR@src/ch/ch_monitor.c diff --git a/src/ch/ch_cgroup.c b/src/ch/ch_cgroup.c new file mode 100644 index 0000000000..6be2184cf1 --- /dev/null +++ b/src/ch/ch_cgroup.c @@ -0,0 +1,457 @@ +/* + * ch_cgroup.c: CH cgroup management + * + * Copyright Microsoft Corp. 2020-2021 + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. 
+ */ + +#include <config.h> + +#include "ch_cgroup.h" +#include "ch_domain.h" +#include "ch_process.h" +#include "vircgroup.h" +#include "virlog.h" +#include "viralloc.h" +#include "virerror.h" +#include "domain_audit.h" +#include "domain_cgroup.h" +#include "virscsi.h" +#include "virstring.h" +#include "virfile.h" +#include "virtypedparam.h" +#include "virnuma.h" +#include "virdevmapper.h" +#include "virutil.h" + +#define VIR_FROM_THIS VIR_FROM_CH + +VIR_LOG_INIT("ch.ch_cgroup"); + +static int +chSetupBlkioCgroup(virDomainObj * vm) +{ + virCHDomainObjPrivate *priv = vm->privateData; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) { + if (vm->def->blkio.weight || vm->def->blkio.ndevices) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Block I/O tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + return virDomainCgroupSetupBlkio(priv->cgroup, vm->def->blkio); +} + + +static int +chSetupMemoryCgroup(virDomainObj * vm) +{ + virCHDomainObjPrivate *priv = vm->privateData; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { + if (virMemoryLimitIsSet(vm->def->mem.hard_limit) || + virMemoryLimitIsSet(vm->def->mem.soft_limit) || + virMemoryLimitIsSet(vm->def->mem.swap_hard_limit)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Memory cgroup is not available on this host")); + return -1; + } else { + return 0; + } + } + + return virDomainCgroupSetupMemtune(priv->cgroup, vm->def->mem); +} + +static int +chSetupCpusetCgroup(virDomainObj * vm) +{ + virCHDomainObjPrivate *priv = vm->privateData; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) + return 0; + + if (virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0) + return -1; + + return 0; +} + + +static int +chSetupCpuCgroup(virDomainObj * vm) +{ + virCHDomainObjPrivate *priv = vm->privateData; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { + if 
(vm->def->cputune.sharesSpecified) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("CPU tuning is not available on this host"));
+            return -1;
+        } else {
+            return 0;
+        }
+    }
+
+    if (vm->def->cputune.sharesSpecified) {
+
+        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
+            return -1;
+
+    }
+
+    return 0;
+}
+
+
+/**
+ * chInitCgroup:
+ * @vm: domain object
+ * @nnicindexes: number of entries in @nicindexes
+ * @nicindexes: NIC indexes passed through to virCgroupNewMachine
+ *
+ * Creates the machine cgroup for @vm and stores it in priv->cgroup. A domain
+ * without a resource definition gets the "/machine" partition.
+ *
+ * Returns 0 on success, and also when nothing is set up (unprivileged driver,
+ * cgroups unavailable, or virCgroupNewIgnoreError() reports the failure as
+ * ignorable); -1 on error.
+ */
+static int
+chInitCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
+
+    if (!priv->driver->privileged)
+        return 0;
+
+    if (!virCgroupAvailable())
+        return 0;
+
+    /* Clear the pointer as well as freeing it: the error returns below must
+     * not leave priv->cgroup pointing at released memory. */
+    g_clear_pointer(&priv->cgroup, virCgroupFree);
+
+    if (!vm->def->resource) {
+        virDomainResourceDef *res;
+
+        res = g_new0(virDomainResourceDef, 1);
+
+        res->partition = g_strdup("/machine");
+
+        vm->def->resource = res;
+    }
+
+    if (vm->def->resource->partition[0] != '/') {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Resource partition '%s' must start with '/'"),
+                       vm->def->resource->partition);
+        return -1;
+    }
+
+    if (virCgroupNewMachine(priv->machineName,
+                            "ch",
+                            vm->def->uuid,
+                            NULL,
+                            vm->pid,
+                            false,
+                            nnicindexes, nicindexes,
+                            vm->def->resource->partition,
+                            cfg->cgroupControllers,
+                            0, /* maxThreadsPerProc */
+                            &priv->cgroup) < 0) {
+        if (virCgroupNewIgnoreError())
+            return 0;
+
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * chRestoreCgroupState:
+ * @vm: domain object
+ *
+ * Best-effort helper used when reconnecting to a domain. Writes the host
+ * memory nodeset into the domain cgroup's cpuset.mems, then for every online
+ * vCPU, every iothread and the emulator thread enables cpuset memory
+ * migration and writes the thread cgroup's current cpuset.mems value back.
+ * Failures are not propagated to the caller.
+ */
+static void
+chRestoreCgroupState(virDomainObj * vm)
+{
+    g_autofree char *mem_mask = NULL;
+    g_autofree char *nodeset = NULL;
+    virCHDomainObjPrivate *priv = vm->privateData;
+    size_t i = 0;
+
+    g_autoptr(virBitmap) all_nodes = NULL;
+    virCgroup *cgroup_temp = NULL;
+
+    if (!virNumaIsAvailable() ||
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return;
+
+    if (!(all_nodes = virNumaGetHostMemoryNodeset()))
+        goto error;
+
+    if (!(mem_mask = virBitmapFormat(all_nodes)))
+        goto error;
+
+    if ((virCgroupHasEmptyTasks(priv->cgroup,
+                                VIR_CGROUP_CONTROLLER_CPUSET)) <= 0)
+        goto error;
+
+    if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
+        goto error;
+
+    for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) {
+        virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);
+
+        if (!vcpu->online)
+            continue;
+
+        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i,
+                               false, &cgroup_temp) < 0 ||
+            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
+            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
+            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
+            goto cleanup;
+
+        /* Reset both pointers after releasing them: nodeset is g_autofree
+         * and cgroup_temp is freed again at 'cleanup', so stale values
+         * would be freed twice if a later iteration bails out early. */
+        g_clear_pointer(&nodeset, g_free);
+        g_clear_pointer(&cgroup_temp, virCgroupFree);
+    }
+
+    for (i = 0; i < vm->def->niothreadids; i++) {
+        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD,
+                               vm->def->iothreadids[i]->iothread_id,
+                               false, &cgroup_temp) < 0 ||
+            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
+            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
+            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
+            goto cleanup;
+
+        g_clear_pointer(&nodeset, g_free);
+        g_clear_pointer(&cgroup_temp, virCgroupFree);
+    }
+
+    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
+                           false, &cgroup_temp) < 0 ||
+        virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
+        virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
+        virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
+        goto cleanup;
+
+ cleanup:
+    virCgroupFree(cgroup_temp);
+    return;
+
+ error:
+    virResetLastError();
+    VIR_DEBUG("Couldn't restore cgroups to meaningful state");
+    goto cleanup;
+}
+
+/**
+ * chConnectCgroup:
+ * @vm: domain object
+ *
+ * Detects the existing machine cgroup of the process vm->pid, stores it in
+ * priv->cgroup and then runs chRestoreCgroupState().
+ *
+ * Returns 0 on success or when cgroups are not used (unprivileged driver or
+ * cgroups unavailable), -1 if detection fails.
+ */
+int
+chConnectCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
+
+    if (!priv->driver->privileged)
+        return 0;
+
+    if (!virCgroupAvailable())
+        return 0;
+
+    /* Clear the pointer too, so a failed detection does not leave
+     * priv->cgroup dangling. */
+    g_clear_pointer(&priv->cgroup, virCgroupFree);
+
+    if (virCgroupNewDetectMachine(vm->def->name,
+                                  "ch",
+                                  vm->pid,
+                                  cfg->cgroupControllers,
+                                  priv->machineName, &priv->cgroup) < 0)
+        return -1;
+
+    chRestoreCgroupState(vm);
+    return 0;
+}
+
+int
+chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
+{ + virCHDomainObjPrivate *priv = vm->privateData; + + if (!vm->pid) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot setup cgroups until process is started")); + return -1; + } + + if (chInitCgroup(vm, nnicindexes, nicindexes) < 0) + return -1; + + if (!priv->cgroup) + return 0; + + if (chSetupBlkioCgroup(vm) < 0) + return -1; + + if (chSetupMemoryCgroup(vm) < 0) + return -1; + + if (chSetupCpuCgroup(vm) < 0) + return -1; + + if (chSetupCpusetCgroup(vm) < 0) + return -1; + + return 0; +} + +int +chSetupCgroupVcpuBW(virCgroup * cgroup, + unsigned long long period, long long quota) +{ + return virCgroupSetupCpuPeriodQuota(cgroup, period, quota); +} + + +int +chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask) +{ + return virCgroupSetupCpusetCpus(cgroup, cpumask); +} + +int +chSetupGlobalCpuCgroup(virDomainObj * vm) +{ + virCHDomainObjPrivate *priv = vm->privateData; + unsigned long long period = vm->def->cputune.global_period; + long long quota = vm->def->cputune.global_quota; + g_autofree char *mem_mask = NULL; + virDomainNumatuneMemMode mem_mode; + + if ((period || quota) && + !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("cgroup cpu is required for scheduler tuning")); + return -1; + } + + /* + * If CPU cgroup controller is not initialized here, then we need + * neither period nor quota settings. And if CPUSET controller is + * not initialized either, then there's nothing to do anyway. 
+ */ + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) && + !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) + return 0; + + + if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 && + mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && + virDomainNumatuneMaybeFormatNodeset(vm->def->numa, + priv->autoNodeset, + &mem_mask, -1) < 0) + return -1; + + if (period || quota) { + if (chSetupCgroupVcpuBW(priv->cgroup, period, quota) < 0) + return -1; + } + + return 0; +} + + +int +chRemoveCgroup(virDomainObj * vm) +{ + virCHDomainObjPrivate *priv = vm->privateData; + + if (priv->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + if (virCgroupTerminateMachine(priv->machineName) < 0) { + if (!virCgroupNewIgnoreError()) + VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name); + } + + return virCgroupRemove(priv->cgroup); +} + + +static void +chCgroupEmulatorAllNodesDataFree(chCgroupEmulatorAllNodesData * data) +{ + if (!data) + return; + + virCgroupFree(data->emulatorCgroup); + g_free(data->emulatorMemMask); + g_free(data); +} + + +/** + * chCgroupEmulatorAllNodesAllow: + * @cgroup: domain cgroup pointer + * @retData: filled with structure used to roll back the operation + * + * Allows all NUMA nodes for the cloud hypervisor thread temporarily. This is + * necessary when hotplugging cpus since it requires memory allocated in the + * DMA region. Afterwards the operation can be reverted by + * chCgroupEmulatorAllNodesRestore. 
+ * + * Returns 0 on success -1 on error + */ +int +chCgroupEmulatorAllNodesAllow(virCgroup * cgroup, + chCgroupEmulatorAllNodesData ** retData) +{ + chCgroupEmulatorAllNodesData *data = NULL; + g_autofree char *all_nodes_str = NULL; + + g_autoptr(virBitmap) all_nodes = NULL; + int ret = -1; + + if (!virNumaIsAvailable() || + !virCgroupHasController(cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) + return 0; + + if (!(all_nodes = virNumaGetHostMemoryNodeset())) + goto cleanup; + + if (!(all_nodes_str = virBitmapFormat(all_nodes))) + goto cleanup; + + data = g_new0(chCgroupEmulatorAllNodesData, 1); + + if (virCgroupNewThread(cgroup, VIR_CGROUP_THREAD_EMULATOR, 0, + false, &data->emulatorCgroup) < 0) + goto cleanup; + + if (virCgroupGetCpusetMems(data->emulatorCgroup, &data->emulatorMemMask) < 0 + || virCgroupSetCpusetMems(data->emulatorCgroup, all_nodes_str) < 0) + goto cleanup; + + *retData = g_steal_pointer(&data); + ret = 0; + + cleanup: + chCgroupEmulatorAllNodesDataFree(data); + + return ret; +} + + +/** + * chCgroupEmulatorAllNodesRestore: + * @data: data structure created by chCgroupEmulatorAllNodesAllow + * + * Rolls back the setting done by chCgroupEmulatorAllNodesAllow and frees the + * associated data. + */ +void +chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data) +{ + virError *err; + + if (!data) + return; + + virErrorPreserveLast(&err); + virCgroupSetCpusetMems(data->emulatorCgroup, data->emulatorMemMask); + virErrorRestore(&err); + + chCgroupEmulatorAllNodesDataFree(data); +} diff --git a/src/ch/ch_cgroup.h b/src/ch/ch_cgroup.h new file mode 100644 index 0000000000..0152b5477c --- /dev/null +++ b/src/ch/ch_cgroup.h @@ -0,0 +1,45 @@ +/* + * ch_cgroup.h: CH cgroup management + * + * Copyright Microsoft Corp. 
2020-2021 + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "virusb.h" +#include "vircgroup.h" +#include "domain_conf.h" +#include "ch_conf.h" + +int chConnectCgroup(virDomainObj * vm); +int chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes); +int chSetupCgroupVcpuBW(virCgroup * cgroup, + unsigned long long period, long long quota); +int chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask); +int chSetupGlobalCpuCgroup(virDomainObj * vm); +int chRemoveCgroup(virDomainObj * vm); + +typedef struct _chCgroupEmulatorAllNodesData chCgroupEmulatorAllNodesData; + +struct _chCgroupEmulatorAllNodesData { + virCgroup *emulatorCgroup; + char *emulatorMemMask; +}; + +int chCgroupEmulatorAllNodesAllow(virCgroup * cgroup, + chCgroupEmulatorAllNodesData ** data); +void chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data); diff --git a/src/ch/ch_conf.c b/src/ch/ch_conf.c index ed0fffe5d6..7f70452296 100644 --- a/src/ch/ch_conf.c +++ b/src/ch/ch_conf.c @@ -141,6 +141,8 @@ virCHDriverConfigNew(bool privileged) if (!(cfg = virObjectNew(virCHDriverConfigClass))) return NULL; + cfg->cgroupControllers = -1; /* Auto detect */ + if (privileged) { if (virGetUserID(CH_USER, &cfg->user) < 0) return NULL; diff --git a/src/ch/ch_conf.h b/src/ch/ch_conf.h index 49f286f97a..19deb8e568 100644 
--- a/src/ch/ch_conf.h +++ b/src/ch/ch_conf.h @@ -35,11 +35,13 @@ struct _virCHDriverConfig { char *stateDir; char *logDir; - + int cgroupControllers; uid_t user; gid_t group; }; +G_DEFINE_AUTOPTR_CLEANUP_FUNC(virCHDriverConfig, virObjectUnref); + struct _virCHDriver { virMutex lock; diff --git a/src/ch/ch_domain.c b/src/ch/ch_domain.c index e1030800aa..d0aaeed1f4 100644 --- a/src/ch/ch_domain.c +++ b/src/ch/ch_domain.c @@ -326,6 +326,39 @@ chValidateDomainDeviceDef(const virDomainDeviceDef *dev, _("Serial can only be enabled for a PTY")); return -1; } + return 0; +} +int +virCHDomainRefreshThreadInfo(virDomainObj *vm) +{ + size_t maxvcpus = virDomainDefGetVcpusMax(vm->def); + virCHMonitorThreadInfo *info = NULL; + size_t nthreads, ncpus = 0; + size_t i; + + nthreads = virCHMonitorGetThreadInfo(virCHDomainGetMonitor(vm), + true, &info); + + for (i = 0; i < nthreads; i++) { + virCHDomainVcpuPrivate *vcpupriv; + virDomainVcpuDef *vcpu; + virCHMonitorCPUInfo *vcpuInfo; + + if (info[i].type != virCHThreadTypeVcpu) + continue; + + // TODO: hotplug support + vcpuInfo = &info[i].vcpuInfo; + vcpu = virDomainDefGetVcpu(vm->def, vcpuInfo->cpuid); + vcpupriv = CH_DOMAIN_VCPU_PRIVATE(vcpu); + vcpupriv->tid = vcpuInfo->tid; + ncpus++; + } + + // TODO: Remove the warning when hotplug is implemented. 
+ if (ncpus != maxvcpus) + VIR_WARN("Mismatch in the number of cpus, expected: %ld, actual: %ld", + maxvcpus, ncpus); return 0; } diff --git a/src/ch/ch_domain.h b/src/ch/ch_domain.h index 3ac3421015..2ce3e2cef3 100644 --- a/src/ch/ch_domain.h +++ b/src/ch/ch_domain.h @@ -89,7 +89,8 @@ virCHDomainObjBeginJob(virDomainObj *obj, enum virCHDomainJob job) void virCHDomainObjEndJob(virDomainObj *obj); -int virCHDomainRefreshVcpuInfo(virDomainObj *vm); +int virCHDomainRefreshThreadInfo(virDomainObj *vm); + pid_t virCHDomainGetVcpuPid(virDomainObj *vm, unsigned int vcpuid); bool virCHDomainHasVcpuPids(virDomainObj *vm); diff --git a/src/ch/ch_monitor.c b/src/ch/ch_monitor.c index c0ae031200..095779cb3f 100644 --- a/src/ch/ch_monitor.c +++ b/src/ch/ch_monitor.c @@ -41,6 +41,7 @@ VIR_LOG_INIT("ch.ch_monitor"); static virClass *virCHMonitorClass; static void virCHMonitorDispose(void *obj); +static void virCHMonitorThreadInfoFree(virCHMonitor *mon); static int virCHMonitorOnceInit(void) { @@ -571,6 +572,7 @@ static void virCHMonitorDispose(void *opaque) virCHMonitor *mon = opaque; VIR_DEBUG("mon=%p", mon); + virCHMonitorThreadInfoFree(mon); virObjectUnref(mon->vm); } @@ -736,6 +738,114 @@ virCHMonitorGet(virCHMonitor *mon, const char *endpoint, virJSONValue **response return ret; } +/** + * virCHMonitorGetInfo: + * @mon: Pointer to the monitor + * @info: Get VM info + * + * Retrieve the VM info and store in @info + * + * Returns 0 on success. 
+ */
+int
+virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
+{
+    return virCHMonitorGet(mon, URL_VM_INFO, info);
+}
+
+/* Drop the cached thread list of @mon and reset its count. */
+static void
+virCHMonitorThreadInfoFree(virCHMonitor *mon)
+{
+    mon->nthreads = 0;
+    VIR_FREE(mon->threads);
+}
+
+/**
+ * virCHMonitorRefreshThreadInfo:
+ * @mon: Pointer to the monitor
+ *
+ * Rebuilds the cached thread list of @mon by reading
+ * /proc/<pid>/task/<tid>/comm for every tid reported by virProcessGetPids()
+ * and classifying each thread by its name: a "vcpu" prefix is a vCPU thread,
+ * "_disk", "_net" and "_rng" prefixes are IO threads, anything else is an
+ * emulator thread. Threads whose comm file cannot be read, or whose vCPU
+ * index cannot be parsed, are left out.
+ *
+ * Returns the number of entries stored in mon->threads (0 if the tids could
+ * not be listed).
+ */
+static size_t
+virCHMonitorRefreshThreadInfo(virCHMonitor *mon)
+{
+    virCHMonitorThreadInfo *info = NULL;
+    g_autofree pid_t *tids = NULL;
+    virDomainObj *vm = mon->vm;
+    size_t ntids = 0;
+    size_t nthreads = 0;
+    size_t i;
+
+    virCHMonitorThreadInfoFree(mon);
+    if (virProcessGetPids(vm->pid, &ntids, &tids) < 0) {
+        mon->threads = NULL;
+        return 0;
+    }
+
+    info = g_new0(virCHMonitorThreadInfo, ntids);
+    for (i = 0; i < ntids; i++) {
+        /* Fill the next free slot rather than slot 'i': skipped tids must
+         * not leave zeroed holes, because callers only look at the first
+         * 'nthreads' entries. */
+        virCHMonitorThreadInfo *cur = &info[nthreads];
+        g_autofree char *proc = NULL;
+        g_autofree char *data = NULL;
+
+        proc = g_strdup_printf("/proc/%d/task/%d/comm",
+                               (int)vm->pid, (int)tids[i]);
+
+        if (virFileReadAll(proc, (1<<16), &data) < 0)
+            continue;
+
+        VIR_DEBUG("VM PID: %d, TID %d, COMM: %s",
+                  (int)vm->pid, (int)tids[i], data);
+        if (STRPREFIX(data, "vcpu")) {
+            int cpuid;
+            char *tmp;
+
+            if (virStrToLong_i(data + 4, &tmp, 0, &cpuid) < 0) {
+                VIR_WARN("Index is not specified correctly");
+                continue;
+            }
+            cur->type = virCHThreadTypeVcpu;
+            cur->vcpuInfo.tid = tids[i];
+            cur->vcpuInfo.online = true;
+            cur->vcpuInfo.cpuid = cpuid;
+            VIR_DEBUG("vcpu%d -> tid: %d", cpuid, (int)tids[i]);
+        } else if (STRPREFIX(data, "_disk") || STRPREFIX(data, "_net") ||
+                   STRPREFIX(data, "_rng")) {
+            /* Prefixes used by cloud-hypervisor for IO Threads are captured at
+               https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/vmm/src/device_manager.rs */
+            cur->type = virCHThreadTypeIO;
+            cur->ioInfo.tid = tids[i];
+            virStrcpy(cur->ioInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
+        } else {
+            cur->type = virCHThreadTypeEmulator;
+            cur->emuInfo.tid = tids[i];
+            virStrcpy(cur->emuInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
+        }
+        nthreads++;
+    }
+    mon->threads = info;
+    mon->nthreads = nthreads;
+
+    return mon->nthreads;
+}
+
+/**
+ *
virCHMonitorGetThreadInfo: + * @mon: Pointer to the monitor + * @refresh: Refresh thread info or not + * + * Retrive thread info and store to @threads + * + * Returns count of threads on success. + */ +size_t +virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh, + virCHMonitorThreadInfo **threads) +{ + int nthreads = 0; + + if (refresh) + nthreads = virCHMonitorRefreshThreadInfo(mon); + + *threads = mon->threads; + + return nthreads; +} + int virCHMonitorShutdownVMM(virCHMonitor *mon) { @@ -810,18 +920,3 @@ virCHMonitorResumeVM(virCHMonitor *mon) { return virCHMonitorPutNoContent(mon, URL_VM_RESUME); } - -/** - * virCHMonitorGetInfo: - * @mon: Pointer to the monitor - * @info: Get VM info - * - * Retrieve the VM info and store in @info - * - * Returns 0 on success. - */ -int -virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info) -{ - return virCHMonitorGet(mon, URL_VM_INFO, info); -} diff --git a/src/ch/ch_monitor.h b/src/ch/ch_monitor.h index 8ca9e17a9a..f8c3fa75e8 100644 --- a/src/ch/ch_monitor.h +++ b/src/ch/ch_monitor.h @@ -37,6 +37,50 @@ #define URL_VM_RESUME "vm.resume" #define URL_VM_INFO "vm.info" +#define VIRCH_THREAD_NAME_LEN 16 + +typedef enum { + virCHThreadTypeEmulator, + virCHThreadTypeVcpu, + virCHThreadTypeIO, + virCHThreadTypeMax +} virCHThreadType; + +typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo; + +struct _virCHMonitorCPUInfo { + int cpuid; + pid_t tid; + + bool online; +}; + +typedef struct _virCHMonitorEmuThreadInfo virCHMonitorEmuThreadInfo; + +struct _virCHMonitorEmuThreadInfo { + char thrName[VIRCH_THREAD_NAME_LEN]; + pid_t tid; +}; + +typedef struct _virCHMonitorIOThreadInfo virCHMonitorIOThreadInfo; + +struct _virCHMonitorIOThreadInfo { + char thrName[VIRCH_THREAD_NAME_LEN]; + pid_t tid; +}; + +typedef struct _virCHMonitorThreadInfo virCHMonitorThreadInfo; + +struct _virCHMonitorThreadInfo { + virCHThreadType type; + + union { + virCHMonitorCPUInfo vcpuInfo; + virCHMonitorEmuThreadInfo emuInfo; + 
virCHMonitorIOThreadInfo ioInfo; + }; +}; + typedef struct _virCHMonitor virCHMonitor; struct _virCHMonitor { @@ -49,6 +93,9 @@ struct _virCHMonitor { pid_t pid; virDomainObj *vm; + + size_t nthreads; + virCHMonitorThreadInfo *threads; }; virCHMonitor *virCHMonitorNew(virDomainObj *vm, const char *socketdir); @@ -65,12 +112,9 @@ int virCHMonitorSuspendVM(virCHMonitor *mon); int virCHMonitorResumeVM(virCHMonitor *mon); int virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info); -typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo; -struct _virCHMonitorCPUInfo { - pid_t tid; - bool online; -}; void virCHMonitorCPUInfoFree(virCHMonitorCPUInfo *cpus); int virCHMonitorGetCPUInfo(virCHMonitor *mon, virCHMonitorCPUInfo **vcpus, size_t maxvcpus); +size_t virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh, + virCHMonitorThreadInfo **threads); diff --git a/src/ch/ch_process.c b/src/ch/ch_process.c index 3b7f6fcddf..8dce737adb 100644 --- a/src/ch/ch_process.c +++ b/src/ch/ch_process.c @@ -26,6 +26,8 @@ #include "ch_domain.h" #include "ch_monitor.h" #include "ch_process.h" +#include "ch_cgroup.h" +#include "virnuma.h" #include "viralloc.h" #include "virerror.h" #include "virjson.h" @@ -133,6 +135,257 @@ virCHProcessUpdateInfo(virDomainObj *vm) return 0; } +static int +virCHProcessGetAllCpuAffinity(virBitmap **cpumapRet) +{ + *cpumapRet = NULL; + + if (!virHostCPUHasBitmap()) + return 0; + + if (!(*cpumapRet = virHostCPUGetOnlineBitmap())) + return -1; + + return 0; +} + +#if defined(WITH_SCHED_GETAFFINITY) || defined(WITH_BSD_CPU_AFFINITY) +static int +virCHProcessInitCpuAffinity(virDomainObj *vm) +{ + g_autoptr(virBitmap) cpumapToSet = NULL; + virDomainNumatuneMemMode mem_mode; + virCHDomainObjPrivate *priv = vm->privateData; + + if (!vm->pid) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot setup CPU affinity until process is started")); + return -1; + } + + if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 && + 
virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 && + mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { + virBitmap *nodeset = NULL; + + if (virDomainNumatuneMaybeGetNodeset(vm->def->numa, + priv->autoNodeset, + &nodeset, + -1) < 0) + return -1; + + if (virNumaNodesetToCPUset(nodeset, &cpumapToSet) < 0) + return -1; + } else if (vm->def->cputune.emulatorpin) { + if (!(cpumapToSet = virBitmapNewCopy(vm->def->cputune.emulatorpin))) + return -1; + } else { + if (virCHProcessGetAllCpuAffinity(&cpumapToSet) < 0) + return -1; + } + + if (cpumapToSet && + virProcessSetAffinity(vm->pid, cpumapToSet, false) < 0) { + return -1; + } + + return 0; +} +#else /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */ +static int +virCHProcessInitCpuAffinity(virDomainObj *vm G_GNUC_UNUSED) +{ + return 0; +} +#endif /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */ + +/** + * virCHProcessSetupPid: + * + * This function sets resource properties (affinity, cgroups, + * scheduler) for any PID associated with a domain. It should be used + * to set up emulator PIDs as well as vCPU and I/O thread pids to + * ensure they are all handled the same way. + * + * Returns 0 on success, -1 on error. + */ +static int +virCHProcessSetupPid(virDomainObj *vm, + pid_t pid, + virCgroupThreadName nameval, + int id, + virBitmap *cpumask, + unsigned long long period, + long long quota, + virDomainThreadSchedParam *sched) +{ + virCHDomainObjPrivate *priv = vm->privateData; + virDomainNumatuneMemMode mem_mode; + virCgroup *cgroup = NULL; + virBitmap *use_cpumask = NULL; + virBitmap *affinity_cpumask = NULL; + g_autoptr(virBitmap) hostcpumap = NULL; + g_autofree char *mem_mask = NULL; + int ret = -1; + + if ((period || quota) && + !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("cgroup cpu is required for scheduler tuning")); + goto cleanup; + } + + /* Infer which cpumask shall be used. 
*/ + if (cpumask) { + use_cpumask = cpumask; + } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { + use_cpumask = priv->autoCpuset; + } else if (vm->def->cpumask) { + use_cpumask = vm->def->cpumask; + } else { + /* we can't assume cloud-hypervisor itself is running on all pCPUs, + * so we need to explicitly set the spawned instance to all pCPUs. */ + if (virCHProcessGetAllCpuAffinity(&hostcpumap) < 0) + goto cleanup; + affinity_cpumask = hostcpumap; + } + + /* + * If CPU cgroup controller is not initialized here, then we need + * neither period nor quota settings. And if CPUSET controller is + * not initialized either, then there's nothing to do anyway. + */ + if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) || + virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { + + if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 && + mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && + virDomainNumatuneMaybeFormatNodeset(vm->def->numa, + priv->autoNodeset, + &mem_mask, -1) < 0) + goto cleanup; + + if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0) + goto cleanup; + + if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { + if (use_cpumask && + chSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0) + goto cleanup; + + if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0) + goto cleanup; + + } + + if ((period || quota) && + chSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + + /* Move the thread to the sub dir */ + VIR_INFO("Adding pid %d to cgroup", pid); + if (virCgroupAddThread(cgroup, pid) < 0) + goto cleanup; + + } + + if (!affinity_cpumask) + affinity_cpumask = use_cpumask; + + /* Setup legacy affinity. */ + if (affinity_cpumask && virProcessSetAffinity(pid, affinity_cpumask, false) < 0) + goto cleanup; + + /* Set scheduler type and priority, but not for the main thread. 
*/ + if (sched && + nameval != VIR_CGROUP_THREAD_EMULATOR && + virProcessSetScheduler(pid, sched->policy, sched->priority) < 0) + goto cleanup; + + ret = 0; + cleanup: + if (cgroup) { + if (ret < 0) + virCgroupRemove(cgroup); + virCgroupFree(cgroup); + } + + return ret; +} + +/** + * virCHProcessSetupVcpu: + * @vm: domain object + * @vcpuid: id of VCPU to set defaults + * + * This function sets resource properties (cgroups, affinity, scheduler) for a + * vCPU. This function expects that the vCPU is online and the vCPU pids were + * correctly detected at the point when it's called. + * + * Returns 0 on success, -1 on error. + */ +int +virCHProcessSetupVcpu(virDomainObj *vm, + unsigned int vcpuid) +{ + pid_t vcpupid = virCHDomainGetVcpuPid(vm, vcpuid); + virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, vcpuid); + + return virCHProcessSetupPid(vm, vcpupid, VIR_CGROUP_THREAD_VCPU, + vcpuid, vcpu->cpumask, + vm->def->cputune.period, + vm->def->cputune.quota, + &vcpu->sched); +} + +static int +virCHProcessSetupVcpus(virDomainObj *vm) +{ + virDomainVcpuDef *vcpu; + unsigned int maxvcpus = virDomainDefGetVcpusMax(vm->def); + size_t i; + + if ((vm->def->cputune.period || vm->def->cputune.quota) && + !virCgroupHasController(((virCHDomainObjPrivate *) vm->privateData)->cgroup, + VIR_CGROUP_CONTROLLER_CPU)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("cgroup cpu is required for scheduler tuning")); + return -1; + } + + if (!virCHDomainHasVcpuPids(vm)) { + /* If any CPU has custom affinity that differs from the + * VM default affinity, we must reject it */ + for (i = 0; i < maxvcpus; i++) { + vcpu = virDomainDefGetVcpu(vm->def, i); + + if (!vcpu->online) + continue; + + if (vcpu->cpumask && + !virBitmapEqual(vm->def->cpumask, vcpu->cpumask)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cpu affinity is not supported")); + return -1; + } + } + + return 0; + } + + for (i = 0; i < maxvcpus; i++) { + vcpu = virDomainDefGetVcpu(vm->def, i); + + if 
(!vcpu->online) + continue; + + if (virCHProcessSetupVcpu(vm, i) < 0) + return -1; + } + + return 0; +} + /** * virCHProcessStart: * @driver: pointer to driver structure @@ -168,18 +421,33 @@ int virCHProcessStart(virCHDriver *driver, } } + vm->pid = priv->monitor->pid; + vm->def->id = vm->pid; + priv->machineName = virCHDomainGetMachineName(vm); + + if (chSetupCgroup(vm, nnicindexes, nicindexes) < 0) + goto cleanup; + + if (virCHProcessInitCpuAffinity(vm) < 0) + goto cleanup; + if (virCHMonitorBootVM(priv->monitor) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("failed to boot guest VM")); goto cleanup; } - priv->machineName = virCHDomainGetMachineName(vm); - vm->pid = priv->monitor->pid; - vm->def->id = vm->pid; + virCHDomainRefreshThreadInfo(vm); - virCHProcessUpdateInfo(vm); + VIR_DEBUG("Setting global CPU cgroup (if required)"); + if (chSetupGlobalCpuCgroup(vm) < 0) + goto cleanup; + + VIR_DEBUG("Setting vCPU tuning/settings"); + if (virCHProcessSetupVcpus(vm) < 0) + goto cleanup; + virCHProcessUpdateInfo(vm); virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason); return 0; @@ -195,6 +463,8 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED, virDomainObj *vm, virDomainShutoffReason reason) { + int ret; + int retries = 0; virCHDomainObjPrivate *priv = vm->privateData; VIR_DEBUG("Stopping VM name=%s pid=%d reason=%d", @@ -205,6 +475,16 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED, priv->monitor = NULL; } + retry: + if ((ret = chRemoveCgroup(vm)) < 0) { + if (ret == -EBUSY && (retries++ < 5)) { + g_usleep(200*1000); + goto retry; + } + VIR_WARN("Failed to remove cgroup for %s", + vm->def->name); + } + vm->pid = -1; vm->def->id = -1; diff --git a/src/ch/ch_process.h b/src/ch/ch_process.h index abc4915979..800e3f4e23 100644 --- a/src/ch/ch_process.h +++ b/src/ch/ch_process.h @@ -29,3 +29,6 @@ int virCHProcessStart(virCHDriver *driver, int virCHProcessStop(virCHDriver *driver, virDomainObj *vm, virDomainShutoffReason reason); + +int 
virCHProcessSetupVcpu(virDomainObj *vm, + unsigned int vcpuid); diff --git a/src/ch/meson.build b/src/ch/meson.build index 2b2bdda26c..0b20de56fd 100644 --- a/src/ch/meson.build +++ b/src/ch/meson.build @@ -1,6 +1,8 @@ ch_driver_sources = [ 'ch_conf.c', 'ch_conf.h', + 'ch_cgroup.c', + 'ch_cgroup.h', 'ch_domain.c', 'ch_domain.h', 'ch_driver.c', -- 2.27.0