Signed-off-by: Vineeth Pillai <virem...@linux.microsoft.com>
Signed-off-by: Praveen K Paladugu <pra...@linux.microsoft.com>
---
 po/POTFILES.in      |   1 +
 src/ch/ch_cgroup.c  | 457 ++++++++++++++++++++++++++++++++++++++++++++
 src/ch/ch_cgroup.h  |  45 +++++
 src/ch/ch_conf.c    |   2 +
 src/ch/ch_conf.h    |   4 +-
 src/ch/ch_domain.c  |  33 ++++
 src/ch/ch_domain.h  |   3 +-
 src/ch/ch_monitor.c | 125 ++++++++++--
 src/ch/ch_monitor.h |  54 +++++-
 src/ch/ch_process.c | 288 +++++++++++++++++++++++++++-
 src/ch/ch_process.h |   3 +
 src/ch/meson.build  |   2 +
 12 files changed, 991 insertions(+), 26 deletions(-)
 create mode 100644 src/ch/ch_cgroup.c
 create mode 100644 src/ch/ch_cgroup.h

diff --git a/po/POTFILES.in b/po/POTFILES.in
index b554cf08ca..3a8db501bc 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -19,6 +19,7 @@
 @SRCDIR@src/bhyve/bhyve_parse_command.c
 @SRCDIR@src/bhyve/bhyve_process.c
 @SRCDIR@src/ch/ch_conf.c
+@SRCDIR@src/ch/ch_cgroup.c
 @SRCDIR@src/ch/ch_domain.c
 @SRCDIR@src/ch/ch_driver.c
 @SRCDIR@src/ch/ch_monitor.c
diff --git a/src/ch/ch_cgroup.c b/src/ch/ch_cgroup.c
new file mode 100644
index 0000000000..6be2184cf1
--- /dev/null
+++ b/src/ch/ch_cgroup.c
@@ -0,0 +1,457 @@
+/*
+ * ch_cgroup.c: CH cgroup management
+ *
+ * Copyright Microsoft Corp. 2020-2021
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "ch_cgroup.h"
+#include "ch_domain.h"
+#include "ch_process.h"
+#include "vircgroup.h"
+#include "virlog.h"
+#include "viralloc.h"
+#include "virerror.h"
+#include "domain_audit.h"
+#include "domain_cgroup.h"
+#include "virscsi.h"
+#include "virstring.h"
+#include "virfile.h"
+#include "virtypedparam.h"
+#include "virnuma.h"
+#include "virdevmapper.h"
+#include "virutil.h"
+
+#define VIR_FROM_THIS VIR_FROM_CH
+
+VIR_LOG_INIT("ch.ch_cgroup");
+
+/**
+ * chSetupBlkioCgroup:
+ * @vm: domain object
+ *
+ * Apply the blkio tuning from the domain definition to the domain cgroup.
+ * When the blkio controller is missing, an error is reported only if the
+ * definition asks for a weight or for per-device settings.
+ *
+ * Returns 0 if there is nothing to do, -1 if tuning was requested but the
+ * controller is missing, otherwise the result of virDomainCgroupSetupBlkio().
+ */
+static int
+chSetupBlkioCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) {
+        if (vm->def->blkio.weight || vm->def->blkio.ndevices) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("Block I/O tuning is not available on this host"));
+            return -1;
+        }
+
+        return 0;
+    }
+
+    return virDomainCgroupSetupBlkio(priv->cgroup, vm->def->blkio);
+}
+
+
+/**
+ * chSetupMemoryCgroup:
+ * @vm: domain object
+ *
+ * Apply the memory tuning from the domain definition to the domain cgroup.
+ * Without a memory controller this is only an error when one of the
+ * hard, soft or swap limits is set.
+ *
+ * Returns 0 if there is nothing to do, -1 if limits were requested but the
+ * controller is missing, otherwise the result of virDomainCgroupSetupMemtune().
+ */
+static int
+chSetupMemoryCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    bool limitsRequested;
+
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY))
+        return virDomainCgroupSetupMemtune(priv->cgroup, vm->def->mem);
+
+    limitsRequested = virMemoryLimitIsSet(vm->def->mem.hard_limit) ||
+                      virMemoryLimitIsSet(vm->def->mem.soft_limit) ||
+                      virMemoryLimitIsSet(vm->def->mem.swap_hard_limit);
+
+    if (!limitsRequested)
+        return 0;
+
+    virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                   _("Memory cgroup is not available on this host"));
+    return -1;
+}
+
+/**
+ * chSetupCpusetCgroup:
+ * @vm: domain object
+ *
+ * Turn on cpuset memory migration for the domain cgroup when the cpuset
+ * controller is present; otherwise do nothing.
+ *
+ * Returns 0 on success or when the controller is absent, -1 on failure.
+ */
+static int
+chSetupCpusetCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
+        virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0)
+        return -1;
+
+    return 0;
+}
+
+
+/**
+ * chSetupCpuCgroup:
+ * @vm: domain object
+ *
+ * Apply the CPU shares from the domain definition, if any were specified.
+ * Requesting shares without a cpu controller is reported as an error.
+ *
+ * Returns 0 on success or when no shares were specified, -1 on error.
+ */
+static int
+chSetupCpuCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    bool haveCpu = virCgroupHasController(priv->cgroup,
+                                          VIR_CGROUP_CONTROLLER_CPU);
+
+    /* Nothing was requested, so controller availability is irrelevant. */
+    if (!vm->def->cputune.sharesSpecified)
+        return 0;
+
+    if (!haveCpu) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("CPU tuning is not available on this host"));
+        return -1;
+    }
+
+    return virCgroupSetCpuShares(priv->cgroup,
+                                 vm->def->cputune.shares) < 0 ? -1 : 0;
+}
+
+
+/**
+ * chInitCgroup:
+ * @vm: domain object
+ * @nnicindexes: number of entries in @nicindexes
+ * @nicindexes: NIC indexes passed through to virCgroupNewMachine()
+ *
+ * Create the machine cgroup for @vm and store it in the private data.
+ * Nothing is done for unprivileged drivers or when cgroups are not
+ * available. A domain without a resource definition is given the
+ * default "/machine" partition.
+ *
+ * Returns 0 on success (including when the cgroup creation error may be
+ * ignored, in which case priv->cgroup may remain NULL), -1 on error.
+ */
+static int
+chInitCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
+
+    if (!priv->driver->privileged)
+        return 0;
+
+    if (!virCgroupAvailable())
+        return 0;
+
+    /* Reset the pointer too: the error paths below must not leave a
+     * freed cgroup behind for callers that test priv->cgroup. */
+    virCgroupFree(priv->cgroup);
+    priv->cgroup = NULL;
+
+    if (!vm->def->resource) {
+        virDomainResourceDef *res;
+
+        res = g_new0(virDomainResourceDef, 1);
+
+        res->partition = g_strdup("/machine");
+
+        vm->def->resource = res;
+    }
+
+    if (vm->def->resource->partition[0] != '/') {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Resource partition '%s' must start with '/'"),
+                       vm->def->resource->partition);
+        return -1;
+    }
+
+    if (virCgroupNewMachine(priv->machineName,
+                            "ch",
+                            vm->def->uuid,
+                            NULL,
+                            vm->pid,
+                            false,
+                            nnicindexes, nicindexes,
+                            vm->def->resource->partition,
+                            cfg->cgroupControllers,
+                            0, /* maxThreadsPerProc */
+                            &priv->cgroup) < 0) {
+        if (virCgroupNewIgnoreError())
+            return 0;
+
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * chRestoreCgroupState:
+ * @vm: domain object
+ *
+ * Best-effort refresh of the cpuset settings of a domain's cgroups.
+ * The machine cgroup's cpuset.mems is set to all host memory nodes and,
+ * for every online vCPU, every iothread and the emulator thread cgroup,
+ * memory migration is enabled and the current cpuset.mems value is
+ * written back.
+ *
+ * Nothing is returned: failures only stop the processing early.
+ */
+static void
+chRestoreCgroupState(virDomainObj * vm)
+{
+    g_autofree char *mem_mask = NULL;
+    g_autofree char *nodeset = NULL;
+    virCHDomainObjPrivate *priv = vm->privateData;
+    size_t i = 0;
+    g_autoptr(virBitmap) all_nodes = NULL;
+    virCgroup *cgroup_temp = NULL;
+
+    if (!virNumaIsAvailable() ||
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return;
+
+    if (!(all_nodes = virNumaGetHostMemoryNodeset()))
+        goto error;
+
+    if (!(mem_mask = virBitmapFormat(all_nodes)))
+        goto error;
+
+    if ((virCgroupHasEmptyTasks(priv->cgroup,
+                                VIR_CGROUP_CONTROLLER_CPUSET)) <= 0)
+        goto error;
+
+    if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
+        goto error;
+
+    for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) {
+        virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);
+
+        if (!vcpu->online)
+            continue;
+
+        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i,
+                               false, &cgroup_temp) < 0 ||
+            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
+            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
+            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
+            goto cleanup;
+
+        /* Reset both pointers after releasing them; otherwise a failure
+         * in a later iteration (or the g_autofree on exit) would free
+         * the same memory a second time. */
+        g_free(nodeset);
+        nodeset = NULL;
+        virCgroupFree(cgroup_temp);
+        cgroup_temp = NULL;
+    }
+
+    for (i = 0; i < vm->def->niothreadids; i++) {
+        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD,
+                               vm->def->iothreadids[i]->iothread_id,
+                               false, &cgroup_temp) < 0 ||
+            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
+            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
+            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
+            goto cleanup;
+
+        g_free(nodeset);
+        nodeset = NULL;
+        virCgroupFree(cgroup_temp);
+        cgroup_temp = NULL;
+    }
+
+    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
+                           false, &cgroup_temp) < 0 ||
+        virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
+        virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
+        virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
+        goto cleanup;
+
+ cleanup:
+    virCgroupFree(cgroup_temp);
+    return;
+
+ error:
+    virResetLastError();
+    VIR_DEBUG("Couldn't restore cgroups to meaningful state");
+    goto cleanup;
+}
+
+/**
+ * chConnectCgroup:
+ * @vm: domain object
+ *
+ * Detect the existing machine cgroup of a running domain and store it in
+ * the private data, then refresh its cpuset state. Nothing is done for
+ * unprivileged drivers or when cgroups are not available.
+ *
+ * Returns 0 on success, -1 if the cgroup could not be detected.
+ */
+int
+chConnectCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
+
+    if (!priv->driver->privileged)
+        return 0;
+
+    if (!virCgroupAvailable())
+        return 0;
+
+    /* Reset the pointer so a failed detection cannot leave a freed
+     * cgroup behind in the private data. */
+    virCgroupFree(priv->cgroup);
+    priv->cgroup = NULL;
+
+    if (virCgroupNewDetectMachine(vm->def->name,
+                                  "ch",
+                                  vm->pid,
+                                  cfg->cgroupControllers,
+                                  priv->machineName, &priv->cgroup) < 0)
+        return -1;
+
+    chRestoreCgroupState(vm);
+    return 0;
+}
+
+/**
+ * chSetupCgroup:
+ * @vm: domain object
+ * @nnicindexes: number of entries in @nicindexes
+ * @nicindexes: NIC indexes handed to the cgroup creation
+ *
+ * Create the domain cgroup and apply the blkio, memory, cpu and cpuset
+ * settings, in that order. The process must already be running.
+ * If no cgroup ends up being created, the tuning steps are skipped.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (!vm->pid) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("Cannot setup cgroups until process is started"));
+        return -1;
+    }
+
+    if (chInitCgroup(vm, nnicindexes, nicindexes) < 0)
+        return -1;
+
+    /* Cgroup creation was skipped or its failure was ignorable. */
+    if (!priv->cgroup)
+        return 0;
+
+    if (chSetupBlkioCgroup(vm) < 0 ||
+        chSetupMemoryCgroup(vm) < 0 ||
+        chSetupCpuCgroup(vm) < 0 ||
+        chSetupCpusetCgroup(vm) < 0)
+        return -1;
+
+    return 0;
+}
+
+/**
+ * chSetupCgroupVcpuBW:
+ * @cgroup: cgroup to configure
+ * @period: CPU bandwidth period
+ * @quota: CPU bandwidth quota
+ *
+ * Thin wrapper that forwards its arguments to
+ * virCgroupSetupCpuPeriodQuota().
+ *
+ * Returns the result of virCgroupSetupCpuPeriodQuota().
+ */
+int
+chSetupCgroupVcpuBW(virCgroup * cgroup,
+                    unsigned long long period, long long quota)
+{
+    return virCgroupSetupCpuPeriodQuota(cgroup, period, quota);
+}
+
+
+/**
+ * chSetupCgroupCpusetCpus:
+ * @cgroup: cgroup to configure
+ * @cpumask: CPU bitmap to apply
+ *
+ * Thin wrapper that forwards its arguments to virCgroupSetupCpusetCpus().
+ *
+ * Returns the result of virCgroupSetupCpusetCpus().
+ */
+int
+chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask)
+{
+    return virCgroupSetupCpusetCpus(cgroup, cpumask);
+}
+
+/**
+ * chSetupGlobalCpuCgroup:
+ * @vm: domain object
+ *
+ * Apply the domain-wide CPU bandwidth (global_period / global_quota) to
+ * the domain cgroup. Requesting either value without a cpu controller
+ * is reported as an error.
+ *
+ * Returns 0 on success or when there is nothing to do, -1 on error.
+ */
+int
+chSetupGlobalCpuCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    unsigned long long period = vm->def->cputune.global_period;
+    long long quota = vm->def->cputune.global_quota;
+    g_autofree char *mem_mask = NULL;
+    virDomainNumatuneMemMode mem_mode;
+
+    if ((period || quota) &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("cgroup cpu is required for scheduler tuning"));
+        return -1;
+    }
+
+    /*
+     * If CPU cgroup controller is not initialized here, then we need
+     * neither period nor quota settings.  And if CPUSET controller is
+     * not initialized either, then there's nothing to do anyway.
+     */
+    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return 0;
+
+
+    /* NOTE(review): mem_mask is formatted here but never applied in this
+     * function; the call only matters through its failure path. Confirm
+     * whether the mask was meant to be written to the cgroup. */
+    if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
+        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
+        virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
+                                            priv->autoNodeset,
+                                            &mem_mask, -1) < 0)
+        return -1;
+
+    if (period || quota) {
+        if (chSetupCgroupVcpuBW(priv->cgroup, period, quota) < 0)
+            return -1;
+    }
+
+    return 0;
+}
+
+
+/**
+ * chRemoveCgroup:
+ * @vm: domain object
+ *
+ * Terminate the machine and remove the domain cgroup. A domain without
+ * a cgroup is treated as success.
+ *
+ * Returns 0 when there is no cgroup, otherwise the result of
+ * virCgroupRemove().
+ */
+int
+chRemoveCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    /* Not supported, so claim success */
+    if (!priv->cgroup)
+        return 0;
+
+    /* A termination failure is only logged; removal is still attempted. */
+    if (virCgroupTerminateMachine(priv->machineName) < 0 &&
+        !virCgroupNewIgnoreError())
+        VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
+
+    return virCgroupRemove(priv->cgroup);
+}
+
+
+/**
+ * chCgroupEmulatorAllNodesDataFree:
+ * @data: structure to release, may be NULL
+ *
+ * Free the emulator cgroup handle, the saved memory mask and the
+ * structure itself.
+ */
+static void
+chCgroupEmulatorAllNodesDataFree(chCgroupEmulatorAllNodesData * data)
+{
+    if (data) {
+        virCgroupFree(data->emulatorCgroup);
+        g_free(data->emulatorMemMask);
+        g_free(data);
+    }
+}
+
+
+/**
+ * chCgroupEmulatorAllNodesAllow:
+ * @cgroup: domain cgroup pointer
+ * @retData: filled with the structure needed to undo the operation
+ *
+ * Temporarily allow all host NUMA memory nodes for the emulator thread
+ * cgroup of cloud hypervisor. This is necessary when hotplugging cpus
+ * since it requires memory allocated in the DMA region. The previous
+ * memory mask is saved in @retData so that
+ * chCgroupEmulatorAllNodesRestore can revert the change.
+ *
+ * When NUMA is unavailable or there is no cpuset controller, nothing is
+ * done and @retData is left untouched.
+ *
+ * Returns 0 on success -1 on error
+ */
+int
+chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
+                              chCgroupEmulatorAllNodesData ** retData)
+{
+    chCgroupEmulatorAllNodesData *saved = NULL;
+    g_autofree char *hostNodesStr = NULL;
+    g_autoptr(virBitmap) hostNodes = NULL;
+
+    if (!virNumaIsAvailable() ||
+        !virCgroupHasController(cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return 0;
+
+    if (!(hostNodes = virNumaGetHostMemoryNodeset()))
+        return -1;
+
+    if (!(hostNodesStr = virBitmapFormat(hostNodes)))
+        return -1;
+
+    saved = g_new0(chCgroupEmulatorAllNodesData, 1);
+
+    /* Remember the current mask before widening it to every node. */
+    if (virCgroupNewThread(cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
+                           false, &saved->emulatorCgroup) < 0 ||
+        virCgroupGetCpusetMems(saved->emulatorCgroup,
+                               &saved->emulatorMemMask) < 0 ||
+        virCgroupSetCpusetMems(saved->emulatorCgroup, hostNodesStr) < 0) {
+        chCgroupEmulatorAllNodesDataFree(saved);
+        return -1;
+    }
+
+    *retData = saved;
+    return 0;
+}
+
+
+/**
+ * chCgroupEmulatorAllNodesRestore:
+ * @data: data structure created by chCgroupEmulatorAllNodesAllow
+ *
+ * Write the saved memory mask back to the emulator cgroup, undoing
+ * chCgroupEmulatorAllNodesAllow, and free @data. The last reported
+ * error is preserved across the call. A NULL @data is a no-op.
+ */
+void
+chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data)
+{
+    virError *savedErr;
+
+    if (data == NULL)
+        return;
+
+    virErrorPreserveLast(&savedErr);
+    virCgroupSetCpusetMems(data->emulatorCgroup, data->emulatorMemMask);
+    virErrorRestore(&savedErr);
+
+    chCgroupEmulatorAllNodesDataFree(data);
+}
diff --git a/src/ch/ch_cgroup.h b/src/ch/ch_cgroup.h
new file mode 100644
index 0000000000..0152b5477c
--- /dev/null
+++ b/src/ch/ch_cgroup.h
@@ -0,0 +1,45 @@
+/*
+ * ch_cgroup.h: CH cgroup management
+ *
+ * Copyright Microsoft Corp. 2020-2021
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "virusb.h"
+#include "vircgroup.h"
+#include "domain_conf.h"
+#include "ch_conf.h"
+
+/* Detect and attach to the cgroup of an already running domain. */
+int chConnectCgroup(virDomainObj * vm);
+/* Create the domain cgroup and apply blkio/memory/cpu/cpuset settings. */
+int chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes);
+/* Forward @period and @quota to virCgroupSetupCpuPeriodQuota(). */
+int chSetupCgroupVcpuBW(virCgroup * cgroup,
+                        unsigned long long period, long long quota);
+/* Forward @cpumask to virCgroupSetupCpusetCpus(). */
+int chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask);
+/* Apply the domain-wide CPU period/quota to the domain cgroup. */
+int chSetupGlobalCpuCgroup(virDomainObj * vm);
+/* Terminate the machine and remove the domain cgroup. */
+int chRemoveCgroup(virDomainObj * vm);
+
+typedef struct _chCgroupEmulatorAllNodesData chCgroupEmulatorAllNodesData;
+
+/* State saved by chCgroupEmulatorAllNodesAllow() for the later restore. */
+struct _chCgroupEmulatorAllNodesData {
+    virCgroup *emulatorCgroup;  /* emulator thread cgroup */
+    char *emulatorMemMask;      /* cpuset.mems value before the change */
+};
+
+/* Temporarily allow all host memory nodes for the emulator cgroup. */
+int chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
+                                  chCgroupEmulatorAllNodesData ** data);
+/* Undo chCgroupEmulatorAllNodesAllow() and free @data. */
+void chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data);
diff --git a/src/ch/ch_conf.c b/src/ch/ch_conf.c
index ed0fffe5d6..7f70452296 100644
--- a/src/ch/ch_conf.c
+++ b/src/ch/ch_conf.c
@@ -141,6 +141,8 @@ virCHDriverConfigNew(bool privileged)
     if (!(cfg = virObjectNew(virCHDriverConfigClass)))
         return NULL;
 
+    cfg->cgroupControllers = -1; /* Auto detect */
+
     if (privileged) {
         if (virGetUserID(CH_USER, &cfg->user) < 0)
             return NULL;
diff --git a/src/ch/ch_conf.h b/src/ch/ch_conf.h
index 49f286f97a..19deb8e568 100644
--- a/src/ch/ch_conf.h
+++ b/src/ch/ch_conf.h
@@ -35,11 +35,13 @@ struct _virCHDriverConfig {
 
     char *stateDir;
     char *logDir;
-
+    int cgroupControllers;
     uid_t user;
     gid_t group;
 };
 
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(virCHDriverConfig, virObjectUnref);
+
 struct _virCHDriver
 {
     virMutex lock;
diff --git a/src/ch/ch_domain.c b/src/ch/ch_domain.c
index e1030800aa..d0aaeed1f4 100644
--- a/src/ch/ch_domain.c
+++ b/src/ch/ch_domain.c
@@ -326,6 +326,39 @@ chValidateDomainDeviceDef(const virDomainDeviceDef *dev,
                        _("Serial can only be enabled for a PTY"));
         return -1;
     }
+    return 0;
+}
+/**
+ * virCHDomainRefreshThreadInfo:
+ * @vm: domain object
+ *
+ * Refresh the thread list from the monitor and record the thread id of
+ * every reported vCPU thread in the matching vCPU private data. A
+ * warning is logged when the number of vCPU threads found differs from
+ * the maximum vCPU count of the definition.
+ *
+ * Returns 0 always.
+ */
+int
+virCHDomainRefreshThreadInfo(virDomainObj *vm)
+{
+    size_t maxvcpus = virDomainDefGetVcpusMax(vm->def);
+    virCHMonitorThreadInfo *info = NULL;
+    size_t nthreads, ncpus = 0;
+    size_t i;
+
+    nthreads = virCHMonitorGetThreadInfo(virCHDomainGetMonitor(vm),
+                                         true, &info);
+
+    for (i = 0; i < nthreads; i++) {
+        virCHDomainVcpuPrivate *vcpupriv;
+        virDomainVcpuDef *vcpu;
+        virCHMonitorCPUInfo *vcpuInfo;
+
+        if (info[i].type != virCHThreadTypeVcpu)
+            continue;
+
+        /* TODO: hotplug support */
+        vcpuInfo = &info[i].vcpuInfo;
+
+        /* The cpuid comes from a parsed thread name; do not dereference
+         * the lookup result if it does not match a defined vCPU. */
+        if (!(vcpu = virDomainDefGetVcpu(vm->def, vcpuInfo->cpuid))) {
+            VIR_WARN("Ignoring thread for unknown vcpu %d", vcpuInfo->cpuid);
+            continue;
+        }
+
+        vcpupriv = CH_DOMAIN_VCPU_PRIVATE(vcpu);
+        vcpupriv->tid = vcpuInfo->tid;
+        ncpus++;
+    }
+
+    /* TODO: Remove the warning when hotplug is implemented. */
+    if (ncpus != maxvcpus)
+        VIR_WARN("Mismatch in the number of cpus, expected: %zu, actual: %zu",
+                 maxvcpus, ncpus);
 
     return 0;
 }
diff --git a/src/ch/ch_domain.h b/src/ch/ch_domain.h
index 3ac3421015..2ce3e2cef3 100644
--- a/src/ch/ch_domain.h
+++ b/src/ch/ch_domain.h
@@ -89,7 +89,8 @@ virCHDomainObjBeginJob(virDomainObj *obj, enum virCHDomainJob job)
 void
 virCHDomainObjEndJob(virDomainObj *obj);
 
-int virCHDomainRefreshVcpuInfo(virDomainObj *vm);
+int virCHDomainRefreshThreadInfo(virDomainObj *vm);
+
 pid_t virCHDomainGetVcpuPid(virDomainObj *vm, unsigned int vcpuid);
 bool virCHDomainHasVcpuPids(virDomainObj *vm);
 
diff --git a/src/ch/ch_monitor.c b/src/ch/ch_monitor.c
index c0ae031200..095779cb3f 100644
--- a/src/ch/ch_monitor.c
+++ b/src/ch/ch_monitor.c
@@ -41,6 +41,7 @@ VIR_LOG_INIT("ch.ch_monitor");
 
 static virClass *virCHMonitorClass;
 static void virCHMonitorDispose(void *obj);
+static void virCHMonitorThreadInfoFree(virCHMonitor *mon);
 
 static int virCHMonitorOnceInit(void)
 {
@@ -571,6 +572,7 @@ static void virCHMonitorDispose(void *opaque)
     virCHMonitor *mon = opaque;
 
     VIR_DEBUG("mon=%p", mon);
+    virCHMonitorThreadInfoFree(mon);
     virObjectUnref(mon->vm);
 }
 
@@ -736,6 +738,114 @@ virCHMonitorGet(virCHMonitor *mon, const char *endpoint, virJSONValue **response
     return ret;
 }
 
+/**
+ * virCHMonitorGetInfo:
+ * @mon: Pointer to the monitor
+ * @info: Filled with the JSON reply of the URL_VM_INFO endpoint
+ *
+ * Query the VM info through virCHMonitorGet() and store it in @info.
+ *
+ * Returns the result of virCHMonitorGet().
+ */
+int
+virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
+{
+    return virCHMonitorGet(mon, URL_VM_INFO, info);
+}
+
+/**
+ * virCHMonitorThreadInfoFree:
+ * @mon: Pointer to the monitor
+ *
+ * Drop the cached thread list of @mon and reset its thread count.
+ */
+static void
+virCHMonitorThreadInfoFree(virCHMonitor *mon)
+{
+    VIR_FREE(mon->threads);
+    mon->nthreads = 0;
+}
+
+/**
+ * virCHMonitorRefreshThreadInfo:
+ * @mon: Pointer to the monitor
+ *
+ * Rebuild the cached thread list of @mon. Every task of the VM process
+ * is classified by the name read from /proc/<pid>/task/<tid>/comm:
+ * "vcpu<N>" becomes a vCPU thread, names starting with "_disk", "_net"
+ * or "_rng" become I/O threads, anything else an emulator thread.
+ * Tasks whose name cannot be read or parsed are skipped; the resulting
+ * array is kept dense so that the first mon->nthreads entries are valid.
+ *
+ * Returns the number of threads stored, 0 if the task list is unavailable.
+ */
+static size_t
+virCHMonitorRefreshThreadInfo(virCHMonitor *mon)
+{
+    virCHMonitorThreadInfo *info = NULL;
+    g_autofree pid_t *tids = NULL;
+    virDomainObj *vm = mon->vm;
+    size_t ntids = 0;
+    size_t i;
+
+    virCHMonitorThreadInfoFree(mon);
+    if (virProcessGetPids(vm->pid, &ntids, &tids) < 0) {
+        mon->threads = NULL;
+        return 0;
+    }
+
+    info = g_new0(virCHMonitorThreadInfo, ntids);
+    for (i = 0; i < ntids; i++) {
+        g_autofree char *proc = NULL;
+        g_autofree char *data = NULL;
+        /* Write to the next free slot rather than to slot i, so skipped
+         * tasks do not leave zeroed holes inside the counted range. */
+        virCHMonitorThreadInfo *cur = &info[mon->nthreads];
+
+        proc = g_strdup_printf("/proc/%d/task/%d/comm",
+                               (int)vm->pid, (int)tids[i]);
+
+        if (virFileReadAll(proc, (1<<16), &data) < 0)
+            continue;
+
+        VIR_DEBUG("VM PID: %d, TID %d, COMM: %s",
+                  (int)vm->pid, (int)tids[i], data);
+        if (STRPREFIX(data, "vcpu")) {
+            int cpuid;
+            char *tmp;
+
+            /* The index is a plain decimal number; base 10 avoids octal
+             * or hex interpretation of the suffix. */
+            if (virStrToLong_i(data + 4, &tmp, 10, &cpuid) < 0 ||
+                cpuid < 0) {
+                VIR_WARN("Index is not specified correctly");
+                continue;
+            }
+            cur->type = virCHThreadTypeVcpu;
+            cur->vcpuInfo.tid = tids[i];
+            cur->vcpuInfo.online = true;
+            cur->vcpuInfo.cpuid = cpuid;
+            VIR_DEBUG("vcpu%d -> tid: %d", cpuid, tids[i]);
+        } else if (STRPREFIX(data, "_disk") || STRPREFIX(data, "_net") ||
+                   STRPREFIX(data, "_rng")) {
+            /* Prefixes used by cloud-hypervisor for IO Threads are captured at
+             * https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/vmm/src/device_manager.rs
+             */
+            cur->type = virCHThreadTypeIO;
+            cur->ioInfo.tid = tids[i];
+            virStrcpy(cur->ioInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
+        } else {
+            cur->type = virCHThreadTypeEmulator;
+            cur->emuInfo.tid = tids[i];
+            virStrcpy(cur->emuInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
+        }
+        mon->nthreads++;
+    }
+    mon->threads = info;
+
+    return mon->nthreads;
+}
+
+/**
+ * virCHMonitorGetThreadInfo:
+ * @mon: Pointer to the monitor
+ * @refresh: Refresh thread info or not
+ * @threads: Filled with the monitor's thread array (owned by @mon)
+ *
+ * Retrieve thread info and store it in @threads. When @refresh is false
+ * the list cached by the previous refresh is returned as is.
+ *
+ * Returns count of threads in @threads.
+ */
+size_t
+virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
+                          virCHMonitorThreadInfo **threads)
+{
+    if (refresh)
+        virCHMonitorRefreshThreadInfo(mon);
+
+    *threads = mon->threads;
+
+    /* Report the cached count, so a non-refreshing call still describes
+     * the array handed back in @threads. */
+    return mon->nthreads;
+}
+
 int
 virCHMonitorShutdownVMM(virCHMonitor *mon)
 {
@@ -810,18 +920,3 @@ virCHMonitorResumeVM(virCHMonitor *mon)
 {
     return virCHMonitorPutNoContent(mon, URL_VM_RESUME);
 }
-
-/**
- * virCHMonitorGetInfo:
- * @mon: Pointer to the monitor
- * @info: Get VM info
- *
- * Retrieve the VM info and store in @info
- *
- * Returns 0 on success.
- */
-int
-virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
-{
-    return virCHMonitorGet(mon, URL_VM_INFO, info);
-}
diff --git a/src/ch/ch_monitor.h b/src/ch/ch_monitor.h
index 8ca9e17a9a..f8c3fa75e8 100644
--- a/src/ch/ch_monitor.h
+++ b/src/ch/ch_monitor.h
@@ -37,6 +37,50 @@
 #define URL_VM_RESUME "vm.resume"
 #define URL_VM_INFO "vm.info"
 
+/* Size of the buffers holding a thread name in the structs below */
+#define VIRCH_THREAD_NAME_LEN   16
+
+/* Classification of a thread of the VM process */
+typedef enum {
+    virCHThreadTypeEmulator,
+    virCHThreadTypeVcpu,
+    virCHThreadTypeIO,
+    virCHThreadTypeMax
+} virCHThreadType;
+
+typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
+
+/* Details of a vCPU thread */
+struct _virCHMonitorCPUInfo {
+    int cpuid;      /* vCPU index */
+    pid_t tid;      /* thread id */
+
+    bool online;
+};
+
+typedef struct _virCHMonitorEmuThreadInfo virCHMonitorEmuThreadInfo;
+
+/* Details of an emulator thread */
+struct _virCHMonitorEmuThreadInfo {
+    char    thrName[VIRCH_THREAD_NAME_LEN];
+    pid_t   tid;
+};
+
+typedef struct _virCHMonitorIOThreadInfo virCHMonitorIOThreadInfo;
+
+/* Details of an I/O thread */
+struct _virCHMonitorIOThreadInfo {
+    char    thrName[VIRCH_THREAD_NAME_LEN];
+    pid_t   tid;
+};
+
+typedef struct _virCHMonitorThreadInfo virCHMonitorThreadInfo;
+
+/* One thread entry; @type selects which union member is meaningful */
+struct _virCHMonitorThreadInfo {
+    virCHThreadType type;
+
+    union {
+        virCHMonitorCPUInfo vcpuInfo;
+        virCHMonitorEmuThreadInfo emuInfo;
+        virCHMonitorIOThreadInfo ioInfo;
+    };
+};
+
 typedef struct _virCHMonitor virCHMonitor;
 
 struct _virCHMonitor {
@@ -49,6 +93,9 @@ struct _virCHMonitor {
     pid_t pid;
 
     virDomainObj *vm;
+
+    size_t nthreads;
+    virCHMonitorThreadInfo *threads;
 };
 
 virCHMonitor *virCHMonitorNew(virDomainObj *vm, const char *socketdir);
@@ -65,12 +112,9 @@ int virCHMonitorSuspendVM(virCHMonitor *mon);
 int virCHMonitorResumeVM(virCHMonitor *mon);
 int virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info);
 
-typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
-struct _virCHMonitorCPUInfo {
-    pid_t tid;
-    bool online;
-};
 void virCHMonitorCPUInfoFree(virCHMonitorCPUInfo *cpus);
 int virCHMonitorGetCPUInfo(virCHMonitor *mon,
                        virCHMonitorCPUInfo **vcpus,
                        size_t maxvcpus);
+size_t virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
+                                 virCHMonitorThreadInfo **threads);
diff --git a/src/ch/ch_process.c b/src/ch/ch_process.c
index 3b7f6fcddf..8dce737adb 100644
--- a/src/ch/ch_process.c
+++ b/src/ch/ch_process.c
@@ -26,6 +26,8 @@
 #include "ch_domain.h"
 #include "ch_monitor.h"
 #include "ch_process.h"
+#include "ch_cgroup.h"
+#include "virnuma.h"
 #include "viralloc.h"
 #include "virerror.h"
 #include "virjson.h"
@@ -133,6 +135,257 @@ virCHProcessUpdateInfo(virDomainObj *vm)
     return 0;
 }
 
+/**
+ * virCHProcessGetAllCpuAffinity:
+ * @cpumapRet: filled with the online host CPU bitmap, or NULL
+ *
+ * Fetch the bitmap of online host CPUs when the host can provide one.
+ * @cpumapRet is set to NULL when no bitmap is available.
+ *
+ * Returns 0 on success, -1 if the bitmap could not be obtained.
+ */
+static int
+virCHProcessGetAllCpuAffinity(virBitmap **cpumapRet)
+{
+    *cpumapRet = NULL;
+
+    if (virHostCPUHasBitmap()) {
+        *cpumapRet = virHostCPUGetOnlineBitmap();
+        if (!*cpumapRet)
+            return -1;
+    }
+
+    return 0;
+}
+
+#if defined(WITH_SCHED_GETAFFINITY) || defined(WITH_BSD_CPU_AFFINITY)
+/**
+ * virCHProcessInitCpuAffinity:
+ * @vm: domain object
+ *
+ * Set the initial CPU affinity of the VM process. The CPU set is taken,
+ * in order of preference, from the strict numatune nodeset (for domains
+ * with at most one guest NUMA node), the emulatorpin setting, or the
+ * set of online host CPUs. If no CPU set can be determined the affinity
+ * is left unchanged.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+virCHProcessInitCpuAffinity(virDomainObj *vm)
+{
+    g_autoptr(virBitmap) cpumapToSet = NULL;
+    virDomainNumatuneMemMode mem_mode;
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (!vm->pid) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("Cannot setup CPU affinity until process is started"));
+        return -1;
+    }
+
+    if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
+        virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
+        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
+        virBitmap *nodeset = NULL;
+
+        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
+                                             priv->autoNodeset,
+                                             &nodeset,
+                                             -1) < 0)
+            return -1;
+
+        if (virNumaNodesetToCPUset(nodeset, &cpumapToSet) < 0)
+            return -1;
+    } else if (vm->def->cputune.emulatorpin) {
+        if (!(cpumapToSet = virBitmapNewCopy(vm->def->cputune.emulatorpin)))
+            return -1;
+    } else {
+        if (virCHProcessGetAllCpuAffinity(&cpumapToSet) < 0)
+            return -1;
+    }
+
+    if (cpumapToSet &&
+        virProcessSetAffinity(vm->pid, cpumapToSet, false) < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+#else /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
+/* Affinity is not supported in this build: nothing to set up. */
+static int
+virCHProcessInitCpuAffinity(virDomainObj *vm G_GNUC_UNUSED)
+{
+    return 0;
+}
+#endif /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
+
+/**
+ * virCHProcessSetupPid:
+ * @vm: domain object
+ * @pid: thread or process id to configure
+ * @nameval: kind of thread cgroup to create for @pid
+ * @id: index used for the thread cgroup
+ * @cpumask: explicit CPU mask for @pid, or NULL to infer one
+ * @period: CPU bandwidth period, 0 for none
+ * @quota: CPU bandwidth quota, 0 for none
+ * @sched: scheduler settings, or NULL
+ *
+ * This function sets resource properties (affinity, cgroups,
+ * scheduler) for any PID associated with a domain.  It should be used
+ * to set up emulator PIDs as well as vCPU and I/O thread pids to
+ * ensure they are all handled the same way.
+ *
+ * On failure the thread cgroup created here is removed again.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+virCHProcessSetupPid(virDomainObj *vm,
+                     pid_t pid,
+                     virCgroupThreadName nameval,
+                     int id,
+                     virBitmap *cpumask,
+                     unsigned long long period,
+                     long long quota,
+                     virDomainThreadSchedParam *sched)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    virDomainNumatuneMemMode mem_mode;
+    virCgroup *cgroup = NULL;
+    virBitmap *use_cpumask = NULL;
+    virBitmap *affinity_cpumask = NULL;
+    g_autoptr(virBitmap) hostcpumap = NULL;
+    g_autofree char *mem_mask = NULL;
+    int ret = -1;
+
+    if ((period || quota) &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("cgroup cpu is required for scheduler tuning"));
+        goto cleanup;
+    }
+
+    /* Infer which cpumask shall be used. */
+    if (cpumask) {
+        use_cpumask = cpumask;
+    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
+        use_cpumask = priv->autoCpuset;
+    } else if (vm->def->cpumask) {
+        use_cpumask = vm->def->cpumask;
+    } else {
+        /* we can't assume cloud-hypervisor itself is running on all pCPUs,
+         * so we need to explicitly set the spawned instance to all pCPUs. */
+        if (virCHProcessGetAllCpuAffinity(&hostcpumap) < 0)
+            goto cleanup;
+        affinity_cpumask = hostcpumap;
+    }
+
+    /*
+     * If CPU cgroup controller is not initialized here, then we need
+     * neither period nor quota settings.  And if CPUSET controller is
+     * not initialized either, then there's nothing to do anyway.
+     */
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
+        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
+
+        if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
+            mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
+            virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
+                                                priv->autoNodeset,
+                                                &mem_mask, -1) < 0)
+            goto cleanup;
+
+        if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0)
+            goto cleanup;
+
+        if (virCgroupHasController(priv->cgroup,
+                                   VIR_CGROUP_CONTROLLER_CPUSET)) {
+            if (use_cpumask &&
+                chSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
+                goto cleanup;
+
+            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
+                goto cleanup;
+
+        }
+
+        if ((period || quota) &&
+            chSetupCgroupVcpuBW(cgroup, period, quota) < 0)
+            goto cleanup;
+
+        /* Move the thread to the sub dir */
+        VIR_INFO("Adding pid %d to cgroup", pid);
+        if (virCgroupAddThread(cgroup, pid) < 0)
+            goto cleanup;
+
+    }
+
+    /* Without a host-wide fallback, pin to the mask inferred above. */
+    if (!affinity_cpumask)
+        affinity_cpumask = use_cpumask;
+
+    /* Setup legacy affinity. */
+    if (affinity_cpumask &&
+        virProcessSetAffinity(pid, affinity_cpumask, false) < 0)
+        goto cleanup;
+
+    /* Set scheduler type and priority, but not for the main thread. */
+    if (sched &&
+        nameval != VIR_CGROUP_THREAD_EMULATOR &&
+        virProcessSetScheduler(pid, sched->policy, sched->priority) < 0)
+        goto cleanup;
+
+    ret = 0;
+ cleanup:
+    if (cgroup) {
+        if (ret < 0)
+            virCgroupRemove(cgroup);
+        virCgroupFree(cgroup);
+    }
+
+    return ret;
+}
+
+/**
+ * virCHProcessSetupVcpu:
+ * @vm: domain object
+ * @vcpuid: id of VCPU to set defaults
+ *
+ * Configure cgroups, affinity and scheduler for one vCPU thread using
+ * the per-vCPU mask and scheduler plus the domain-wide period and quota.
+ * The caller must make sure the vCPU is online and that the vCPU pids
+ * have already been detected.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+virCHProcessSetupVcpu(virDomainObj *vm,
+                      unsigned int vcpuid)
+{
+    pid_t tid = virCHDomainGetVcpuPid(vm, vcpuid);
+    virDomainVcpuDef *vcpudef = virDomainDefGetVcpu(vm->def, vcpuid);
+    unsigned long long period = vm->def->cputune.period;
+    long long quota = vm->def->cputune.quota;
+
+    return virCHProcessSetupPid(vm, tid, VIR_CGROUP_THREAD_VCPU,
+                                vcpuid, vcpudef->cpumask,
+                                period, quota,
+                                &vcpudef->sched);
+}
+
+/**
+ * virCHProcessSetupVcpus:
+ * @vm: domain object
+ *
+ * Apply per-vCPU tuning to every online vCPU. When the vCPU thread ids
+ * are not known, no tuning can be applied, so any online vCPU whose
+ * cpumask differs from the domain default is rejected instead.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+virCHProcessSetupVcpus(virDomainObj *vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    virDomainVcpuDef *vcpu;
+    unsigned int maxvcpus = virDomainDefGetVcpusMax(vm->def);
+    size_t i;
+
+    if ((vm->def->cputune.period || vm->def->cputune.quota) &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("cgroup cpu is required for scheduler tuning"));
+        return -1;
+    }
+
+    if (!virCHDomainHasVcpuPids(vm)) {
+        /* If any CPU has custom affinity that differs from the
+         * VM default affinity, we must reject it */
+        for (i = 0; i < maxvcpus; i++) {
+            vcpu = virDomainDefGetVcpu(vm->def, i);
+
+            if (!vcpu->online)
+                continue;
+
+            if (vcpu->cpumask &&
+                !virBitmapEqual(vm->def->cpumask, vcpu->cpumask)) {
+                virReportError(VIR_ERR_OPERATION_INVALID, "%s",
+                                _("cpu affinity is not supported"));
+                return -1;
+            }
+        }
+
+        return 0;
+    }
+
+    for (i = 0; i < maxvcpus; i++) {
+        vcpu = virDomainDefGetVcpu(vm->def, i);
+
+        if (!vcpu->online)
+            continue;
+
+        if (virCHProcessSetupVcpu(vm, i) < 0)
+            return -1;
+    }
+
+    return 0;
+}
+
 /**
  * virCHProcessStart:
  * @driver: pointer to driver structure
@@ -168,18 +421,33 @@ int virCHProcessStart(virCHDriver *driver,
         }
     }
 
+    vm->pid = priv->monitor->pid;
+    vm->def->id = vm->pid;
+    priv->machineName = virCHDomainGetMachineName(vm);
+
+    if (chSetupCgroup(vm, nnicindexes, nicindexes) < 0)
+        goto cleanup;
+
+    if (virCHProcessInitCpuAffinity(vm) < 0)
+        goto cleanup;
+
     if (virCHMonitorBootVM(priv->monitor) < 0) {
         virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                        _("failed to boot guest VM"));
         goto cleanup;
     }
 
-    priv->machineName = virCHDomainGetMachineName(vm);
-    vm->pid = priv->monitor->pid;
-    vm->def->id = vm->pid;
+    virCHDomainRefreshThreadInfo(vm);
 
-    virCHProcessUpdateInfo(vm);
+    VIR_DEBUG("Setting global CPU cgroup (if required)");
+    if (chSetupGlobalCpuCgroup(vm) < 0)
+        goto cleanup;
+
+    VIR_DEBUG("Setting vCPU tuning/settings");
+    if (virCHProcessSetupVcpus(vm) < 0)
+        goto cleanup;
 
+    virCHProcessUpdateInfo(vm);
     virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
 
     return 0;
@@ -195,6 +463,8 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
                      virDomainObj *vm,
                      virDomainShutoffReason reason)
 {
+    int ret;
+    int retries = 0;
     virCHDomainObjPrivate *priv = vm->privateData;
 
     VIR_DEBUG("Stopping VM name=%s pid=%d reason=%d",
@@ -205,6 +475,16 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
         priv->monitor = NULL;
     }
 
+    retry:
+        if ((ret = chRemoveCgroup(vm)) < 0) {
+            if (ret == -EBUSY && (retries++ < 5)) {
+                g_usleep(200*1000);
+                goto retry;
+            }
+            VIR_WARN("Failed to remove cgroup for %s",
+                    vm->def->name);
+        }
+
     vm->pid = -1;
     vm->def->id = -1;
 
diff --git a/src/ch/ch_process.h b/src/ch/ch_process.h
index abc4915979..800e3f4e23 100644
--- a/src/ch/ch_process.h
+++ b/src/ch/ch_process.h
@@ -29,3 +29,6 @@ int virCHProcessStart(virCHDriver *driver,
 int virCHProcessStop(virCHDriver *driver,
                      virDomainObj *vm,
                      virDomainShutoffReason reason);
+
+int virCHProcessSetupVcpu(virDomainObj *vm,
+                          unsigned int vcpuid);
diff --git a/src/ch/meson.build b/src/ch/meson.build
index 2b2bdda26c..0b20de56fd 100644
--- a/src/ch/meson.build
+++ b/src/ch/meson.build
@@ -1,6 +1,8 @@
 ch_driver_sources = [
   'ch_conf.c',
   'ch_conf.h',
+  'ch_cgroup.c',
+  'ch_cgroup.h',
   'ch_domain.c',
   'ch_domain.h',
   'ch_driver.c',
-- 
2.27.0


Reply via email to