The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/6967
This e-mail was sent by the LXC bot, direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
From bd3292244c2a35fd6ae12c18452aa2c2be96ffe4 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Mon, 2 Mar 2020 15:27:05 +0100 Subject: [PATCH 1/4] seccomp: handle hugetlbfs mount syscall interception hugetlbfs already allocates new instances on every mount. We can deal with the filesystem through existing mount interception. We just need to add automatic handling of uid and gid mount options when detecting a hugetlbfs mount. We also need to have hugetlbfs bypass the shifted option as we don't want to put shiftfs on top of it when we already set the correct uid/gid. Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- lxd/seccomp/seccomp.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go index f996bb19d9..97b11c6c14 100644 --- a/lxd/seccomp/seccomp.go +++ b/lxd/seccomp/seccomp.go @@ -1249,6 +1249,15 @@ func mountFlagsToOpts(flags C.ulong) string { return opts } +// mountHandleHugetlbfsArgs adds user namespace root uid and gid to the +// hugetlbfs mount options to make it useable in unprivileged containers. +func (s *Server) mountHandleHugetlbfsArgs(args *MountArgs, nsuid int64, nsgid int64) { + if args.fstype == "hugetlbfs" && args.data == "" { + args.data = fmt.Sprintf("uid=%d,gid=%d", nsuid, nsgid) + args.shift = false + } +} + // HandleMountSyscall handles mount syscalls. 
func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int { ctx := log.Ctx{"container": c.Name(), @@ -1339,6 +1348,8 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int { return 0 } + s.mountHandleHugetlbfsArgs(&args, nsuid, nsgid) + if fuseBinary != "" { // Record ignored flags for debugging purposes flags := C.ulong(args.flags) From 8174951d490bcbcec07211f7e7728e86a233ab24 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Mon, 2 Mar 2020 17:34:28 +0100 Subject: [PATCH 2/4] doc: add container_syscall_intercept_hugetlbfs Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- doc/api-extensions.md | 6 ++++++ shared/version/api.go | 1 + 2 files changed, 7 insertions(+) diff --git a/doc/api-extensions.md b/doc/api-extensions.md index 5d9fd59a8a..2bedc3a5ab 100644 --- a/doc/api-extensions.md +++ b/doc/api-extensions.md @@ -926,3 +926,9 @@ Introduces the ability to create a storage pool from an existing non-empty volum This option should be used with care, as LXD can then not guarantee that volume name conflicts won't occur with non-LXD created volumes in the same volume group. This could also potentially lead to LXD deleting a non-LXD volume should name conflicts occur. + +## container\_syscall\_intercept\_hugetlbfs +When mount syscall interception is enabled and hugetlbfs is specified as an +allowed filesystem type LXD will mount a separate hugetlbfs instance for the +container with the uid and gid mount options set to the container's root uid +and gid. This ensure that processes in the container can use hugepages. diff --git a/shared/version/api.go b/shared/version/api.go index 503c7d4138..fbc50b997c 100644 --- a/shared/version/api.go +++ b/shared/version/api.go @@ -190,6 +190,7 @@ var APIExtensions = []string{ "clustering_sizing", "firewall_driver", "projects_limits", + "container_syscall_intercept_hugetlbfs", } // APIExtensionsCount returns the number of available API extensions. 
From 81615cd980672594c564d5b6c9baa542650f2786 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Mon, 2 Mar 2020 17:10:18 +0100 Subject: [PATCH 3/4] limits: add limits.hugepages.* keys limits.hugepages.64KB limits.hugepages.1MB limits.hugepages.2MB limits.hugepages.1GB Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- lxd/cgroup/abstraction.go | 18 ++++++++++++++++++ lxd/cgroup/init.go | 10 ++++++++++ lxd/instance/drivers/driver_lxc.go | 26 ++++++++++++++++++++++++++ shared/instance.go | 5 +++++ 4 files changed, 59 insertions(+) diff --git a/lxd/cgroup/abstraction.go b/lxd/cgroup/abstraction.go index ab8a910cf5..61b7ccf664 100644 --- a/lxd/cgroup/abstraction.go +++ b/lxd/cgroup/abstraction.go @@ -305,3 +305,21 @@ func (cg *CGroup) SetNetIfPrio(value string) error { } return ErrUnknownVersion } + +// SetMaxHugepages applies a limit on the usage of hugepages of the given page type +func (cg *CGroup) SetMaxHugepages(pageType string, value string) error { + // Confirm we have the controller + version := cgControllers["hugetlb"] + switch version { + case Unavailable: + return ErrControllerMissing + case V1: + return cg.rw.Set(version, "hugetlb", fmt.Sprintf("hugetlb.%s.limit_in_bytes", pageType), value) + case V2: + if value == "" { + return cg.rw.Set(version, "hugetlb", fmt.Sprintf("hugetlb.%s.max", pageType), "max") + } + return cg.rw.Set(version, "hugetlb", fmt.Sprintf("hugetlb.%s.max", pageType), value) + } + return ErrUnknownVersion +} diff --git a/lxd/cgroup/init.go b/lxd/cgroup/init.go index 0bacbb45b2..de6ee08678 100644 --- a/lxd/cgroup/init.go +++ b/lxd/cgroup/init.go @@ -91,6 +91,9 @@ const ( // Freezer resource control Freezer + // Hugetlb resource control + Hugetlb + // Memory resource control Memory @@ -161,6 +164,9 @@ func (info *Info) SupportsVersion(resource Resource) (Backend, bool) { case Freezer: val, ok := cgControllers["freezer"] return val, ok + case Hugetlb: + val, ok := cgControllers["hugetlb"] + return val, ok case 
Memory: val, ok := cgControllers["memory"] return val, ok @@ -266,6 +272,10 @@ func (info *Info) Log() { logger.Warnf(" - Couldn't find the CGroup freezer controller, pausing/resuming containers won't work") } + if !info.Supports(Hugetlb, nil) { + logger.Warnf(" - Couldn't find the CGroup hugetlb controller, pausing/resuming containers won't work") + } + if !info.Supports(Memory, nil) { logger.Warnf(" - Couldn't find the CGroup memory controller, memory limits will be ignored") } diff --git a/lxd/instance/drivers/driver_lxc.go b/lxd/instance/drivers/driver_lxc.go index fc19ec5022..86ea10f1d2 100644 --- a/lxd/instance/drivers/driver_lxc.go +++ b/lxd/instance/drivers/driver_lxc.go @@ -4291,6 +4291,32 @@ func (c *lxc) Update(args db.InstanceArgs, userRequested bool) error { return err } } + } else if strings.HasPrefix(key, "limits.hugepages.") { + pageType := "" + + switch key { + case "limits.hugepages.64KB": + pageType = "64KB" + case "limits.hugepages.1MB": + pageType = "1MB" + case "limits.hugepages.2MB": + pageType = "2MB" + case "limits.hugepages.1GB": + pageType = "1GB" + } + + if value != "" { + valueInt, err := units.ParseByteSizeString(value) + if err != nil { + return err + } + value = fmt.Sprintf("%d", valueInt) + } + + err = cg.SetMaxHugepages(pageType, value) + if err != nil { + return err + } } } } diff --git a/shared/instance.go b/shared/instance.go index a9dc1ab973..a616a45bf4 100644 --- a/shared/instance.go +++ b/shared/instance.go @@ -254,6 +254,11 @@ var KnownInstanceConfigKeys = map[string]func(value string) error{ "limits.disk.priority": IsPriority, + "limits.hugepages.64KB": IsSize, + "limits.hugepages.1MB": IsSize, + "limits.hugepages.2MB": IsSize, + "limits.hugepages.1GB": IsSize, + "limits.memory": func(value string) error { if value == "" { return nil From c68c311bd8d0570157e32e6c8137f99db37fa9b9 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Mon, 2 Mar 2020 17:35:05 +0100 Subject: [PATCH 4/4] doc: add 
limits_hugepages api extension Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- doc/api-extensions.md | 7 +++++++ doc/instances.md | 4 ++++ shared/version/api.go | 1 + 3 files changed, 12 insertions(+) diff --git a/doc/api-extensions.md b/doc/api-extensions.md index 2bedc3a5ab..ee3bc99546 100644 --- a/doc/api-extensions.md +++ b/doc/api-extensions.md @@ -932,3 +932,10 @@ When mount syscall interception is enabled and hugetlbfs is specified as an allowed filesystem type LXD will mount a separate hugetlbfs instance for the container with the uid and gid mount options set to the container's root uid and gid. This ensure that processes in the container can use hugepages. + +## limits\_hugepages +This allows limiting the number of hugepages a container can use through the +hugetlb cgroup. This means the hugetlb cgroup needs to be available. Note that +limiting hugepages is recommended when intercepting the mount syscall for the +hugetlbfs filesystem to avoid allowing the container to exhaust the host's +hugepages resources. diff --git a/doc/instances.md b/doc/instances.md index 15f572b773..d3f97d8c3c 100644 --- a/doc/instances.md +++ b/doc/instances.md @@ -46,6 +46,10 @@ limits.cpu | string | - (all) | ye limits.cpu.allowance | string | 100% | yes | - | How much of the CPU can be used. Can be a percentage (e.g. 
50%) for a soft limit or hard a chunk of time (25ms/100ms) limits.cpu.priority | integer | 10 (maximum) | yes | - | CPU scheduling priority compared to other instances sharing the same CPUs (overcommit) (integer between 0 and 10) limits.disk.priority | integer | 5 (medium) | yes | - | When under load, how much priority to give to the instance's I/O requests (integer between 0 and 10) +limits.hugepages.64KB | string | - | yes | container | Fixed value in bytes (various suffixes supported, see below) to limit number of 64 KB hugepages +limits.hugepages.1MB | string | - | yes | container | Fixed value in bytes (various suffixes supported, see below) to limit number of 1 MB hugepages +limits.hugepages.2MB | string | - | yes | container | Fixed value in bytes (various suffixes supported, see below) to limit number of 2 MB hugepages +limits.hugepages.1GB | string | - | yes | container | Fixed value in bytes (various suffixes supported, see below) to limit number of 1 GB hugepages limits.kernel.\* | string | - | no | container | This limits kernel resources per instance (e.g. number of open files) limits.memory | string | - (all) | yes | - | Percentage of the host's memory or fixed value in bytes (various suffixes supported, see below) limits.memory.enforce | string | hard | yes | container | If hard, instance can't exceed its memory limit. If soft, the instance can exceed its memory limit when extra host memory is available diff --git a/shared/version/api.go b/shared/version/api.go index fbc50b997c..83df5eab7c 100644 --- a/shared/version/api.go +++ b/shared/version/api.go @@ -191,6 +191,7 @@ var APIExtensions = []string{ "firewall_driver", "projects_limits", "container_syscall_intercept_hugetlbfs", + "limits_hugepages", } // APIExtensionsCount returns the number of available API extensions.
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel