The following pull request was submitted through Github. It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/6443
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === This allows mount syscalls for selected filesystems to be intercepted and redirected to their corresponding fuse implementation. A new key security.syscalls.intercept.mount.fuse=<fstype>=<fuse-binary> is added. A filesystem cannot appear in both security.syscalls.intercept.mount.fuse and security.syscalls.intercept.mount.allowed. Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
From 00811cbb3b68840f9796e2fa939f411601ebb970 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Tue, 12 Nov 2019 23:14:13 +0100 Subject: [PATCH 1/4] seccomp: implement redirection to fuse This allows to intercept and redirect mount syscalls for filesystems and redirect them to their corresponding fuse implementation. A new key security.syscalls.intercept.mount.fuse=<fstype>=<fuse-binary> is added. Filesystems cannot both appear in security.syscalls.intercept.mount.fuse and security.syscalls.intercept.mount.allowed. Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- lxd/container.go | 6 ++ lxd/main_forksyscall.go | 31 +++++-- lxd/seccomp/seccomp.go | 190 +++++++++++++++++++++++++++++++++++----- shared/container.go | 1 + shared/util.go | 2 + 5 files changed, 199 insertions(+), 31 deletions(-) diff --git a/lxd/container.go b/lxd/container.go index 8d5d3c457b..f61f286d01 100644 --- a/lxd/container.go +++ b/lxd/container.go @@ -24,6 +24,7 @@ import ( deviceConfig "github.com/lxc/lxd/lxd/device/config" "github.com/lxc/lxd/lxd/instance/instancetype" "github.com/lxc/lxd/lxd/operations" + "github.com/lxc/lxd/lxd/seccomp" "github.com/lxc/lxd/lxd/state" storagePools "github.com/lxc/lxd/lxd/storage" storageDrivers "github.com/lxc/lxd/lxd/storage/drivers" @@ -168,6 +169,11 @@ func containerValidConfig(sysOS *sys.OS, config map[string]string, profile bool, return fmt.Errorf("security.syscalls.whitelist is mutually exclusive with security.syscalls.blacklist*") } + err, _ := seccomp.SeccompSyscallInterceptMountFilter(config) + if err != nil { + return err + } + if expanded && (config["security.privileged"] == "" || !shared.IsTrue(config["security.privileged"])) && sysOS.IdmapSet == nil { return fmt.Errorf("LXD doesn't have a uid/gid allocation. 
In this mode, only privileged containers are supported") } diff --git a/lxd/main_forksyscall.go b/lxd/main_forksyscall.go index 04738aa6d9..6b2e1ac5d1 100644 --- a/lxd/main_forksyscall.go +++ b/lxd/main_forksyscall.go @@ -381,6 +381,7 @@ static void mount_emulate(void) { __do_close_prot_errno int mnt_fd = -EBADF; char *source = NULL, *shiftfs = NULL, *target = NULL, *fstype = NULL; + bool use_fuse; uid_t uid = -1, fsuid = -1; gid_t gid = -1, fsgid = -1; int ret; @@ -389,28 +390,40 @@ static void mount_emulate(void) const void *data; pid = atoi(advance_arg(true)); - source = advance_arg(true); - target = advance_arg(true); - fstype = advance_arg(true); - flags = atoi(advance_arg(true)); - shiftfs = advance_arg(true); + use_fuse = (atoi(advance_arg(true)) == 1); + if (!use_fuse) { + source = advance_arg(true); + target = advance_arg(true); + fstype = advance_arg(true); + flags = atoi(advance_arg(true)); + shiftfs = advance_arg(true); + } uid = atoi(advance_arg(true)); gid = atoi(advance_arg(true)); fsuid = atoi(advance_arg(true)); fsgid = atoi(advance_arg(true)); - data = advance_arg(false); + if (!use_fuse) + data = advance_arg(false); mnt_fd = preserve_ns(getpid(), "mnt"); if (mnt_fd < 0) _exit(EXIT_FAILURE); + if (use_fuse) + attach_userns(pid); + if (!acquire_basic_creds(pid)) _exit(EXIT_FAILURE); if (!acquire_final_creds(pid, uid, gid, fsuid, fsgid)) _exit(EXIT_FAILURE); - if (strcmp(shiftfs, "true") == 0) { + if (use_fuse) { + const char *cmd = advance_arg(true); + ret = system(cmd); + if (ret) + _exit(EXIT_FAILURE); + } else if (strcmp(shiftfs, "true") == 0) { char template[] = P_tmpdir "/.lxd_tmp_mount_XXXXXX"; // Create basic mount in container's mount namespace. 
@@ -525,12 +538,12 @@ type cmdForksyscall struct { func (c *cmdForksyscall) Command() *cobra.Command { // Main subcommand cmd := &cobra.Command{} - cmd.Use = "forksyscall <syscall> <PID> <path> <mode> <dev>" + cmd.Use = "forksyscall <syscall> <PID> [...]" cmd.Short = "Perform syscall operations" cmd.Long = `Description: Perform syscall operations - This set of internal commands are used for all seccom-based container syscall + This set of internal commands is used for all seccomp-based container syscall operations. ` cmd.RunE = c.Run diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go index 65ff54b70f..eef84aeb68 100644 --- a/lxd/seccomp/seccomp.go +++ b/lxd/seccomp/seccomp.go @@ -45,6 +45,7 @@ import ( #include <stdint.h> #include <stdlib.h> #include <string.h> +#include <sys/mount.h> #include <sys/socket.h> #include <sys/stat.h> #include <sys/syscall.h> @@ -1176,6 +1177,77 @@ type MountArgs struct { shift bool } +// MS_REC +var mountFlagsToOptMap = map[C.ulong]string{ + C.MS_BIND: "bind", + C.ulong(0): "defaults", + C.MS_LAZYTIME: "lazytime", + C.MS_MANDLOCK: "mand", + C.MS_NOATIME: "noatime", + C.MS_NODEV: "nodev", + C.MS_NODIRATIME: "nodiratime", + C.MS_NOEXEC: "noexec", + C.MS_NOSUID: "nosuid", + C.MS_RELATIME: "relatime", + C.MS_REMOUNT: "remount", + C.MS_RDONLY: "ro", + C.MS_STRICTATIME: "strictatime", + C.MS_SYNCHRONOUS: "sync", + C.MS_PRIVATE: "--make-private", + C.MS_SHARED: "--make-shared", + C.MS_SLAVE: "--make-slave", + C.MS_UNBINDABLE: "--make-unbindable", + + C.MS_REC | C.MS_BIND: "rbind", + C.MS_REC | C.MS_PRIVATE: "--make-rprivate", + C.MS_REC | C.MS_SHARED: "--make-rshared", + C.MS_REC | C.MS_SLAVE: "--make-rslave", + C.MS_REC | C.MS_UNBINDABLE: "--make-runbindable", +} + +func mountFlagsToOpts(flags C.ulong) (string, string) { + var bit C.ulong = 0 + opts := "" + args := "" + var msRec C.ulong = (flags & C.MS_REC) + + flags = (flags &^ C.MS_REC) + for bit < (4*8 - 1) { + if (flags & (1 << bit)) > 0 { + var flagKey C.ulong = (1 << bit) + 
+ switch flagKey { + case C.MS_BIND: + fallthrough + case C.MS_PRIVATE: + fallthrough + case C.MS_SHARED: + fallthrough + case C.MS_SLAVE: + fallthrough + case C.MS_UNBINDABLE: + flagKey |= msRec + } + optOrArg := mountFlagsToOptMap[flagKey] + + if optOrArg == "" { + continue + } + + if strings.HasPrefix(optOrArg, "--") { + args = fmt.Sprintf("%s %s", args, optOrArg) + } else if opts == "" { + opts = fmt.Sprintf("%s", optOrArg) + } else { + opts = fmt.Sprintf("%s,%s", opts, optOrArg) + } + } + bit++ + } + + return opts, args +} + // HandleMountSyscall handles mount syscalls. func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int { ctx := log.Ctx{"container": c.Name(), @@ -1252,7 +1324,8 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int { args.data = C.GoString(&cBuf[0]) } - if !s.MountSyscallValid(c, &args) { + ok, fuseBinary := s.MountSyscallValid(c, &args) + if !ok { ctx["syscall_continue"] = "true" C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue)) return 0 @@ -1265,20 +1338,55 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int { return 0 } - _, _, err = shared.RunCommandSplit(nil, util.GetExecPath(), - "forksyscall", - "mount", - fmt.Sprintf("%d", args.pid), - fmt.Sprintf("%s", args.source), - fmt.Sprintf("%s", args.target), - fmt.Sprintf("%s", args.fstype), - fmt.Sprintf("%d", args.flags), - fmt.Sprintf("%t", args.shift), - fmt.Sprintf("%d", nsuid), - fmt.Sprintf("%d", nsgid), - fmt.Sprintf("%d", nsfsuid), - fmt.Sprintf("%d", nsfsgid), - fmt.Sprintf("%s", args.data)) + if fuseBinary != "" { + addOpts, addArgs := mountFlagsToOpts(C.ulong(args.flags)) + + fuseCmd := fmt.Sprintf("mount.fuse %s#%s %s", fuseBinary, args.source, args.target) + + if addArgs != "" { + fuseCmd = fmt.Sprintf("%s %s", fuseCmd, addArgs) + } + + if args.data != "" || addOpts != "" { + fuseCmd = fmt.Sprintf("%s -o", fuseCmd) + if args.data != "" && addOpts != "" { + fuseCmd = fmt.Sprintf("%s %s,%s", fuseCmd, 
args.data, addOpts) + } else if args.data != "" { + fuseCmd = fmt.Sprintf("%s %s", fuseCmd, args.data) + } else { + fuseCmd = fmt.Sprintf("%s %s", fuseCmd, addOpts) + } + } + + logger.Errorf("AAAA: %s", fuseCmd) + ctx["fuse_cmd"] = fuseCmd + _, _, err = shared.RunCommandSplit(nil, util.GetExecPath(), + "forksyscall", + "mount", + fmt.Sprintf("%d", args.pid), + fmt.Sprintf("%d", 1), + fmt.Sprintf("%d", nsuid), + fmt.Sprintf("%d", nsgid), + fmt.Sprintf("%d", nsfsuid), + fmt.Sprintf("%d", nsfsgid), + fmt.Sprintf("%s", fuseCmd)) + } else { + _, _, err = shared.RunCommandSplit(nil, util.GetExecPath(), + "forksyscall", + "mount", + fmt.Sprintf("%d", args.pid), + fmt.Sprintf("%d", 0), + fmt.Sprintf("%s", args.source), + fmt.Sprintf("%s", args.target), + fmt.Sprintf("%s", args.fstype), + fmt.Sprintf("%d", args.flags), + fmt.Sprintf("%t", args.shift), + fmt.Sprintf("%d", nsuid), + fmt.Sprintf("%d", nsgid), + fmt.Sprintf("%d", nsfsuid), + fmt.Sprintf("%d", nsfsgid), + fmt.Sprintf("%s", args.data)) + } if err != nil { ctx["syscall_continue"] = "true" C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue)) @@ -1390,16 +1498,54 @@ func MountSyscallFilter(config map[string]string) []string { return fs } -// MountSyscallValid checks whether this is a mount syscall we intercept. 
-func (s *Server) MountSyscallValid(c Instance, args *MountArgs) bool { - fsList := MountSyscallFilter(c.ExpandedConfig()) - for _, fs := range fsList { - if fs == args.fstype { - return true +// SeccompSyscallInterceptMountFilter creates a new mount syscall interception filter +func SeccompSyscallInterceptMountFilter(config map[string]string) (error, map[string]string) { + if !shared.IsTrue(config["security.syscalls.intercept.mount"]) { + return nil, map[string]string{} + + } + + fsMap := map[string]string{} + fsFused := strings.Split(config["security.syscalls.intercept.mount.fuse"], ",") + if len(fsFused) > 0 && fsFused[0] != "" { + for _, ent := range fsFused { + fsfuse := strings.Split(ent, "=") + if len(fsfuse) != 2 { + return fmt.Errorf("security.syscalls.intercept.mount.fuse is not of the form 'filesystem=fuse-binary': %s", ent), map[string]string{} + } + + // fsfuse[0] == filesystems that are ok to mount + // fsfuse[1] == fuse binary to use to mount filesystemstype + fsMap[fsfuse[0]] = fsfuse[1] } } - return false + fsAllowed := strings.Split(config["security.syscalls.intercept.mount.allowed"], ",") + if len(fsAllowed) > 0 && fsAllowed[0] != "" { + for _, allowedfs := range fsAllowed { + if fsMap[allowedfs] != "" { + return fmt.Errorf("Filesystem %s cannot appear in security.syscalls.intercept.mount.allowed and security.syscalls.intercept.mount.fuse", allowedfs), map[string]string{} + } + + fsMap[allowedfs] = "" + } + } + + return nil, fsMap +} + +// MountSyscallValid checks whether this is a mount syscall we intercept. +func (s *Server) MountSyscallValid(c Instance, args *MountArgs) (bool, string) { + err, fsMap := SeccompSyscallInterceptMountFilter(c.ExpandedConfig()) + if err != nil { + return false, "" + } + + if fuse, ok := fsMap[args.fstype]; ok { + return true, fuse + } + + return false, "" } // MountSyscallShift checks whether this mount syscall needs shiftfs. 
diff --git a/shared/container.go b/shared/container.go index cb04e09141..719aac857f 100644 --- a/shared/container.go +++ b/shared/container.go @@ -300,6 +300,7 @@ var KnownContainerConfigKeys = map[string]func(value string) error{ "security.syscalls.intercept.mknod": IsBool, "security.syscalls.intercept.mount": IsBool, "security.syscalls.intercept.mount.allowed": IsAny, + "security.syscalls.intercept.mount.fuse": IsAny, "security.syscalls.intercept.mount.shift": IsBool, "security.syscalls.intercept.setxattr": IsBool, "security.syscalls.whitelist": IsAny, diff --git a/shared/util.go b/shared/util.go index e307f985fe..8bb03ab470 100644 --- a/shared/util.go +++ b/shared/util.go @@ -29,6 +29,7 @@ import ( "github.com/lxc/lxd/shared/cancel" "github.com/lxc/lxd/shared/ioprogress" + "github.com/lxc/lxd/shared/logger" "github.com/lxc/lxd/shared/units" ) @@ -991,6 +992,7 @@ func DownloadFileHash(httpClient *http.Client, useragent string, progress func(p } result := fmt.Sprintf("%x", hashFunc.Sum(nil)) + logger.Errorf("Hashing for download from url %s. Got hash: %s. Expected hash: %s", url, result, hash) if result != hash { return -1, fmt.Errorf("Hash mismatch for %s: %s != %s", url, result, hash) } From dab3ee3cfe2401081d9954ce3ada2d40632f6dd2 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Tue, 12 Nov 2019 23:18:37 +0100 Subject: [PATCH 2/4] api: add container_syscall_intercept_mount_fuse extension Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- doc/api-extensions.md | 7 ++++++- shared/version/api.go | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/api-extensions.md b/doc/api-extensions.md index ca09e50e41..b78ffb32c8 100644 --- a/doc/api-extensions.md +++ b/doc/api-extensions.md @@ -871,4 +871,9 @@ elevated permissions. Adds support for importing/exporting of images/backups using SquashFS file system format. 
## container\_raw\_mount -This adds support for passing in raw mount options for disk devices. \ No newline at end of file +This adds support for passing in raw mount options for disk devices. + +## container\_syscall\_intercept\_mount\_fuse +Adds the `security.syscalls.intercept.mount.fuse` key. It can be used to +redirect filesystem mounts to their fuse implementation. To this end, set e.g. +`security.syscalls.intercept.mount.fuse=ext4=fuse2fs`. diff --git a/shared/version/api.go b/shared/version/api.go index f6b0e345a1..fb1e6edd43 100644 --- a/shared/version/api.go +++ b/shared/version/api.go @@ -175,6 +175,7 @@ var APIExtensions = []string{ "container_syscall_intercept_mount", "compression_squashfs", "container_raw_mount", + "container_syscall_intercept_mount_fuse", } // APIExtensionsCount returns the number of available API extensions. From 9f64917fb6e48c39b3d8ed5516999f2a5a59957d Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Tue, 12 Nov 2019 23:22:17 +0100 Subject: [PATCH 3/4] doc: add security.syscalls.intercept.mount.fuse Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- doc/containers.md | 109 +++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/doc/containers.md b/doc/containers.md index 5441367e3b..e5c46602c0 100644 --- a/doc/containers.md +++ b/doc/containers.md @@ -34,60 +34,61 @@ currently supported: The currently supported keys are: -Key | Type | Default | Live update | API extension | Description -:-- | :--- | :------ | :---------- | :------------ | :---------- -boot.autostart | boolean | - | n/a | - | Always start the container when LXD starts (if not set, restore last state) -boot.autostart.delay | integer | 0 | n/a | - | Number of seconds to wait after the container started before starting the next one -boot.autostart.priority | integer | 0 | n/a | - | What order to start the containers in (starting with highest) 
-boot.host\_shutdown\_timeout | integer | 30 | yes | container\_host\_shutdown\_timeout | Seconds to wait for container to shutdown before it is force stopped -boot.stop.priority | integer | 0 | n/a | container\_stop\_priority | What order to shutdown the containers (starting with highest) -environment.\* | string | - | yes (exec) | - | key/value environment variables to export to the container and set on exec -limits.cpu | string | - (all) | yes | - | Number or range of CPUs to expose to the container -limits.cpu.allowance | string | 100% | yes | - | How much of the CPU can be used. Can be a percentage (e.g. 50%) for a soft limit or hard a chunk of time (25ms/100ms) -limits.cpu.priority | integer | 10 (maximum) | yes | - | CPU scheduling priority compared to other containers sharing the same CPUs (overcommit) (integer between 0 and 10) -limits.disk.priority | integer | 5 (medium) | yes | - | When under load, how much priority to give to the container's I/O requests (integer between 0 and 10) -limits.kernel.\* | string | - | no | kernel\_limits | This limits kernel resources per container (e.g. number of open files) -limits.memory | string | - (all) | yes | - | Percentage of the host's memory or fixed value in bytes (various suffixes supported, see below) -limits.memory.enforce | string | hard | yes | - | If hard, container can't exceed its memory limit. If soft, the container can exceed its memory limit when extra host memory is available. 
-limits.memory.swap | boolean | true | yes | - | Whether to allow some of the container's memory to be swapped out to disk -limits.memory.swap.priority | integer | 10 (maximum) | yes | - | The higher this is set, the least likely the container is to be swapped to disk (integer between 0 and 10) -limits.network.priority | integer | 0 (minimum) | yes | - | When under load, how much priority to give to the container's network requests (integer between 0 and 10) -limits.processes | integer | - (max) | yes | - | Maximum number of processes that can run in the container -linux.kernel\_modules | string | - | yes | - | Comma separated list of kernel modules to load before starting the container -migration.incremental.memory | boolean | false | yes | migration\_pre\_copy | Incremental memory transfer of the container's memory to reduce downtime. -migration.incremental.memory.goal | integer | 70 | yes | migration\_pre\_copy | Percentage of memory to have in sync before stopping the container. -migration.incremental.memory.iterations | integer | 10 | yes | migration\_pre\_copy | Maximum number of transfer operations to go through before stopping the container. -nvidia.driver.capabilities | string | compute,utility | no | nvidia\_runtime\_config | What driver capabilities the container needs (sets libnvidia-container NVIDIA\_DRIVER\_CAPABILITIES) -nvidia.runtime | boolean | false | no | nvidia\_runtime | Pass the host NVIDIA and CUDA runtime libraries into the container -nvidia.require.cuda | string | - | no | nvidia\_runtime\_config | Version expression for the required CUDA version (sets libnvidia-container NVIDIA\_REQUIRE\_CUDA) -nvidia.require.driver | string | - | no | nvidia\_runtime\_config | Version expression for the required driver version (sets libnvidia-container NVIDIA\_REQUIRE\_DRIVER) -raw.apparmor | blob | - | yes | - | Apparmor profile entries to be appended to the generated profile -raw.idmap | blob | - | no | id\_map | Raw idmap configuration (e.g. 
"both 1000 1000") -raw.lxc | blob | - | no | - | Raw LXC configuration to be appended to the generated one -raw.seccomp | blob | - | no | container\_syscall\_filtering | Raw Seccomp configuration -security.devlxd | boolean | true | no | restrict\_devlxd | Controls the presence of /dev/lxd in the container -security.devlxd.images | boolean | false | no | devlxd\_images | Controls the availability of the /1.0/images API over devlxd -security.idmap.base | integer | - | no | id\_map\_base | The base host ID to use for the allocation (overrides auto-detection) -security.idmap.isolated | boolean | false | no | id\_map | Use an idmap for this container that is unique among containers with isolated set. -security.idmap.size | integer | - | no | id\_map | The size of the idmap to use -security.nesting | boolean | false | yes | - | Support running lxd (nested) inside the container -security.privileged | boolean | false | no | - | Runs the container in privileged mode -security.protection.delete | boolean | false | yes | container\_protection\_delete | Prevents the container from being deleted -security.protection.shift | boolean | false | yes | container\_protection\_shift | Prevents the container's filesystem from being uid/gid shifted on startup -security.syscalls.blacklist | string | - | no | container\_syscall\_filtering | A '\n' separated list of syscalls to blacklist -security.syscalls.blacklist\_compat | boolean | false | no | container\_syscall\_filtering | On x86\_64 this enables blocking of compat\_\* syscalls, it is a no-op on other arches -security.syscalls.blacklist\_default | boolean | true | no | container\_syscall\_filtering | Enables the default syscall blacklist -security.syscalls.intercept.mknod | boolean | false | no | container\_syscall\_intercept | Handles the `mknod` and `mknodat` system calls (allows creation of a limited subset of char/block devices) -security.syscalls.intercept.mount | boolean | false | no | container\_syscall\_intercept\_mount | 
Handles the `mount` system call -security.syscalls.intercept.mount.allowed | string | - | yes | container\_syscall\_intercept\_mount | Specify a comma-separated list of filesystems that are safe to mount for processes inside the container. -security.syscalls.intercept.mount.shift | boolean | false | yes | container\_syscall\_intercept\_mount | Whether to mount shiftfs on top of filesystems handled through mount syscall interception. -security.syscalls.intercept.setxattr | boolean | false | no | container\_syscall\_intercept | Handles the `setxattr` system call (allows setting a limited subset of restricted extended attributes) -security.syscalls.whitelist | string | - | no | container\_syscall\_filtering | A '\n' separated list of syscalls to whitelist (mutually exclusive with security.syscalls.blacklist\*) -snapshots.schedule | string | - | no | snapshot\_scheduling | Cron expression (`<minute> <hour> <dom> <month> <dow>`) -snapshots.schedule.stopped | bool | false | no | snapshot\_scheduling | Controls whether or not stopped containers are to be snapshoted automatically -snapshots.pattern | string | snap%d | no | snapshot\_scheduling | Pongo2 template string which represents the snapshot name (used for scheduled snapshots and unnamed snapshots) -snapshots.expiry | string | - | no | snapshot\_expiry | Controls when snapshots are to be deleted (expects expression like `1M 2H 3d 4w 5m 6y`) -user.\* | string | - | n/a | - | Free form user key/value storage (can be used in search) +Key | Type | Default | Live update | API extension | Description +:-- | :--- | :------ | :---------- | :------------ | :---------- +boot.autostart | boolean | - | n/a | - | Always start the container when LXD starts (if not set, restore last state) +boot.autostart.delay | integer | 0 | n/a | - | Number of seconds to wait after the container started before starting the next one +boot.autostart.priority | integer | 0 | n/a | - | What order to start the containers in (starting with highest) 
+boot.host\_shutdown\_timeout | integer | 30 | yes | container\_host\_shutdown\_timeout | Seconds to wait for container to shutdown before it is force stopped +boot.stop.priority | integer | 0 | n/a | container\_stop\_priority | What order to shutdown the containers (starting with highest) +environment.\* | string | - | yes (exec) | - | key/value environment variables to export to the container and set on exec +limits.cpu | string | - (all) | yes | - | Number or range of CPUs to expose to the container +limits.cpu.allowance | string | 100% | yes | - | How much of the CPU can be used. Can be a percentage (e.g. 50%) for a soft limit or hard a chunk of time (25ms/100ms) +limits.cpu.priority | integer | 10 (maximum) | yes | - | CPU scheduling priority compared to other containers sharing the same CPUs (overcommit) (integer between 0 and 10) +limits.disk.priority | integer | 5 (medium) | yes | - | When under load, how much priority to give to the container's I/O requests (integer between 0 and 10) +limits.kernel.\* | string | - | no | kernel\_limits | This limits kernel resources per container (e.g. number of open files) +limits.memory | string | - (all) | yes | - | Percentage of the host's memory or fixed value in bytes (various suffixes supported, see below) +limits.memory.enforce | string | hard | yes | - | If hard, container can't exceed its memory limit. If soft, the container can exceed its memory limit when extra host memory is available. 
+limits.memory.swap | boolean | true | yes | - | Whether to allow some of the container's memory to be swapped out to disk +limits.memory.swap.priority | integer | 10 (maximum) | yes | - | The higher this is set, the least likely the container is to be swapped to disk (integer between 0 and 10) +limits.network.priority | integer | 0 (minimum) | yes | - | When under load, how much priority to give to the container's network requests (integer between 0 and 10) +limits.processes | integer | - (max) | yes | - | Maximum number of processes that can run in the container +linux.kernel\_modules | string | - | yes | - | Comma separated list of kernel modules to load before starting the container +migration.incremental.memory | boolean | false | yes | migration\_pre\_copy | Incremental memory transfer of the container's memory to reduce downtime. +migration.incremental.memory.goal | integer | 70 | yes | migration\_pre\_copy | Percentage of memory to have in sync before stopping the container. +migration.incremental.memory.iterations | integer | 10 | yes | migration\_pre\_copy | Maximum number of transfer operations to go through before stopping the container. +nvidia.driver.capabilities | string | compute,utility | no | nvidia\_runtime\_config | What driver capabilities the container needs (sets libnvidia-container NVIDIA\_DRIVER\_CAPABILITIES) +nvidia.runtime | boolean | false | no | nvidia\_runtime | Pass the host NVIDIA and CUDA runtime libraries into the container +nvidia.require.cuda | string | - | no | nvidia\_runtime\_config | Version expression for the required CUDA version (sets libnvidia-container NVIDIA\_REQUIRE\_CUDA) +nvidia.require.driver | string | - | no | nvidia\_runtime\_config | Version expression for the required driver version (sets libnvidia-container NVIDIA\_REQUIRE\_DRIVER) +raw.apparmor | blob | - | yes | - | Apparmor profile entries to be appended to the generated profile +raw.idmap | blob | - | no | id\_map | Raw idmap configuration (e.g. 
"both 1000 1000") +raw.lxc | blob | - | no | - | Raw LXC configuration to be appended to the generated one +raw.seccomp | blob | - | no | container\_syscall\_filtering | Raw Seccomp configuration +security.devlxd | boolean | true | no | restrict\_devlxd | Controls the presence of /dev/lxd in the container +security.devlxd.images | boolean | false | no | devlxd\_images | Controls the availability of the /1.0/images API over devlxd +security.idmap.base | integer | - | no | id\_map\_base | The base host ID to use for the allocation (overrides auto-detection) +security.idmap.isolated | boolean | false | no | id\_map | Use an idmap for this container that is unique among containers with isolated set. +security.idmap.size | integer | - | no | id\_map | The size of the idmap to use +security.nesting | boolean | false | yes | - | Support running lxd (nested) inside the container +security.privileged | boolean | false | no | - | Runs the container in privileged mode +security.protection.delete | boolean | false | yes | container\_protection\_delete | Prevents the container from being deleted +security.protection.shift | boolean | false | yes | container\_protection\_shift | Prevents the container's filesystem from being uid/gid shifted on startup +security.syscalls.blacklist | string | - | no | container\_syscall\_filtering | A '\n' separated list of syscalls to blacklist +security.syscalls.blacklist\_compat | boolean | false | no | container\_syscall\_filtering | On x86\_64 this enables blocking of compat\_\* syscalls, it is a no-op on other arches +security.syscalls.blacklist\_default | boolean | true | no | container\_syscall\_filtering | Enables the default syscall blacklist +security.syscalls.intercept.mknod | boolean | false | no | container\_syscall\_intercept | Handles the `mknod` and `mknodat` system calls (allows creation of a limited subset of char/block devices) +security.syscalls.intercept.mount | boolean | false | no | container\_syscall\_intercept\_mount | 
Handles the `mount` system call +security.syscalls.intercept.mount.allowed | string | - | yes | container\_syscall\_intercept\_mount | Specify a comma-separated list of filesystems that are safe to mount for processes inside the container. +security.syscalls.intercept.mount.fuse | string | - | yes | container\_syscall\_intercept\_mount\_fuse | Whether to redirect mounts of a given filesystem to their fuse implementation (e.g. ext4=fuse2fs) +security.syscalls.intercept.mount.shift | boolean | false | yes | container\_syscall\_intercept\_mount | Whether to mount shiftfs on top of filesystems handled through mount syscall interception. +security.syscalls.intercept.setxattr | boolean | false | no | container\_syscall\_intercept | Handles the `setxattr` system call (allows setting a limited subset of restricted extended attributes) +security.syscalls.whitelist | string | - | no | container\_syscall\_filtering | A '\n' separated list of syscalls to whitelist (mutually exclusive with security.syscalls.blacklist\*) +snapshots.schedule | string | - | no | snapshot\_scheduling | Cron expression (`<minute> <hour> <dom> <month> <dow>`) +snapshots.schedule.stopped | bool | false | no | snapshot\_scheduling | Controls whether or not stopped containers are to be snapshoted automatically +snapshots.pattern | string | snap%d | no | snapshot\_scheduling | Pongo2 template string which represents the snapshot name (used for scheduled snapshots and unnamed snapshots) +snapshots.expiry | string | - | no | snapshot\_expiry | Controls when snapshots are to be deleted (expects expression like `1M 2H 3d 4w 5m 6y`) +user.\* | string | - | n/a | - | Free form user key/value storage (can be used in search) The following volatile keys are currently internally used by LXD: From 9cb23c07a7f17cefc235afe8436d184b68e1a832 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Tue, 12 Nov 2019 23:22:52 +0100 Subject: [PATCH 4/4] scripts: add 
security.syscalls.intercept.mount.fuse Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- scripts/bash/lxd-client | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/bash/lxd-client b/scripts/bash/lxd-client index 1fae67dea7..19d4173bd7 100644 --- a/scripts/bash/lxd-client +++ b/scripts/bash/lxd-client @@ -96,6 +96,7 @@ _have lxc && { security.syscalls.blacklist_compat security.syscalls.blacklist_default \ security.syscalls.intercept.mknod security.syscalls.intercept.mount \ security.syscalls.intercept.mount.allowed \ + security.syscalls.intercept.mount.fuse \ security.syscalls.intercept.setxattr \ security.syscall.intercept.mount.shift \ snapshots.schedule snapshots.schedule.stopped snapshots.pattern \
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel