The following pull request was submitted through Github. It can be accessed and reviewed at: https://github.com/lxc/crio-lxc/pull/23
This e-mail was sent by the LXC bot, direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === Hi crio-lxc devs, I've made a lot of changes to the crio-lxc source to make it running with kubernetes. With my changes I can bootstrap a single node kubernetes. I have deployed several apps, even with more complex startup commands (spec.Process.Args). Short background: At my company we are currently running our monolitic system as a lxc container and we want to take advantage of kubernetes features - so we are willing to contribute to crio-lxc in the future. Please take some time and have a look at the changes I made and let's start discussion. Greeting, Ruben My system setup: root@container-node:~/crio-lxc# crio --version crio version 1.19.0-dev Version: 1.19.0-dev GitCommit: a7b54a80cb5f4818b2175e31fe44a84f6755a5cd GitTreeState: clean BuildDate: 2020-07-16T11:23:53Z GoVersion: go1.14.4 Compiler: gc Platform: linux/amd64 Linkmode: dynamic ``` lxc 4.0.2 root@container-node:~/crio-lxc# lxc-checkconfig Kernel configuration not found at /proc/config.gz; searching... Kernel configuration found at /boot/config-5.7.0-1-amd64 --- Namespaces --- Namespaces: enabled Utsname namespace: enabled Ipc namespace: enabled Pid namespace: enabled User namespace: enabled Network namespace: enabled --- Control groups --- Cgroups: enabled Cgroup v1 mount points: /sys/fs/cgroup/systemd /sys/fs/cgroup/pids /sys/fs/cgroup/memory /sys/fs/cgroup/net_cls,net_prio /sys/fs/cgroup/devices /sys/fs/cgroup/blkio /sys/fs/cgroup/cpuset /sys/fs/cgroup/freezer /sys/fs/cgroup/cpu,cpuacct /sys/fs/cgroup/perf_event /sys/fs/cgroup/rdma Cgroup v2 mount points: /sys/fs/cgroup/unified Cgroup v1 clone_children flag: enabled Cgroup device: enabled Cgroup sched: enabled Cgroup cpu account: enabled Cgroup memory controller: enabled Cgroup cpuset: enabled --- Misc --- Veth pair device: enabled, loaded Macvlan: enabled, not loaded Vlan: enabled, not loaded Bridges: enabled, loaded Advanced netfilter: enabled, loaded CONFIG_NF_NAT_IPV4: missing CONFIG_NF_NAT_IPV6: missing CONFIG_IP_NF_TARGET_MASQUERADE: enabled, not loaded CONFIG_IP6_NF_TARGET_MASQUERADE: enabled, not loaded CONFIG_NETFILTER_XT_TARGET_CHECKSUM: enabled, loaded CONFIG_NETFILTER_XT_MATCH_COMMENT: enabled, loaded FUSE (for use with lxcfs): enabled, loaded --- Checkpoint/Restore --- checkpoint restore: enabled CONFIG_FHANDLE: enabled CONFIG_EVENTFD: enabled CONFIG_EPOLL: enabled CONFIG_UNIX_DIAG: enabled CONFIG_INET_DIAG: enabled CONFIG_PACKET_DIAG: enabled CONFIG_NETLINK_DIAG: enabled File capabilities: Note : Before booting a new kernel, you can check its configuration usage : CONFIG=/path/to/config /usr/bin/lxc-checkconfig ```
From c6397825ff9e24abff09016132a76f893d5ff510 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Mon, 6 Jul 2020 19:30:53 +0200 Subject: [PATCH 01/19] create: Poll for container status to fix container startup. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/cmd/create.go b/cmd/create.go index f0f20a5..78990e4 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -295,6 +295,19 @@ func makeSyncFifo(dir string) error { return nil } +func waitContainer(c *lxc.Container, state lxc.State, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + // liblxc.Wait / go-libxc.Wait do not block when container is stopped. BUG in liblxc ? + // https://github.com/lxc/lxc/issues/2027 + for time.Now().Before(deadline) { + if c.State() == state { + return true + } + time.Sleep(time.Millisecond * 50) + } + return false +} + func startContainer(c *lxc.Container, spec *specs.Spec) error { binary, err := os.Readlink("/proc/self/exe") if err != nil { @@ -317,8 +330,10 @@ func startContainer(c *lxc.Container, spec *specs.Spec) error { cmdErr := cmd.Start() + log.Debugf("LXC container PID %d", c.InitPid()) + if cmdErr == nil { - if !c.Wait(lxc.RUNNING, 30*time.Second) { + if !waitContainer(c, lxc.RUNNING, 30*time.Second) { cmdErr = fmt.Errorf("Container failed to initialize") } } From 75db967708cb67b96bd782634e258fc5ee2336f2 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 8 Jul 2020 15:55:57 +0200 Subject: [PATCH 02/19] Add stub --systemd-cgroup cmdline flag to gracefully handle default crio config. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/main.go | 4 ++++ test/crio.conf.in | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 36a02dc..0598913 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -44,6 +44,10 @@ func main() { Usage: "set the lxc path to use", Value: "/var/lib/lxc", }, + cli.BoolFlag{ + Name: "systemd-cgroup", + Usage: "enable systemd cgroup (unimplemented stub for conmon)", + }, } app.Before = func(ctx *cli.Context) error { diff --git a/test/crio.conf.in b/test/crio.conf.in index eb0c7f8..5331fe7 100644 --- a/test/crio.conf.in +++ b/test/crio.conf.in @@ -81,7 +81,7 @@ grpc_max_recv_msg_size = 16777216 # default_runtime is the _name_ of the OCI runtime to be used as the default. # The name is matched against the runtimes map below. -default_runtime = "runc" +default_runtime = "lxc" # If true, the runtime will not use pivot_root, but instead use MS_MOVE. no_pivot = false @@ -89,6 +89,9 @@ no_pivot = false # Path to the conmon binary, used for monitoring the OCI runtime. conmon = "PACKAGES_DIR/conmon/bin/conmon" +# Cgroup setting for conmon +#conmon_cgroup = "system.slice" + # Path to pinns. pinns_path = "PACKAGES_DIR/cri-o/bin/pinns" @@ -110,7 +113,7 @@ seccomp_profile = "CRIOLXC_TEST_DIR/seccomp.json" apparmor_profile = "unconfined" # Cgroup management implementation used for the runtime. -cgroup_manager = "cgroupfs" +cgroup_manager = "systemd" # List of default capabilities for containers. If it is empty or commented out, # only the capabilities defined in the containers json file by the user/kube @@ -213,7 +216,7 @@ manage_network_ns_lifecycle = false # If no runtime_handler is provided, the runtime will be picked based on the level # of trust of the workload. -[crio.runtime.runtimes.runc] +[crio.runtime.runtimes.lxc] runtime_path = "CRIOLXC_BINARY" runtime_type = "oci" runtime_root = "CRIOLXC_TEST_DIR/runtime-root" From e1faa3938c15193603fee74d3990650412336ded Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 8 Jul 2020 15:57:16 +0200 Subject: [PATCH 03/19] Small documentation and test iprovements. Signed-off-by: Ruben Jenster <[email protected]> --- .gitignore | 1 + Makefile | 3 +++ README.md | 22 ++++++++++++++++++++++ test/helpers.bash | 3 +-- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index f940ec2..4a7f0c8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ crio-lxc-test* oci/ roots/ .stacker/ +.keeptempdirs diff --git a/Makefile b/Makefile index 1d8f2b2..a6598c3 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,9 @@ check: crio-lxc go test ./... PACKAGES_DIR=$(PACKAGES_DIR) sudo -E "PATH=$$PATH" bats -t $(patsubst %,test/%.bats,$(TEST)) +%.bats: crio-lxc + PACKAGES_DIR=$(PACKAGES_DIR) sudo -E "PATH=$$PATH" bats -t test/$@ + .PHONY: vendorup vendorup: go get -u diff --git a/README.md b/README.md index fbce9fd..53039d2 100644 --- a/README.md +++ b/README.md @@ -51,3 +51,25 @@ You'll also need conntrack installed: ``` apt install conntrack ``` + +You have to install [bats](https://github.com/bats-core/bats-core) to run the tests. +On debian you can install bats with: + + apt-get install bats + + +To keep the tempdir of of a test run, you have to create the lockfile `.keeptempdirs` +in the top-level of this repository. + + touch .keeptempdirs + +To debug `crictl` calls within a test run: + + CRICTLDEBUG="-D" make basic.bats + +`bats` does not show any output when the test was successful. +The logfile is created in /tmp and deleted when the test was successful. +To watch the test output while the test is running: + + tail -f /tmp/bats.*.log + diff --git a/test/helpers.bash b/test/helpers.bash index 2bba1b2..d8800d1 100644 --- a/test/helpers.bash +++ b/test/helpers.bash @@ -53,8 +53,7 @@ function cleanup_tempdir { function crictl { # watch out for: https://github.com/kubernetes-sigs/cri-tools/issues/460 # If you need more debug output, set CRICTLDEBUG to -D - CRICTLDEBUG="" - $(which crictl) ${CRICTLDEBUG} --runtime-endpoint "$TEMP_DIR/crio.sock" $@ + $(which crictl) ${CRICTLDEBUG} --runtime-endpoint "unix://$TEMP_DIR/crio.sock" $@ echo "$output" } From 8c116566e314b0940fff2af4c530142ef698aabc Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 8 Jul 2020 17:17:14 +0200 Subject: [PATCH 04/19] create: Fix lxc.mount.entry generation Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/cmd/create.go b/cmd/create.go index 78990e4..8c4412a 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -227,8 +227,35 @@ func configureContainer(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) er } for _, ms := range spec.Mounts { + // ignore cgroup mount, lxc automouts this even with lxc.rootfs.managed = 0 + // conf.c:mount_entry:1854 - Device or resource busy - Failed to mount "cgroup" on "/usr/lib/x86_64-linux-gnu/lxc/rootfs/sys/fs/cgroup" + if ms.Type == "cgroup" { + continue + } + + // create target files and directories + info, err := os.Stat(ms.Source) + if err == nil { + if info.IsDir() { + ms.Options = append(ms.Options, "create=dir") + } else { + ms.Options = append(ms.Options, "create=file") + } + } else { + // This case catches all kind of virtual and remote filesystems (/dev/pts, /dev/shm, sysfs, procfs, dev ...) + // It can not be a file because the source file for a bind mount must exist. + if os.IsNotExist(err) { + ms.Options = append(ms.Options, "create=dir") + } else { + log.Debugf("failed to stat source %s of mountpoint %s: %s", ms.Source, ms.Destination, err) + } + } + opts := strings.Join(ms.Options, ",") - mnt := fmt.Sprintf("%s %s %s %s", ms.Source, ms.Destination, ms.Type, opts) + // Make mount paths relative to container root https://github.com/lxc/lxc/issues/2276 + dest := strings.TrimLeft(ms.Destination, "/") + mnt := fmt.Sprintf("%s %s %s %s", ms.Source, dest, ms.Type, opts) + if err := c.SetConfigItem("lxc.mount.entry", mnt); err != nil { return errors.Wrap(err, "failed to set mount config") } From 65bd9bbd2287fd7bb937246f3b79616bc9912958 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 8 Jul 2020 17:20:02 +0200 Subject: [PATCH 05/19] main: Fix enable debug logging. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/main.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 0598913..569bf4a 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -50,15 +50,17 @@ func main() { }, } + log.SetLevel(log.InfoLevel) + app.Before = func(ctx *cli.Context) error { LXC_PATH = ctx.String("lxc-path") - debug = ctx.Bool("debug") + if debug { + log.SetLevel(log.DebugLevel) + } return nil } - log.SetLevel(log.InfoLevel) - if err := app.Run(os.Args); err != nil { format := "error: %v\n" if debug { From e314dd29d719ad2563ad344cc6d36811fe130c4d Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 8 Jul 2020 17:21:21 +0200 Subject: [PATCH 06/19] Disable checkHackyPreStart that prevents proper container termination. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/kill.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/kill.go b/cmd/kill.go index 3af4ddf..a665048 100644 --- a/cmd/kill.go +++ b/cmd/kill.go @@ -93,7 +93,7 @@ func doKill(ctx *cli.Context) error { } - if c.Running() && checkHackyPreStart(c) == "started" { + if c.Running() { //&& checkHackyPreStart(c) == "started" { pid := c.InitPid() if err := unix.Kill(pid, signalMap[ctx.String("signal")]); err != nil { From a713a4be7adae75c1d5f7f72027a139e34852351 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Fri, 10 Jul 2020 16:34:52 +0200 Subject: [PATCH 07/19] exec: Implement running a new command within a running container. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/exec.go | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++ cmd/main.go | 1 + 2 files changed, 100 insertions(+) create mode 100644 cmd/exec.go diff --git a/cmd/exec.go b/cmd/exec.go new file mode 100644 index 0000000..a13d00f --- /dev/null +++ b/cmd/exec.go @@ -0,0 +1,99 @@ +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "github.com/urfave/cli" + + "golang.org/x/sys/unix" + + lxc "gopkg.in/lxc/go-lxc.v2" +) + +var execCmd = cli.Command{ + Name: "exec", + Usage: "execute a new process in a running container", + ArgsUsage: "<containerID>", + Action: doExec, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "process, p", + Usage: "path to process json", + Value: "", + }, + cli.StringFlag{ + Name: "pid-file", + Usage: "file to write the process id to", + Value: "", + }, + cli.BoolFlag{ + Name: "detach, d", + Usage: "detach from the executed process", + }, + }, +} + +// NOTE stdio (stdout/stderr) is not attached when adding unix.CLONE_NEWUSER +const EXEC_NAMESPACES = unix.CLONE_NEWIPC | unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP | unix.CLONE_NEWPID + +func doExec(ctx *cli.Context) error { + containerID := ctx.Args().First() + if len(containerID) == 0 { + fmt.Fprintf(os.Stderr, "missing container ID\n") + cli.ShowCommandHelpAndExit(ctx, "exec", 1) + } + + c, err := lxc.NewContainer(containerID, LXC_PATH) + if err != nil { + return errors.Wrap(err, "failed to create new container") + } + defer c.Release() + + attachOpts := lxc.AttachOptions{ + Namespaces: EXEC_NAMESPACES, + } + + var procArgs []string + specFilePath := ctx.String("process") + specData, err := ioutil.ReadFile(specFilePath) + if err == nil { + // prefer the process spec file + var procSpec *specs.Process + err := json.Unmarshal(specData, &procSpec) + if err != nil { + return errors.Wrapf(err, "failed to read process spec from %s: %s", specFilePath, err) + } + // tanslate process spec to lxc.AttachOptions + procArgs = procSpec.Args + attachOpts.UID = int(procSpec.User.UID) + attachOpts.GID = int(procSpec.User.GID) + attachOpts.Cwd = procSpec.Cwd + attachOpts.Env = procSpec.Env + } else { + // fall back to cmdline arguments + if len(ctx.Args()) >= 2 { + procArgs = ctx.Args()[1:] + } + } + + if ctx.Bool("detach") { + // FIXME detach is not called by conmon ! why ? + pid, err := c.RunCommandNoWait(procArgs, attachOpts) + pidFile := ctx.String("pid-file") + err = ioutil.WriteFile(pidFile, []byte(fmt.Sprintf("%s\n", pid)), 0640) + if err != nil { + return errors.Wrapf(err, "failed to write pid file %s: %s", pidFile) + } + } else { + exitStatus, err := c.RunCommandStatus(procArgs, attachOpts) + if err != nil { + return errors.Wrapf(err, "Cmd returned with exit code %d", exitStatus) + } + } + return nil +} diff --git a/cmd/main.go b/cmd/main.go index 569bf4a..3e1c877 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -24,6 +24,7 @@ func main() { startCmd, killCmd, deleteCmd, + execCmd, } app.Flags = []cli.Flag{ From 014464fa76f94ccf8ce03c09501da2c32bd3494c Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 15 Jul 2020 19:45:01 +0200 Subject: [PATCH 08/19] Ignore case for lxc --log-level flag Signed-off-by: Ruben Jenster <[email protected]> --- cmd/utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/utils.go b/cmd/utils.go index aa3a45b..2b6e38e 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -32,7 +32,7 @@ func readBundleSpec(specFilePath string) (spec *specs.Spec, err error) { func configureLogging(ctx *cli.Context, c *lxc.Container) error { if ctx.GlobalIsSet("log-level") { var logLevel lxc.LogLevel - switch ctx.GlobalString("log-level") { + switch strings.ToLower(ctx.GlobalString("log-level")) { case "trace": logLevel = lxc.TRACE case "debug": From 16c045e580e201310919fda1041e207a1bfcc165 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 15 Jul 2020 19:46:31 +0200 Subject: [PATCH 09/19] Prepend cri-lxc log output to lxc log-file. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/main.go | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cmd/main.go b/cmd/main.go index 3e1c877..9cb398d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -2,6 +2,7 @@ package main import ( "fmt" + stdlog "log" "os" "github.com/apex/log" @@ -47,18 +48,36 @@ func main() { }, cli.BoolFlag{ Name: "systemd-cgroup", - Usage: "enable systemd cgroup (unimplemented stub for conmon)", + Usage: "enable systemd cgroup", }, } log.SetLevel(log.InfoLevel) + var logFile *os.File app.Before = func(ctx *cli.Context) error { LXC_PATH = ctx.String("lxc-path") debug = ctx.Bool("debug") if debug { log.SetLevel(log.DebugLevel) } + logFilePath := ctx.String("log-file") + if logFilePath != "" { + f, err := os.OpenFile(logFilePath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0640) + if err != nil { + log.Errorf("failed to open log file %s: %s", logFilePath, err) + } else { + logFile = f + stdlog.SetOutput(logFile) + } + } + return nil + } + + app.After = func(ctx *cli.Context) error { + if logFile != nil { + return logFile.Close() + } return nil } From 5284229f577a8c4db7149d9d2fdaec09c580cb94 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 15 Jul 2020 19:48:43 +0200 Subject: [PATCH 10/19] Set lxc process security attributes / apparmor profile and cgroup root Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/cmd/create.go b/cmd/create.go index 8c4412a..d8e01d7 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -202,6 +202,48 @@ func doCreate(ctx *cli.Context) error { return nil } +func configureContainerSecurity(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) error { + // https://github.com/kubernetes/kubernetes/blob/a38a02792b55942177ee676a5e1993b18a8b4b0a/pkg/kubelet/apis/cri/runtime/v1alpha2/api.proto#L541 + // // Privileged mode implies the following specific options are applied: + // 1. All capabilities are added. + // 2. Sensitive paths, such as kernel module paths within sysfs, are not masked. + // 3. Any sysfs and procfs mounts are mounted RW. + // 4. Apparmor confinement is not applied. + // 5. Seccomp restrictions are not applied. + // 6. The device cgroup does not restrict access to any devices. + // 7. All devices from the host's /dev are available within the container. + // 8. SELinux restrictions are not applied (e.g. label=disabled). + // security + // FIXME Kubelet does not set the 'io.kubernetes.cri-o.PrivilegedRuntime" + // https://github.com/containers/podman/blob/8704b78a6fbb953acb6b74d1671d5ad6456bf81f/pkg/annotations/annotations.go#L64 + + aaprofile := spec.Process.ApparmorProfile + if aaprofile == "" { + aaprofile = "generated" + } + if err := c.SetConfigItem("lxc.apparmor.profile", "generated"); err != nil { + //if err := c.SetConfigItem("lxc.apparmor.profile", "unconfined"); err != nil { + return errors.Wrapf(err, "faield to set apparmor.profile") + } + if aaprofile == "generated" { + // TODO Create apparmor profile from spec.Linux.Readonly and MaskedPaths + // set lxc.apparmor.raw + // see man apparmor.d + } + + if err := c.SetConfigItem("lxc.proc.oom_score_adj", fmt.Sprintf("%d", *spec.Process.OOMScoreAdj)); err != nil { + return errors.Wrap(err, "failed to set lxc.proc.oom_score_adj") + } + + if spec.Process.NoNewPrivileges { + if err := c.SetConfigItem("lxc.no_new_privs", "1"); err != nil { + return errors.Wrapf(err, "failed to set lxc.no_new_privs") + } + } + + return nil +} + func configureContainer(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) error { if ctx.Bool("debug") { c.SetVerbosity(lxc.Verbose) @@ -296,6 +338,14 @@ func configureContainer(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) er return errors.Wrap(err, "failed to configure namespaces") } + if ctx.Bool("systemd-cgroup") { + c.SetConfigItem("lxc.cgroup.root", "system.slice") + } + + if err := configureContainerSecurity(ctx, c, spec); err != nil { + return errors.Wrap(err, "failed to configure container security") + } + // capabilities? // if !spec.Process.Terminal { From f749ba85ab25d7183aba350e0ab6ea5fe08d1818 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 15 Jul 2020 19:49:48 +0200 Subject: [PATCH 11/19] Fix mount entry paths and symlink protection. WIP. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/cmd/create.go b/cmd/create.go index d8e01d7..542d1e0 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -294,9 +294,26 @@ func configureContainer(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) er } opts := strings.Join(ms.Options, ",") - // Make mount paths relative to container root https://github.com/lxc/lxc/issues/2276 - dest := strings.TrimLeft(ms.Destination, "/") + + // Fix failing mounts due to symlink protection + // CVE-2015-1335: Protect container mounts against symlinks + // https://github.com/lxc/lxc/commit/592fd47a6245508b79fe6ac819fe6d3b2c1289be + // mount targets that contain symlinks must be resolved + dest := ms.Destination + if strings.HasPrefix(dest, "/var/run") { + dest = strings.TrimPrefix(dest, "/var") + } + + // Either make mount destination paths relative to container or prepend the root path to them, + // otherwise lxc ignores the mount points. + // Hmmm, why does lxc not prepend the rootfs itself ? + // https://github.com/lxc/lxc/issues/2276 + //dest := strings.TrimLeft(ms.Destination, "/") + + dest = filepath.Join(spec.Root.Path, dest) + mnt := fmt.Sprintf("%s %s %s %s", ms.Source, dest, ms.Type, opts) + log.Debugf("adding mount entry %q", mnt) if err := c.SetConfigItem("lxc.mount.entry", mnt); err != nil { return errors.Wrap(err, "failed to set mount config") From 6d4217f01a176298c0174e65c9aa6baa16dcc183 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 15 Jul 2020 19:59:09 +0200 Subject: [PATCH 12/19] Add debug wrapper for crio-lxc binary. Signed-off-by: Ruben Jenster <[email protected]> --- bin/crio-lxc-debug.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100755 bin/crio-lxc-debug.sh diff --git a/bin/crio-lxc-debug.sh b/bin/crio-lxc-debug.sh new file mode 100755 index 0000000..5ad9e3b --- /dev/null +++ b/bin/crio-lxc-debug.sh @@ -0,0 +1,14 @@ +#!/bin/sh +# About: debug wrapper for crio-lxc + +LOG=/tmp/crio-lxc.log.$$ +NEWARGS="--debug --log-level DEBUG --log-file $LOG $@" + +env >> $LOG +echo ARGS:$@ >> $LOG +echo ---- >> $LOG +cat /tmp/exec-process-* >> $LOG +echo ---- >> $LOG +cat $PWD/config.json >> $LOG +echo ---- >> $LOG +exec /usr/local/bin/crio-lxc $NEWARGS From ce45e0a68d693233c98e14ba28eaafe573b94d26 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Thu, 16 Jul 2020 20:23:24 +0200 Subject: [PATCH 13/19] Fix mounts on /var/run when /var/run is a symlink to /run. Finally the kubernetes dashboard and deployments are working. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/cmd/create.go b/cmd/create.go index 542d1e0..059c2bb 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -292,27 +292,37 @@ func configureContainer(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) er log.Debugf("failed to stat source %s of mountpoint %s: %s", ms.Source, ms.Destination, err) } } - opts := strings.Join(ms.Options, ",") - // Fix failing mounts due to symlink protection - // CVE-2015-1335: Protect container mounts against symlinks - // https://github.com/lxc/lxc/commit/592fd47a6245508b79fe6ac819fe6d3b2c1289be - // mount targets that contain symlinks must be resolved - dest := ms.Destination - if strings.HasPrefix(dest, "/var/run") { - dest = strings.TrimPrefix(dest, "/var") + mountDest := ms.Destination + + // Resolve mount mountDestinatination paths containing symlinks. + // Symlinks in mount mountDestination paths are not allowed in LXC. + // See CVE-2015-1335: Protect container mounts against symlinks + // and https://github.com/lxc/lxc/commit/592fd47a6245508b79fe6ac819fe6d3b2c1289be + // + // So mount targets that contain symlinks must be resolved. + // e.g k8s service account tokens are mounted to /var/run/secrets/kubernetes.io/serviceaccount + // TODO Check whether a recursive and generic implementation is required! + if strings.HasPrefix(mountDest, "/var/run/") { + stat, err := os.Lstat(filepath.Join(spec.Root.Path, "/var/run/")) + if err == nil { + if stat.Mode() & os.ModeSymlink != 0 { + // resolve the symlink from /var/run to /run + mountDest = strings.TrimPrefix(mountDest, "/var") + } + } } - // Either make mount destination paths relative to container or prepend the root path to them, - // otherwise lxc ignores the mount points. - // Hmmm, why does lxc not prepend the rootfs itself ? - // https://github.com/lxc/lxc/issues/2276 - //dest := strings.TrimLeft(ms.Destination, "/") + // Mount destinations must be either relative to the container root or absolute to + // the directory on the host containing the rootfs. + // LXC simply ignores relative mounts paths to an absolute rootfs. - dest = filepath.Join(spec.Root.Path, dest) + // See https://github.com/lxc/lxc/issues/2276 + // See man lxc.container.conf #MOUNT POINTS + mountDest = filepath.Join(spec.Root.Path, mountDest) - mnt := fmt.Sprintf("%s %s %s %s", ms.Source, dest, ms.Type, opts) + mnt := fmt.Sprintf("%s %s %s %s", ms.Source, mountDest, ms.Type, opts) log.Debugf("adding mount entry %q", mnt) if err := c.SetConfigItem("lxc.mount.entry", mnt); err != nil { From 83f6e0d14bd5303d9af0239bcd9303120e377404 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 22 Jul 2020 10:20:55 +0200 Subject: [PATCH 14/19] Fix spec.Process command execution. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 33 +++++++++++++++++++++++++++++---- go.mod | 1 + go.sum | 2 ++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/cmd/create.go b/cmd/create.go index 059c2bb..34e6136 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -16,6 +16,7 @@ import ( "github.com/apex/log" "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" + "github.com/segmentio/ksuid" "github.com/urfave/cli" lxc "gopkg.in/lxc/go-lxc.v2" @@ -90,14 +91,33 @@ const ( func emitFifoWaiter(file string) error { fifoWaiter := fmt.Sprintf(`#!/bin/sh -stat /syncfifo echo "%s" | tee /syncfifo -exec $@ +echo "Sourcing command file $1" +echo "-----------------------" +cat $1 +echo "-----------------------" +. $1 `, SYNC_FIFO_CONTENT) return ioutil.WriteFile(file, []byte(fifoWaiter), 0755) } +// Write the container init command to a file. +// This file is then sourced by the file /syncfifo on container startup. +// Every command argument is quoted so `exec` can process them properly. +func emitCmdFile(cmdFile string, args ...string) error { + // https://stackoverflow.com/questions/33887194/how-to-set-multiple-commands-in-one-yaml-file-with-kubernetes + buf := strings.Builder{} + buf.WriteString("exec") + for _, arg := range args { + buf.WriteRune(' ') + buf.WriteRune('"') + buf.WriteString(arg) + buf.WriteRune('"') + } + return ioutil.WriteFile(cmdFile, []byte(buf.String()), 0640) +} + func configureNamespaces(c *lxc.Container, spec *specs.Spec) error { procPidPathRE := regexp.MustCompile(`/proc/(\d+)/ns`) @@ -352,8 +372,13 @@ func configureContainer(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) er return errors.Wrap(err, "failed to set hostname") } - argsString := "/fifo-wait " + strings.Join(spec.Process.Args, " ") - if err := c.SetConfigItem("lxc.execute.cmd", argsString); err != nil { + cmd := fmt.Sprintf("/mycmd.%s", ksuid.New().String()) + err = emitCmdFile(path.Join(spec.Root.Path, cmd), spec.Process.Args...) + if err != nil { + return errors.Wrapf(err, "could not write command file") + } + + if err := c.SetConfigItem("lxc.execute.cmd", "/fifo-wait "+cmd); err != nil { return errors.Wrap(err, "failed to set lxc.execute.cmd") } diff --git a/go.mod b/go.mod index 7f80ccc..a8ec1d1 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/onsi/ginkgo v1.8.0 // indirect github.com/opencontainers/runtime-spec v1.0.1 github.com/pkg/errors v0.8.1 + github.com/segmentio/ksuid v1.0.3 github.com/stretchr/objx v0.2.0 // indirect github.com/urfave/cli v1.20.0 golang.org/x/crypto v0.0.0-20190621222207-cc06ce4a13d4 // indirect diff --git a/go.sum b/go.sum index cc4071c..b91cb46 100644 --- a/go.sum +++ b/go.sum @@ -33,6 +33,8 @@ github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= +github.com/segmentio/ksuid v1.0.3 h1:FoResxvleQwYiPAVKe1tMUlEirodZqlqglIuFsdDntY= +github.com/segmentio/ksuid v1.0.3/go.mod h1:/XUiZBD3kVx5SmUOl55voK5yeAbBNNIed+2O73XgrPE= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/smartystreets/assertions v1.0.0/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM= github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM= From ca653b7fe24362468618b7da817e313b62d4ade4 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 22 Jul 2020 10:23:12 +0200 Subject: [PATCH 15/19] Fix container mount path destination for LXC. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 33 ++++++--------------------------- cmd/utils.go | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/cmd/create.go b/cmd/create.go index 34e6136..99e3c4a 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -305,7 +305,7 @@ func configureContainer(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) er } } else { // This case catches all kind of virtual and remote filesystems (/dev/pts, /dev/shm, sysfs, procfs, dev ...) - // It can not be a file because the source file for a bind mount must exist. + // It can never be a file because the source file for a bind mount must exist. if os.IsNotExist(err) { ms.Options = append(ms.Options, "create=dir") } else { @@ -314,34 +314,13 @@ func configureContainer(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) er } opts := strings.Join(ms.Options, ",") - mountDest := ms.Destination - - // Resolve mount mountDestinatination paths containing symlinks. - // Symlinks in mount mountDestination paths are not allowed in LXC. - // See CVE-2015-1335: Protect container mounts against symlinks - // and https://github.com/lxc/lxc/commit/592fd47a6245508b79fe6ac819fe6d3b2c1289be - // - // So mount targets that contain symlinks must be resolved. - // e.g k8s service account tokens are mounted to /var/run/secrets/kubernetes.io/serviceaccount - // TODO Check whether a recursive and generic implementation is required! - if strings.HasPrefix(mountDest, "/var/run/") { - stat, err := os.Lstat(filepath.Join(spec.Root.Path, "/var/run/")) - if err == nil { - if stat.Mode() & os.ModeSymlink != 0 { - // resolve the symlink from /var/run to /run - mountDest = strings.TrimPrefix(mountDest, "/var") - } - } + mountDest, err := resolveMountDestination(spec.Root.Path, ms.Destination) + if err != nil { + log.Debugf("resolveMountDestination: %s --> %s (err:%s)", ms.Destination, mountDest, err) + } else { + log.Debugf("resolveMountDestination: %s --> %s)", ms.Destination, mountDest) } - // Mount destinations must be either relative to the container root or absolute to - // the directory on the host containing the rootfs. - // LXC simply ignores relative mounts paths to an absolute rootfs. - - // See https://github.com/lxc/lxc/issues/2276 - // See man lxc.container.conf #MOUNT POINTS - mountDest = filepath.Join(spec.Root.Path, mountDest) - mnt := fmt.Sprintf("%s %s %s %s", ms.Source, mountDest, ms.Type, opts) log.Debugf("adding mount entry %q", mnt) diff --git a/cmd/utils.go b/cmd/utils.go index 2b6e38e..1241864 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -86,3 +86,54 @@ func RunCommand(args ...string) error { } return nil } + +func resolveRootfsSymlinks(rootfs string, dst string) (string, error) { + stat, err := os.Lstat(dst) + if err != nil { + return dst, err + } + if stat.Mode()&os.ModeSymlink == 0 { + return dst, nil // not a symlink + } + target, err := os.Readlink(dst) + if err != nil { + return dst, err + } + if !strings.HasPrefix(target, rootfs) { + return filepath.Join(rootfs, target), nil + } + return target, nil +} + +// resolveMountDestination resolves mount destination paths for LXC. +// +// Symlinks in mount mount destination paths are not allowed in LXC. +// See CVE-2015-1335: Protect container mounts against symlinks +// and https://github.com/lxc/lxc/commit/592fd47a6245508b79fe6ac819fe6d3b2c1289be +// Mount targets that contain symlinks should be resolved relative to the container rootfs. +// e.g k8s service account tokens are mounted to /var/run/secrets/kubernetes.io/serviceaccount +// but /var/run is (mostly) a symlink to /run, so LXC denies to mount the serviceaccount token. +// +// The mount destination must be either relative to the container root or absolute to +// the directory on the host containing the rootfs. +// LXC simply ignores relative mounts paths to an absolute rootfs. +// See man lxc.container.conf #MOUNT POINTS +// +// The mount option `create=dir` should be set when the error os.ErrNotExist is returned. +// The non-existent directories are then automatically created by LXC. +func resolveMountDestination(rootfs string, dst string) (string, error) { + dst = strings.TrimPrefix(dst, "/") + entries := strings.Split(dst, "/") + dstPath := rootfs + for i, entry := range entries { + dstPath = filepath.Join(dstPath, entry) + resolved, err := resolveRootfsSymlinks(rootfs, dstPath) + dstPath = resolved + if err != nil { + // The already resolved path is concatenated with the remaining path to be resolved, + // if resolution of path fails at some point. + return filepath.Join(dstPath, filepath.Join(entries[i+1:]...)), err + } + } + return dstPath, nil +} From ea7ad04bae7982fd270dc9057705632cc8ceeb3d Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 22 Jul 2020 10:25:25 +0200 Subject: [PATCH 16/19] Set apparmor profile from container spec. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/cmd/create.go b/cmd/create.go index 99e3c4a..a8c81bb 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -223,32 +223,17 @@ func doCreate(ctx *cli.Context) error { } func configureContainerSecurity(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) error { - // https://github.com/kubernetes/kubernetes/blob/a38a02792b55942177ee676a5e1993b18a8b4b0a/pkg/kubelet/apis/cri/runtime/v1alpha2/api.proto#L541 - // // Privileged mode implies the following specific options are applied: - // 1. All capabilities are added. - // 2. Sensitive paths, such as kernel module paths within sysfs, are not masked. - // 3. Any sysfs and procfs mounts are mounted RW. - // 4. Apparmor confinement is not applied. - // 5. Seccomp restrictions are not applied. - // 6. The device cgroup does not restrict access to any devices. - // 7. All devices from the host's /dev are available within the container. - // 8. SELinux restrictions are not applied (e.g. label=disabled). - // security - // FIXME Kubelet does not set the 'io.kubernetes.cri-o.PrivilegedRuntime" - // https://github.com/containers/podman/blob/8704b78a6fbb953acb6b74d1671d5ad6456bf81f/pkg/annotations/annotations.go#L64 - aaprofile := spec.Process.ApparmorProfile if aaprofile == "" { aaprofile = "generated" } - if err := c.SetConfigItem("lxc.apparmor.profile", "generated"); err != nil { - //if err := c.SetConfigItem("lxc.apparmor.profile", "unconfined"); err != nil { - return errors.Wrapf(err, "faield to set apparmor.profile") + if err := c.SetConfigItem("lxc.apparmor.profile", aaprofile); err != nil { + return errors.Wrapf(err, "failed to set apparmor.profile to %s", aaprofile) } if aaprofile == "generated" { - // TODO Create apparmor profile from spec.Linux.Readonly and MaskedPaths - // set lxc.apparmor.raw + // TODO Create apparmor profile from the spec (honoring Linux.Readonly and Linux.MaskedPaths) // see man apparmor.d + // if err := c.SetConfigItem("lxc.apparmor.raw", aaprofile); err != nil { } if err := c.SetConfigItem("lxc.proc.oom_score_adj", fmt.Sprintf("%d", *spec.Process.OOMScoreAdj)); err != nil { From 5dae906b2304217c29fdeaf33baaf248f2cb44e2 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 22 Jul 2020 10:26:22 +0200 Subject: [PATCH 17/19] Fix nil pointer when setting lxc.proc.oom_score_adj. Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmd/create.go b/cmd/create.go index a8c81bb..680ab65 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -236,8 +236,10 @@ func configureContainerSecurity(ctx *cli.Context, c *lxc.Container, spec *specs. // if err := c.SetConfigItem("lxc.apparmor.raw", aaprofile); err != nil { } - if err := c.SetConfigItem("lxc.proc.oom_score_adj", fmt.Sprintf("%d", *spec.Process.OOMScoreAdj)); err != nil { - return errors.Wrap(err, "failed to set lxc.proc.oom_score_adj") + if spec.Process.OOMScoreAdj != nil { + if err := c.SetConfigItem("lxc.proc.oom_score_adj", fmt.Sprintf("%d", *spec.Process.OOMScoreAdj)); err != nil { + return errors.Wrap(err, "failed to set lxc.proc.oom_score_adj") + } } if spec.Process.NoNewPrivileges { From d1c25db70a4ed21f17201d338a199234c39367b4 Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 22 Jul 2020 10:30:49 +0200 Subject: [PATCH 18/19] Do not set lxc.ephemeral Signed-off-by: Ruben Jenster <[email protected]> --- cmd/create.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/create.go b/cmd/create.go index 680ab65..9afc65d 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -248,6 +248,11 @@ func configureContainerSecurity(ctx *cli.Context, c *lxc.Container, spec *specs. } } + // crio deletes the working directory so lxc should not do this itself + //if err := c.SetConfigItem("lxc.ephemeral", "1"); err != nil { + // return errors.Wrapf(err, "failed to set lxc.ephemeral") + //} + return nil } From f7b8d584c22009b04d1b2a2f2138c4ef6cfdcefd Mon Sep 17 00:00:00 2001 From: Ruben Jenster <[email protected]> Date: Wed, 22 Jul 2020 10:31:21 +0200 Subject: [PATCH 19/19] Update Makefile test target. Signed-off-by: Ruben Jenster <[email protected]> --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a6598c3..816a148 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,8 @@ check: crio-lxc go test ./... PACKAGES_DIR=$(PACKAGES_DIR) sudo -E "PATH=$$PATH" bats -t $(patsubst %,test/%.bats,$(TEST)) -%.bats: crio-lxc - PACKAGES_DIR=$(PACKAGES_DIR) sudo -E "PATH=$$PATH" bats -t test/$@ +test/*.bats: crio-lxc + PACKAGES_DIR=$(PACKAGES_DIR) sudo -E "PATH=$$PATH" bats -t $@ .PHONY: vendorup vendorup:
_______________________________________________ lxc-devel mailing list [email protected] http://lists.linuxcontainers.org/listinfo/lxc-devel
