Add a fully featured unshare implementation supporting all arguments accepted by the upstream version.
Signed-off-by: Bartosz Golaszewski <bartekg...@gmail.com> --- util-linux/unshare.c | 465 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 465 insertions(+) create mode 100644 util-linux/unshare.c diff --git a/util-linux/unshare.c b/util-linux/unshare.c new file mode 100644 index 0000000..742d336 --- /dev/null +++ b/util-linux/unshare.c @@ -0,0 +1,465 @@ +/* vi: set sw=4 ts=4: */ +/* + * Mini unshare implementation for busybox. + * + * Copyright (C) 2016 by Bartosz Golaszewski <bartekg...@gmail.com> + * + * Licensed under GPLv2 or later, see file LICENSE in this source tree. + */ + +//config:config UNSHARE +//config: bool "unshare" +//config: default y +//config: select PLATFORM_LINUX +//config: help +//config: Run program with some namespaces unshared from parent. +//config: +//config:config FEATURE_UNSHARE_LONG_OPTS +//config: bool "enable long options" +//config: default y +//config: depends on UNSHARE && LONG_OPTS +//config: help +//config: Support long options for the unshare applet. This makes +//config: the busybox implementation more compatible with upstream. 
+ +//applet:IF_UNSHARE(APPLET(unshare, BB_DIR_USR_BIN, BB_SUID_DROP)) + +//kbuild:lib-$(CONFIG_UNSHARE) += unshare.o + +//usage:#define unshare_trivial_usage +//usage: "[options] <program> [args...]" +//usage:#if ENABLE_FEATURE_UNSHARE_LONG_OPTS +//usage:#define unshare_full_usage "\n\n" +//usage: "Options:" +//usage: "\n -m, --mount[=<file>] unshare mounts namespace" +//usage: "\n -u, --uts[=<file>] unshare UTS namespace (hostname etc.)" +//usage: "\n -i, --ipc[=<file>] unshare System V IPC namespace" +//usage: "\n -n, --network[=<file>] unshare network namespace" +//usage: "\n -p, --pid[=<file>] unshare pid namespace" +//usage: "\n -U, --user[=<file>] unshare user namespace" +//usage: "\n -f, --fork fork before launching <program>" +//usage: "\n -M, --mount-proc[=<dir>] mount proc filesystem first (implies --mount)" +//usage: "\n -r, --map-root-user map current user to root (implies --user)" +//usage: "\n -P, --propagation slave|shared|private|unchanged" +//usage: "\n modify mount propagation in mount namespace" +//usage: "\n -s, --setgroups allow|deny control the setgroups syscall in user namespaces" +//usage:#else +//usage:#define unshare_full_usage "\n\n" +//usage: "Options:" +//usage: "\n -m [<file>] unshare mounts namespace" +//usage: "\n -u [<file>] unshare UTS namespace (hostname etc.)" +//usage: "\n -i [<file>] unshare System V IPC namespace" +//usage: "\n -n [<file>] unshare network namespace" +//usage: "\n -p [<file>] unshare pid namespace" +//usage: "\n -U [<file>] unshare user namespace" +//usage: "\n -f fork before launching <program>" +//usage: "\n -M [<dir>] mount proc filesystem first (implies -m)" +//usage: "\n -r map current user to root (implies -u)" +//usage: "\n -P slave|shared|private|unchanged" +//usage: "\n modify mount propagation in mount namespace" +//usage: "\n -s allow|deny ontrol the setgroups syscall in user namespaces" +//usage:#endif + +#include "libbb.h" + +#include <sched.h> +#include <sys/types.h> +#include <sys/mount.h> + +/* 
+ * Longest possible path to a procfs file used in unshare. Must be able to + * contain the '/proc/' string, the '/ns/user' string which is the longest + * namespace name and a 32-bit integer representing the process ID. + */ +#define PROC_PATH_MAX (sizeof("/proc//ns/user") + INT_BUF_MAX(pid_t)) + +#define PATH_PROC_SETGROUPS "/proc/self/setgroups" +#define PATH_PROC_UIDMAP "/proc/self/uid_map" +#define PATH_PROC_GIDMAP "/proc/self/gid_map" + +enum { + OPT_mount = BIT( 0), + OPT_uts = BIT( 1), + OPT_ipc = BIT( 2), + OPT_network = BIT( 3), + OPT_pid = BIT( 4), + OPT_user = BIT( 5), + OPT_fork = BIT( 6), + OPT_mount_proc = BIT( 7), + OPT_map_root = BIT( 8), + OPT_propagation = BIT( 9), + OPT_setgroups = BIT(10), +}; + +enum { + NS_MNT_POS = 0, + NS_UTS_POS, + NS_IPC_POS, + NS_NET_POS, + NS_PID_POS, + NS_USR_POS, + NS_COUNT, +}; + +struct namespace_descr { + const int opt; + const int flag; + const char *nsfile; +}; + +struct namespace_ctx { + char *path; +}; + +struct propagation_mode { + const char *name; + unsigned long flags; +}; + +/* + * Upstream unshare doesn't support short options for --mount-proc and + * --propagation, but let's add them here to let the user use them even with + * long options disabled in busybox config. + */ +static const char opt_str[] = "+m::u::i::n::p::U::fM::rP:s:"; + +/* + * Upstream unshare only accepts optional arguments (namespace mountpoints) + * for long options. We support them for both short (for size reduction + * with LONG_OPTS disabled) and long opts (for upstream compatibility). 
+ */ +#if ENABLE_FEATURE_UNSHARE_LONG_OPTS +static const char unshare_longopts[] ALIGN1 = + "mount\0" Optional_argument "m" + "uts\0" Optional_argument "u" + "ipc\0" Optional_argument "i" + "network\0" Optional_argument "n" + "pid\0" Optional_argument "p" + "user\0" Optional_argument "U" + "fork\0" No_argument "f" + "mount-proc\0" Optional_argument "M" + "map-root-user\0" No_argument "r" + "propagation\0" Required_argument "P" + "setgroups\0" Required_argument "s"; +#endif + +static const struct namespace_descr ns_list[] = { + [NS_MNT_POS] = { + .opt = OPT_mount, + .flag = CLONE_NEWNS, + .nsfile = "mnt", + }, + [NS_UTS_POS] = { + .opt = OPT_uts, + .flag = CLONE_NEWUTS, + .nsfile = "uts", + }, + [NS_IPC_POS] = { + .opt = OPT_ipc, + .flag = CLONE_NEWIPC, + .nsfile = "ipc", + }, + [NS_NET_POS] = { + .opt = OPT_network, + .flag = CLONE_NEWNET, + .nsfile = "net", + }, + [NS_PID_POS] = { + .opt = OPT_pid, + .flag = CLONE_NEWPID, + .nsfile = "pid", + }, + [NS_USR_POS] = { + .opt = OPT_user, + .flag = CLONE_NEWUSER, + .nsfile = "user", + }, +}; + +static unsigned long parse_propagation(const char *prop_str) +{ + static const struct propagation_mode prop_modes[] = { + { + .name = "slave", + .flags = MS_REC | MS_SLAVE, + }, + { + .name = "private", + .flags = MS_REC | MS_PRIVATE, + }, + { + .name = "shared", + .flags = MS_REC | MS_SHARED, + }, + { + .name = "unchanged", + .flags = 0, + } + }; + + int i; + + for (i = 0; i < ARRAY_SIZE(prop_modes); i++) { + if (strcmp(prop_modes[i].name, prop_str) == 0) + return prop_modes[i].flags; + } + + bb_error_msg_and_die("unsupported propagation mode: %s", prop_str); +} + +static ino_t get_mnt_ns_inode_by_pid(pid_t pid) +{ + char path[PROC_PATH_MAX]; + struct stat statbuf; + + snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid); + xstat(path, &statbuf); + + return statbuf.st_ino; +} + +static void mount_namespaces(pid_t pid, struct namespace_ctx *ns_ctx_list) +{ + const struct namespace_descr *ns; + struct namespace_ctx *ns_ctx; + char 
nsf[PROC_PATH_MAX]; + int i, status; + + for (i = 0; i < NS_COUNT; i++) { + ns = &ns_list[i]; + ns_ctx = &ns_ctx_list[i]; + + if (!ns_ctx->path) + continue; + + snprintf(nsf, sizeof(nsf), "/proc/%d/ns/%s", pid, ns->nsfile); + + status = mount(nsf, ns_ctx->path, NULL, MS_BIND, NULL); + if (status < 0) { + bb_perror_msg_and_die("mount %s on %s failed", + nsf, ns_ctx->path); + } + } +} + +int unshare_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; +int unshare_main(int argc UNUSED_PARAM, char **argv) +{ + int unsflags = 0, i, need_mount = 0, status, setgrp_allow = 0; + const char *proc_mnt_target = "/proc", *prop_str, *setgrp_str; + unsigned long prop_flags = MS_REC | MS_PRIVATE; + struct namespace_ctx ns_ctx_list[NS_COUNT]; + uid_t reuid = geteuid(); + gid_t regid = getegid(); + unsigned int opts; + pid_t pid = -1; + + IF_FEATURE_UNSHARE_LONG_OPTS(applet_long_options = unshare_longopts); + + memset(ns_ctx_list, 0, sizeof(struct namespace_ctx) * NS_COUNT); + + opts = getopt32(argv, opt_str, + &ns_ctx_list[NS_MNT_POS].path, + &ns_ctx_list[NS_UTS_POS].path, + &ns_ctx_list[NS_IPC_POS].path, + &ns_ctx_list[NS_NET_POS].path, + &ns_ctx_list[NS_PID_POS].path, + &ns_ctx_list[NS_USR_POS].path, + &proc_mnt_target, &prop_str, &setgrp_str); + argv += optind; + + /* + * Mounting the proc filesystem before running the program implies + * creating a new mount namespace since the /proc mount would + * otherwise mess up existing programs on the system. + */ + if (opts & OPT_mount_proc) + opts |= OPT_mount; + + /* Mapping user and group IDs to root implies --user. 
*/ + if (opts & OPT_map_root) + opts |= OPT_user; + + if (opts & OPT_setgroups) { + if (strcmp(setgrp_str, "allow") == 0) { + setgrp_allow = 1; + } else if (strcmp(setgrp_str, "deny") == 0) { + setgrp_allow = 0; + } else { + bb_error_msg_and_die( + "unsupported --setgroups argument '%s'", + setgrp_str); + } + } + + for (i = 0; i < NS_COUNT; i++) { + const struct namespace_descr *ns = &ns_list[i]; + struct namespace_ctx *ns_ctx = &ns_ctx_list[i]; + + if (opts & ns->opt) + unsflags |= ns->flag; + + if (ns_ctx->path) + need_mount = 1; + } + + /* Silently ignore --propagation if --mount is not requested. */ + if ((opts & OPT_propagation) && (opts & OPT_mount)) + prop_flags = parse_propagation(prop_str); + + /* + * Special case: if we were requested to unshare the mount namespace + * AND to make any namespace persistent (by bind mounting it) we need + * to spawn a child process which will wait for the parent to call + * unshare(), then mount parent's namespaces while still in the + * previous namespace. + */ + if (need_mount && (opts & OPT_mount)) { + ino_t inop, inoc; + pid_t ppid; + + /* + * Can't use getppid() in child, as we can be unsharing the + * pid namespace. + */ + ppid = getpid(); + + /* + * Save current process' mount namespace file inode number. We + * will later use it in child process to check if it already + * changed meaning that this process already called unshare(). + */ + inop = get_mnt_ns_inode_by_pid(ppid); + + pid = xfork(); + if (pid == 0) { + /* + * Child - wait until parent calls unshare(). No issue + * in busy-waiting - by the time we get here from + * fork(), the parent has usually already unshared the + * mount namespace. We should spin a few times at most. + * + * XXX Should probably use a pipe to notify the child + * about completing unshare(). + */ + do { + inoc = get_mnt_ns_inode_by_pid(ppid); + } while (inoc == inop); + + /* Mount parent's unshared namespaces. 
*/ + mount_namespaces(ppid, ns_ctx_list); + + return EXIT_SUCCESS; + } /* Parent continues. */ + } + + status = unshare(unsflags); + if (status < 0) + bb_perror_msg_and_die("unshare failed"); + + if (need_mount) { + /* Wait for the child to finish mounting the namespaces. */ + if (opts & OPT_mount) { + int exit_status; + + status = safe_waitpid(pid, &exit_status, 0); + if (status < 0) + bb_perror_msg_and_die("waitpid"); + + if (WIFEXITED(exit_status) && + WEXITSTATUS(exit_status) != EXIT_SUCCESS) + return WEXITSTATUS(status); + } else { + /* + * Regular way - we were requested to mount some other + * namespaces: mount them after the call to unshare(). + */ + mount_namespaces(getpid(), ns_ctx_list); + } + } + + /* + * When we're unsharing the pid namespace, it's not the process that + * calls unshare() that is put into the new namespace, but its first + * child. The user may want to use this option to spawn a new process + * that'll become PID 1 in this new namespace. + */ + if (opts & OPT_fork) { + int exit_status; + + pid = xfork(); + if (pid > 0) { + status = safe_waitpid(pid, &exit_status, 0); + if (status < 0) + bb_perror_msg_and_die("waitpid"); + + if (WIFEXITED(exit_status)) + return WEXITSTATUS(exit_status); + else if (WIFSIGNALED(exit_status)) + kill(getpid(), WTERMSIG(exit_status)); + + bb_error_msg_and_die("child exit failed"); + } /* Child continues. */ + } + + if (opts & OPT_map_root) { + char uidmap_buf[sizeof(unsigned int) * 3 + sizeof(" 0 1")]; + + if ((opts & OPT_setgroups) && setgrp_allow) { + bb_error_msg_and_die( + "options --setgroups=allow and --map-root-user are mutually exclusive"); + } + + /* + * Since Linux 3.19 unprivileged writing of /proc/self/gid_map + * has s been disabled unless /proc/self/setgroups is written + * first to permanently disable the ability to call setgroups + * in that user namespace. 
+ */ + xopen_xwrite_close(PATH_PROC_SETGROUPS, "deny"); + snprintf(uidmap_buf, COMMON_BUFSIZE, "%u 0 1", reuid); + xopen_xwrite_close(PATH_PROC_UIDMAP, uidmap_buf); + snprintf(uidmap_buf, COMMON_BUFSIZE, "%u 0 1", regid); + xopen_xwrite_close(PATH_PROC_GIDMAP, uidmap_buf); + } else if (opts & OPT_setgroups) { + xopen_xwrite_close(PATH_PROC_SETGROUPS, setgrp_str); + } + + if (opts & OPT_mount) { + status = mount("none", "/", NULL, prop_flags, NULL); + if (status < 0) { + bb_perror_msg_and_die( + "cannot change root filesystem propagation"); + } + } + + if (opts & OPT_mount_proc) { + int flags; + + /* + * When creating a new pid namespace, we might want the pid + * subdirectories in /proc to remain consistent with the new + * process IDs. Without --mount-proc the pids in /proc would + * still reflect the old pid namespace. This is why we make + * /proc private here and then do a fresh mount. + */ + flags = MS_PRIVATE | MS_REC; + status = mount("none", proc_mnt_target, NULL, flags, NULL); + if (status == 0) { + flags = MS_NOSUID | MS_NOEXEC | MS_NODEV; + status = mount("proc", proc_mnt_target, + "proc", flags, NULL); + } + + if (status < 0) + bb_perror_msg_and_die("mount %s failed", + proc_mnt_target); + } + + if (*argv) { + execvp(*argv, argv); + bb_perror_msg_and_die("failed to execute %s", *argv); + } + + run_shell(getenv("SHELL"), 0, NULL, NULL); +} -- 2.1.4 _______________________________________________ busybox mailing list busybox@busybox.net http://lists.busybox.net/mailman/listinfo/busybox