The following pull request was submitted through Github. It can be accessed and reviewed at: https://github.com/lxc/lxc/pull/2020
This e-mail was sent by the LXC bot, direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === When I first solved this problem I went for a fork() + setns() + clone() model. This works fine but has unnecessary overhead for a couple of reasons: - doing a full fork() including copying file descriptor table and virtual memory - using pipes to retrieve the pid of the second child (the actual container process) This can all be avoided by being a little smart in how we employ the clone() syscall: - using CLONE_VM will let us get rid of using pipes since we can simply write to the handler because we share the memory with our parent - using CLONE_VFORK will also let us get rid of using pipes since the execution of the parent is suspended until the child returns - using CLONE_VM will not cause virtual memory to be copied - using CLONE_FILES will not cause the file descriptor table to be copied Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
From 6804ba45795ebfa9369a5447bf94372ae61228d1 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Mon, 11 Dec 2017 12:55:23 +0100 Subject: [PATCH 1/2] start: intelligently use clone() on ns sharing When I first solved this problem I went for a fork() + setns() + clone() model. This works fine but has unnecessary overhead for a couple of reasons: - doing a full fork() including copying file descriptor table and virtual memory - using pipes to retrieve the pid of the second child (the actual container process) This can all be avoided by being a little smart in how we employ the clone() syscall: - using CLONE_VM will let us get rid of using pipes since we can simply write to the handler because we share the memory with our parent - using CLONE_VFORK will also let us get rid of using pipes since the execution of the parent is suspended until the child returns - using CLONE_VM will let not cause virtual memory to be copied - using CLONE_FILES will not cause the file descriptor table to be copied Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- src/lxc/start.c | 61 +++++++++++++++++++++++---------------------------------- src/lxc/start.h | 6 ++++++ 2 files changed, 30 insertions(+), 37 deletions(-) diff --git a/src/lxc/start.c b/src/lxc/start.c index b82768687..d56e87d53 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -1157,43 +1157,30 @@ void resolve_clone_flags(struct lxc_handler *handler) INFO("Inheriting pid namespace"); } -static pid_t lxc_fork_attach_clone(int (*fn)(void *), void *arg, int flags) + +static inline int do_share_ns(void *arg) { - int i, r, ret; - pid_t pid, init_pid; + int i, flags, ret; struct lxc_handler *handler = arg; - pid = fork(); - if (pid < 0) - return -1; - - if (!pid) { - for (i = 0; i < LXC_NS_MAX; i++) { - if (handler->nsfd[i] < 0) - continue; - - ret = setns(handler->nsfd[i], 0); - if (ret < 0) - exit(EXIT_FAILURE); - - DEBUG("Inherited %s namespace", ns_info[i].proc_name); - } + for (i = 0; i < LXC_NS_MAX; i++) { + if (handler->nsfd[i] < 0) + continue; - init_pid = lxc_clone(do_start, handler, flags); - ret = lxc_write_nointr(handler->data_sock[1], &init_pid, - sizeof(init_pid)); + ret = setns(handler->nsfd[i], 0); if (ret < 0) - exit(EXIT_FAILURE); + return -1; - exit(EXIT_SUCCESS); + DEBUG("Inherited %s namespace", ns_info[i].proc_name); } - r = lxc_read_nointr(handler->data_sock[0], &init_pid, sizeof(init_pid)); - ret = wait_for_pid(pid); - if (ret < 0 || r < 0) + flags = handler->on_clone_flags; + flags |= CLONE_PARENT; + handler->pid = lxc_clone(do_start, handler, flags); + if (handler->pid < 0) return -1; - return init_pid; + return 0; } /* lxc_spawn() performs crucial setup tasks and clone()s the new process which @@ -1205,13 +1192,13 @@ static pid_t lxc_fork_attach_clone(int (*fn)(void *), void *arg, int flags) */ static int lxc_spawn(struct lxc_handler *handler) { - int i, flags, ret; + int i, ret; char pidstr[20]; bool wants_to_map_ids; struct lxc_list *id_map; const char *name = handler->name; const char *lxcpath = handler->lxcpath; - bool cgroups_connected = false, fork_before_clone = false; + bool cgroups_connected = false, share_ns = false; struct lxc_conf *conf = handler->conf; id_map = &conf->id_map; @@ -1225,7 +1212,7 @@ static int lxc_spawn(struct lxc_handler *handler) if (handler->nsfd[i] < 0) return -1; - fork_before_clone = true; + share_ns = true; } if (lxc_sync_init(handler)) @@ -1288,28 +1275,28 @@ static int lxc_spawn(struct lxc_handler *handler) } /* Create a process in a new set of namespaces. */ - flags = handler->clone_flags; + handler->on_clone_flags = handler->clone_flags; if (handler->clone_flags & CLONE_NEWUSER) { /* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to * clone a new user namespace first and only later unshare our * network namespace to ensure that network devices ownership is * set up correctly. */ - flags &= ~CLONE_NEWNET; + handler->on_clone_flags &= ~CLONE_NEWNET; } - if (fork_before_clone) - handler->pid = lxc_fork_attach_clone(do_start, handler, flags | CLONE_PARENT); + if (share_ns) + ret = lxc_clone(do_share_ns, handler, CLONE_VFORK | CLONE_VM | CLONE_FILES); else - handler->pid = lxc_clone(do_start, handler, flags); - if (handler->pid < 0) { + handler->pid = lxc_clone(do_start, handler, handler->on_clone_flags); + if (handler->pid < 0 || ret < 0) { SYSERROR("Failed to clone a new set of namespaces."); goto out_delete_net; } TRACE("Cloned child process %d", handler->pid); for (i = 0; i < LXC_NS_MAX; i++) - if (flags & ns_info[i].clone_flag) + if (handler->on_clone_flags & ns_info[i].clone_flag) INFO("Cloned %s", ns_info[i].flag_name); if (!preserve_ns(handler->nsfd, handler->clone_flags & ~CLONE_NEWNET, handler->pid)) { diff --git a/src/lxc/start.h b/src/lxc/start.h index e2b13141b..2161565d4 100644 --- a/src/lxc/start.h +++ b/src/lxc/start.h @@ -40,6 +40,12 @@ struct lxc_handler { /* The clone flags that were requested. */ int clone_flags; + /* The clone flags to actually use when calling lxc_clone(). They may + * differ from clone_flags because of ordering requirements (e.g. + * CLONE_NEWNET and CLONE_NEWUSER). + */ + int on_clone_flags; + /* File descriptor to pin the rootfs for privileged containers. */ int pinfd; From fae592368e1ddfe7a91e85c7186e2d8d24f004d0 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Mon, 11 Dec 2017 14:47:24 +0100 Subject: [PATCH 2/2] tests: add namespace sharing tests This also ensures that the new more efficient clone() way of sharing namespaces is tested. Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- src/tests/Makefile.am | 6 +- src/tests/share_ns.c | 317 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 321 insertions(+), 2 deletions(-) create mode 100644 src/tests/share_ns.c diff --git a/src/tests/Makefile.am b/src/tests/Makefile.am index f223463d7..b38c93c67 100644 --- a/src/tests/Makefile.am +++ b/src/tests/Makefile.am @@ -31,6 +31,7 @@ lxc_test_config_jump_table_SOURCES = config_jump_table.c lxctest.h lxc_test_shortlived_SOURCES = shortlived.c lxc_test_livepatch_SOURCES = livepatch.c lxctest.h lxc_test_state_server_SOURCES = state_server.c lxctest.h +lxc_test_share_ns_SOURCES = share_ns.c lxctest.h AM_CFLAGS=-DLXCROOTFSMOUNT=\"$(LXCROOTFSMOUNT)\" \ -DLXCPATH=\"$(LXCPATH)\" \ @@ -60,7 +61,7 @@ bin_PROGRAMS = lxc-test-containertests lxc-test-locktests lxc-test-startone \ lxc-test-reboot lxc-test-list lxc-test-attach lxc-test-device-add-remove \ lxc-test-apparmor lxc-test-utils lxc-test-parse-config-file \ lxc-test-config-jump-table lxc-test-shortlived lxc-test-livepatch \ - lxc-test-api-reboot lxc-test-state-server + lxc-test-api-reboot lxc-test-state-server lxc-test-share-ns bin_SCRIPTS = lxc-test-automount \ lxc-test-autostart \ @@ -121,7 +122,8 @@ EXTRA_DIST = \ shutdowntest.c \ snapshot.c \ startone.c \ - state_server.c + state_server.c \ + share_ns.c clean-local: rm -f lxc-test-utils-* diff --git a/src/tests/share_ns.c b/src/tests/share_ns.c new file mode 100644 index 000000000..620527699 --- /dev/null +++ b/src/tests/share_ns.c @@ -0,0 +1,317 @@ +/* liblxcapi + * + * Copyright © 2017 Christian Brauner <christian.brau...@ubuntu.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <alloca.h> +#include <errno.h> +#include <pthread.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/reboot.h> +#include <sys/types.h> +#include <sys/wait.h> + +#include "lxc/lxccontainer.h" +#include "lxctest.h" + +struct thread_args { + int thread_id; + int timeout; + bool success; + pid_t init_pid; + char *inherited_ipc_ns; + char *inherited_net_ns; +}; + +void *ns_sharing_wrapper(void *data) +{ + int init_pid; + ssize_t ret; + char name[100]; + char owning_ns_init_pid[100]; + char proc_ns_path[4096]; + char ns_buf[4096]; + struct lxc_container *c; + struct thread_args *args = data; + + lxc_debug("Starting namespace sharing thread %d\n", args->thread_id); + + sprintf(name, "share-ns-%d", args->thread_id); + c = lxc_container_new(name, NULL); + if (!c) { + lxc_error("Failed to create container \"%s\"\n", name); + goto out; + } + + if (c->is_defined(c)) { + lxc_error("Container \"%s\" is defined\n", name); + goto out; + } + + if (!c->createl(c, "busybox", NULL, NULL, 0, NULL)) { + lxc_error("Failed to create busybox container \"%s\"\n", name); + goto out; + } + + if (!c->is_defined(c)) { + lxc_error("Container \"%s\" is not defined\n", name); + goto out; + } + + if (!c->load_config(c, NULL)) { + lxc_error("Failed to load config for container \"%s\"\n", name); + goto out; + } + + /* share ipc namespace by container name */ + if (!c->set_config_item(c, "lxc.namespace.ipc", "owning-ns")) { + lxc_error("Failed to set \"lxc.namespace.ipc=owning-ns\" for container \"%s\"\n", name); + goto out; + } + + /* clear all network configuration */ + if (!c->set_config_item(c, "lxc.net", "")) { + lxc_error("Failed to set \"lxc.namespace.ipc=owning-ns\" for container \"%s\"\n", name); + goto out; + } + + if (!c->set_config_item(c, "lxc.net.0.type", "empty")) { + lxc_error("Failed to set \"lxc.net.0.type=empty\" for container \"%s\"\n", name); + goto out; + } + + sprintf(owning_ns_init_pid, "%d", args->init_pid); + /* share net namespace by pid */ + if (!c->set_config_item(c, "lxc.namespace.net", owning_ns_init_pid)) { + lxc_error("Failed to set \"lxc.namespace.net=%s\" for container \"%s\"\n", owning_ns_init_pid, name); + goto out; + } + + if (!c->want_daemonize(c, true)) { + lxc_error("Failed to mark container \"%s\" daemonized\n", name); + goto out; + } + + if (!c->startl(c, 0, NULL)) { + lxc_error("Failed to start container \"%s\" daemonized\n", name); + goto out; + } + + init_pid = c->init_pid(c); + if (init_pid < 0) { + lxc_error("Failed to retrieve init pid of container \"%s\"\n", name); + goto out; + } + + /* Check whether we correctly inherited the ipc namespace. */ + ret = snprintf(proc_ns_path, sizeof(proc_ns_path), "/proc/%d/ns/ipc", init_pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_ns_path)) { + lxc_error("Failed to create string for container \"%s\"\n", name); + goto out; + } + + ret = readlink(proc_ns_path, ns_buf, sizeof(ns_buf)); + if (ret < 0 || (size_t)ret >= sizeof(ns_buf)) { + lxc_error("Failed to retrieve ipc namespace for container \"%s\"\n", name); + goto out; + } + ns_buf[ret] = '\0'; + + if (strcmp(args->inherited_ipc_ns, ns_buf) != 0) { + lxc_error("Failed to inherit ipc namespace from container \"owning-ns\": %s != %s\n", args->inherited_ipc_ns, ns_buf); + goto out; + } + lxc_debug("Inherited ipc namespace from container \"owning-ns\": %s == %s\n", args->inherited_ipc_ns, ns_buf); + + /* Check whether we correctly inherited the net namespace. */ + ret = snprintf(proc_ns_path, sizeof(proc_ns_path), "/proc/%d/ns/net", init_pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_ns_path)) { + lxc_error("Failed to create string for container \"%s\"\n", name); + goto out; + } + + ret = readlink(proc_ns_path, ns_buf, sizeof(ns_buf)); + if (ret < 0 || (size_t)ret >= sizeof(ns_buf)) { + lxc_error("Failed to retrieve ipc namespace for container \"%s\"\n", name); + goto out; + } + ns_buf[ret] = '\0'; + + if (strcmp(args->inherited_net_ns, ns_buf) != 0) { + lxc_error("Failed to inherit net namespace from container \"owning-ns\": %s != %s\n", args->inherited_net_ns, ns_buf); + goto out; + } + lxc_debug("Inherited net namespace from container \"owning-ns\": %s == %s\n", args->inherited_net_ns, ns_buf); + + args->success = true; + +out: + if (c->is_running(c) && !c->stop(c)) { + lxc_error("Failed to stop container \"%s\"\n", name); + goto out; + } + + if (!c->destroy(c)) { + lxc_error("Failed to destroy container \"%s\"\n", name); + goto out; + } + + pthread_exit(NULL); + return NULL; +} + +int main(int argc, char *argv[]) +{ + int i, init_pid, j; + char proc_ns_path[4096]; + char ipc_ns_buf[4096]; + char net_ns_buf[4096]; + pthread_attr_t attr; + pthread_t threads[10]; + struct thread_args args[10]; + struct lxc_container *c; + int ret = EXIT_FAILURE; + + c = lxc_container_new("owning-ns", NULL); + if (!c) { + lxc_error("%s", "Failed to create container \"owning-ns\""); + exit(ret); + } + + if (c->is_defined(c)) { + lxc_error("%s\n", "Container \"owning-ns\" is defined"); + goto on_error_put; + } + + if (!c->createl(c, "busybox", NULL, NULL, 0, NULL)) { + lxc_error("%s\n", "Failed to create busybox container \"owning-ns\""); + goto on_error_put; + } + + if (!c->is_defined(c)) { + lxc_error("%s\n", "Container \"owning-ns\" is not defined"); + goto on_error_put; + } + + c->clear_config(c); + + if (!c->load_config(c, NULL)) { + lxc_error("%s\n", "Failed to load config for container \"owning-ns\""); + goto on_error_stop; + } + + if (!c->want_daemonize(c, true)) { + lxc_error("%s\n", "Failed to mark container \"owning-ns\" daemonized"); + goto on_error_stop; + } + + if (!c->startl(c, 0, NULL)) { + lxc_error("%s\n", "Failed to start container \"owning-ns\" daemonized"); + goto on_error_stop; + } + + init_pid = c->init_pid(c); + if (init_pid < 0) { + lxc_error("%s\n", "Failed to retrieve init pid of container \"owning-ns\""); + goto on_error_stop; + } + + /* record our ipc namespace */ + ret = snprintf(proc_ns_path, sizeof(proc_ns_path), "/proc/%d/ns/ipc", init_pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_ns_path)) { + lxc_error("%s\n", "Failed to create string for container \"owning-ns\""); + goto on_error_stop; + } + + ret = readlink(proc_ns_path, ipc_ns_buf, sizeof(ipc_ns_buf)); + if (ret < 0 || (size_t)ret >= sizeof(ipc_ns_buf)) { + lxc_error("%s\n", "Failed to retrieve ipc namespace for container \"owning-ns\""); + goto on_error_stop; + + } + ipc_ns_buf[ret] = '\0'; + + /* record our net namespace */ + ret = snprintf(proc_ns_path, sizeof(proc_ns_path), "/proc/%d/ns/net", init_pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_ns_path)) { + lxc_error("%s\n", "Failed to create string for container \"owning-ns\""); + goto on_error_stop; + } + + ret = readlink(proc_ns_path, net_ns_buf, sizeof(net_ns_buf)); + if (ret < 0 || (size_t)ret >= sizeof(net_ns_buf)) { + lxc_error("%s\n", "Failed to retrieve ipc namespace for container \"owning-ns\""); + goto on_error_stop; + } + net_ns_buf[ret] = '\0'; + + sleep(5); + + pthread_attr_init(&attr); + + for (j = 0; j < 10; j++) { + lxc_debug("Starting namespace sharing test iteration %d\n", j); + + for (i = 0; i < 10; i++) { + int ret; + + args[i].thread_id = i; + args[i].init_pid = init_pid; + args[i].timeout = -1; + args[i].inherited_ipc_ns = ipc_ns_buf; + args[i].inherited_net_ns = net_ns_buf; + /* test non-blocking shutdown request */ + if (i == 0) + args[i].timeout = 0; + + ret = pthread_create(&threads[i], &attr, ns_sharing_wrapper, (void *) &args[i]); + if (ret != 0) + goto on_error_stop; + } + + for (i = 0; i < 10; i++) { + int ret; + + ret = pthread_join(threads[i], NULL); + if (ret != 0) + goto on_error_stop; + + if (!args[i].success) { + lxc_error("ns sharing thread %d failed\n", args[i].thread_id); + goto on_error_stop; + } + } + } + + ret = EXIT_SUCCESS; + +on_error_stop: + if (c->is_running(c) && !c->stop(c)) + lxc_error("%s\n", "Failed to stop container \"owning-ns\""); + + if (!c->destroy(c)) + lxc_error("%s\n", "Failed to destroy container \"owning-ns\""); + +on_error_put: + lxc_container_put(c); + if (ret == EXIT_SUCCESS) + lxc_debug("%s\n", "All state namespace sharing tests passed"); + exit(ret); +}
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel