Add an operation, SECCOMP_CLONE_FILTER, that can copy the seccomp filters
from another process to the current process.

I roughly reproduced the Docker seccomp filter [1] and timed how long it
takes to build it (via libseccomp) and attach it to a process.  After
1000 runs, on average it took 3,740,000 TSC ticks (or ~1440 microseconds)
on an AMD EPYC 9J14 running at 2596 MHz.  The median build/load time was
3,715,000 TSC ticks.

On the same system, I preloaded the above Docker seccomp filter onto a
process.  (Note that I opened a pidfd to the reference process and left
the pidfd open for the entire run.)  I then cloned the filter using the
feature in this patch to 1000 new processes.  On average, it took 9,300
TSC ticks (or ~3.6 microseconds) to copy the filter to the new processes.
The median clone time was 9,048 TSC ticks.

This is approximately a 400x performance improvement for those container
managers that are using the exact same seccomp filter across all of their
containers.

[1] https://raw.githubusercontent.com/moby/moby/refs/heads/master/profiles/seccomp/default.json

Signed-off-by: Tom Hromatka <[email protected]>
---
 .../userspace-api/seccomp_filter.rst          |  10 ++
 include/uapi/linux/seccomp.h                  |   1 +
 kernel/seccomp.c                              |  48 ++++++
 samples/seccomp/Makefile                      |   2 +-
 samples/seccomp/clone-filter.c                | 143 ++++++++++++++++++
 tools/include/uapi/linux/seccomp.h            |   1 +
 tools/testing/selftests/seccomp/seccomp_bpf.c |  71 +++++++++
 7 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 samples/seccomp/clone-filter.c

diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index cff0fa7f3175..ef1797d093f6 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -289,6 +289,16 @@ above in this document: all arguments being read from the tracee's memory
 should be read into the tracer's memory before any policy decisions are made.
 This allows for an atomic decision on syscall arguments.
 
+Cloning an Existing Seccomp Filter
+==================================
+
+Constructing and loading a complex seccomp filter can take a non-trivial
+amount of time. If the same filter is needed in more than one process, it can
+be cloned from a process that already has it via the ``SECCOMP_CLONE_FILTER``
+operation, whose argument points to a pidfd for that process. The caller needs
+``CAP_SYS_ADMIN`` and must not have any seccomp filters already applied to
+it. See ``samples/seccomp/clone-filter.c`` for an example.
+
 Sysctls
 =======
 
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index dbfc9b37fcae..b0917e333b4b 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,6 +16,7 @@
 #define SECCOMP_SET_MODE_FILTER                1
 #define SECCOMP_GET_ACTION_AVAIL       2
 #define SECCOMP_GET_NOTIF_SIZES                3
+#define SECCOMP_CLONE_FILTER           4
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
 #define SECCOMP_FILTER_FLAG_TSYNC              (1UL << 0)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 41aa761c7738..b726e0d6715d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -2081,6 +2081,49 @@ static long seccomp_get_notif_sizes(void __user *usizes)
        return 0;
 }
 
+/* Share the seccomp filters of the pidfd target (@upidfd) with current. */
+static long seccomp_clone_filter(void __user *upidfd)
+{
+       struct seccomp_filter *filter = NULL;
+       struct task_struct *task;
+       unsigned long irqflags;
+       unsigned int flags;
+       long ret = -EINVAL;
+       int pidfd, count = 0;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (copy_from_user(&pidfd, upidfd, sizeof(pidfd)))
+               return -EFAULT;
+       task = pidfd_get_task(pidfd, &flags);
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+
+       /* Pin the source chain. The two siglocks are never nested: no ABBA. */
+       if (lock_task_sighand(task, &irqflags)) {       /* NULL once exited */
+               filter = task->seccomp.filter;
+               count = atomic_read(&task->seccomp.filter_count);
+               get_seccomp_filter(task);
+               unlock_task_sighand(task, &irqflags);
+       }
+       put_task_struct(task);
+       if (!filter)
+               return -EINVAL;
+
+       /* Re-check under the lock: never clobber strict mode or a racing TSYNC. */
+       spin_lock_irq(&current->sighand->siglock);
+       if (current->seccomp.mode == SECCOMP_MODE_DISABLED) {
+               current->seccomp.filter = filter;
+               atomic_set(&current->seccomp.filter_count, count);
+               seccomp_assign_mode(current, SECCOMP_MODE_FILTER, 0);
+               ret = 0;
+       }
+       spin_unlock_irq(&current->sighand->siglock);
+       if (ret)
+               __seccomp_filter_release(filter);       /* undo the pin */
+       return ret;
+}
+
 /* Common entry point for both prctl and syscall. */
 static long do_seccomp(unsigned int op, unsigned int flags,
                       void __user *uargs)
@@ -2102,6 +2145,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
                        return -EINVAL;
 
                return seccomp_get_notif_sizes(uargs);
+       case SECCOMP_CLONE_FILTER:
+               if (flags != 0)
+                       return -EINVAL;
+
+               return seccomp_clone_filter(uargs);
        default:
                return -EINVAL;
        }
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
index c85ae0ed8342..d38977f41b86 100644
--- a/samples/seccomp/Makefile
+++ b/samples/seccomp/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-userprogs-always-y += bpf-fancy dropper bpf-direct user-trap
+userprogs-always-y += bpf-fancy dropper bpf-direct user-trap clone-filter
 
 bpf-fancy-objs := bpf-fancy.o bpf-helper.o
 
diff --git a/samples/seccomp/clone-filter.c b/samples/seccomp/clone-filter.c
new file mode 100644
index 000000000000..d26e1375b9dc
--- /dev/null
+++ b/samples/seccomp/clone-filter.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Seccomp filter example for cloning a filter
+ *
+ * Copyright (c) 2025 Oracle and/or its affiliates.
+ * Author: Tom Hromatka <[email protected]>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications that reuse the same seccomp filter
+ * across many processes.
+ */
+#include <linux/seccomp.h>
+#include <linux/filter.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+       errno = 0;      /* clear errno before issuing the raw seccomp syscall */
+       return syscall(__NR_seccomp, op, flags, args);
+}
+
+static int install_filter(void)
+{
+       struct sock_filter deny_filter[] = {
+               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,  /* load the syscall number */
+                       offsetof(struct seccomp_data, nr)),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | ESRCH), /* getppid */
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),     /* all others */
+       };
+       struct sock_fprog deny_prog = {
+               .len = (unsigned short)ARRAY_SIZE(deny_filter),
+               .filter = deny_filter,
+       };
+
+       return seccomp(SECCOMP_SET_MODE_FILTER, 0, &deny_prog);
+}
+
+/* Clone ref_pid's filter onto the caller; 0 on success, -1 with errno set. */
+static int clone_filter(pid_t ref_pid)
+{
+       int ref_pidfd, ret, saved_errno;
+
+       ref_pidfd = syscall(SYS_pidfd_open, ref_pid, 0);
+       if (ref_pidfd < 0)
+               return -1;
+       ret = seccomp(SECCOMP_CLONE_FILTER, 0, &ref_pidfd);
+       saved_errno = errno;    /* close() may overwrite it before perror() */
+       close(ref_pidfd);
+       errno = saved_errno;
+       return ret;
+}
+
+/* Reference process: install the filter, then idle until main() kills it. */
+static void do_ref_filter(void)
+{
+       int ret = install_filter();
+
+       if (ret) {
+               perror("Failed to install ref filter");
+               exit(1);
+       }
+
+       while (true)
+               sleep(1);
+}
+
+/* Child: clone the filter (or build it) and check that it is enforced. */
+static void do_child_process(pid_t ref_pid)
+{
+       pid_t res;
+
+       if (clone_filter(ref_pid) != 0) {
+               perror("Failed to clone filter. Installing filter from scratch");
+               if (install_filter() != 0) {
+                       perror("Filter install failed");
+                       exit(1);
+               }
+       }
+
+       /* getpid() is not matched by the filter and must still work. */
+       res = syscall(__NR_getpid);
+       if (res < 0) {
+               perror("getpid() unexpectedly failed");
+               exit(1);
+       }
+
+       /* getppid() is the one syscall the filter fails (with ESRCH). */
+       res = syscall(__NR_getppid);
+       if (res > 0) {
+               /* No errno to report on success, so perror() would mislead. */
+               fprintf(stderr, "getppid() unexpectedly succeeded\n");
+               exit(1);
+       }
+
+       exit(0);
+}
+
+/* Fork a reference process and a child that clones its filter; 0 on success. */
+int main(void)
+{
+       pid_t ref_pid, child_pid;
+       int ret = 1, status;    /* failure unless the child exits with status 0 */
+
+       ref_pid = fork();
+       if (ref_pid < 0)
+               exit(1);
+       else if (ref_pid == 0)
+               do_ref_filter();
+
+       child_pid = fork();
+       if (child_pid < 0)
+               goto out;
+       else if (child_pid == 0)
+               do_child_process(ref_pid);
+
+       if (waitpid(child_pid, &status, 0) != child_pid) {
+               perror("waitpid");
+       } else if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+               fprintf(stderr, "child process failed\n");
+       } else {
+               ret = 0;
+       }
+
+out:
+       /* The child has been waited for (or was never created); only the
+        * reference process is left to stop and collect. */
+       kill(ref_pid, SIGKILL);
+       waitpid(ref_pid, NULL, 0);
+
+       exit(ret);
+}
diff --git a/tools/include/uapi/linux/seccomp.h b/tools/include/uapi/linux/seccomp.h
index dbfc9b37fcae..b0917e333b4b 100644
--- a/tools/include/uapi/linux/seccomp.h
+++ b/tools/include/uapi/linux/seccomp.h
@@ -16,6 +16,7 @@
 #define SECCOMP_SET_MODE_FILTER                1
 #define SECCOMP_GET_ACTION_AVAIL       2
 #define SECCOMP_GET_NOTIF_SIZES                3
+#define SECCOMP_CLONE_FILTER           4
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
 #define SECCOMP_FILTER_FLAG_TSYNC              (1UL << 0)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 61acbd45ffaa..df5e0f615da0 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -177,6 +177,10 @@ struct seccomp_data {
 #define SECCOMP_GET_NOTIF_SIZES 3
 #endif
 
+#ifndef SECCOMP_CLONE_FILTER
+#define SECCOMP_CLONE_FILTER 4
+#endif
+
 #ifndef SECCOMP_FILTER_FLAG_TSYNC
 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
 #endif
@@ -5090,6 +5094,73 @@ TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
        ASSERT_EQ(0, run_probed_with_filter(&prog));
 }
 
+/* SECCOMP_CLONE_FILTER copies a forked child's filter onto the caller. */
+TEST(clone_filter)
+{
+       struct sock_filter deny_filter[] = {
+               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+                       offsetof(struct seccomp_data, nr)),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | ESRCH),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+       };
+       struct sock_fprog deny_prog = {
+               .len = (unsigned short)ARRAY_SIZE(deny_filter),
+               .filter = deny_filter,
+       };
+       struct timespec ts = {
+               .tv_sec = 0,
+               .tv_nsec = 100000000,
+       };
+       pid_t child_pid, self_pid, res;
+       int child_pidfd, ret;
+
+       /* Only real root can copy a filter. */
+       if (geteuid()) {
+               SKIP(return, "clone_filter requires real root");
+       }
+
+       self_pid = getpid();
+
+       child_pid = fork();
+       ASSERT_LE(0, child_pid);
+
+       if (child_pid == 0) {
+               ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+               ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &deny_prog));
+
+               while (true)
+                       EXPECT_EQ(0, syscall(__NR_nanosleep, &ts, NULL));
+       }
+
+       /* wait for the child pid to create its seccomp filter */
+       ASSERT_EQ(0, syscall(__NR_nanosleep, &ts, NULL));
+
+       child_pidfd = syscall(SYS_pidfd_open, child_pid, 0);
+       EXPECT_LE(0, child_pidfd);
+
+       /* Invalid flag provided */
+       ret = seccomp(SECCOMP_CLONE_FILTER, 1, &child_pidfd);
+       EXPECT_EQ(-1, ret);
+       EXPECT_EQ(errno, EINVAL);
+
+       errno = 0;
+       ret = seccomp(SECCOMP_CLONE_FILTER, 0, &child_pidfd);
+       EXPECT_EQ(0, ret);
+       EXPECT_EQ(errno, 0);
+
+       res = syscall(__NR_getppid);
+       EXPECT_EQ(res, -1);
+       EXPECT_EQ(errno, ESRCH);
+
+       res = syscall(__NR_getpid);
+       EXPECT_EQ(res, self_pid);
+
+       close(child_pidfd);
+       kill(child_pid, SIGKILL);
+       waitpid(child_pid, NULL, 0);    /* reap the killed child */
+}
+
 /*
  * TODO:
  * - expand NNP testing
-- 
2.47.3


Reply via email to