On Thu, May 31, 2018 at 08:49:46AM -0600, Tycho Andersen wrote:
> This patch introduces a means for syscalls matched in seccomp to notify
> some other task that a particular filter has been triggered.
> 
> The motivation for this is primarily for use with containers. For example,
> if a container does an init_module(), we obviously don't want to load this
> untrusted code, which may be compiled for the wrong version of the kernel
> anyway. Instead, we could parse the module image, figure out which module
> the container is trying to load and load it on the host.
> 
> As another example, containers cannot mknod(), since this checks
> capable(CAP_SYS_ADMIN). However, harmless devices like /dev/null or
> /dev/zero should be ok for containers to mknod, but we'd like to avoid hard
> coding some whitelist in the kernel. Another example is mount(), which has
> many security restrictions for good reason, but configuration or runtime
> knowledge could potentially be used to relax these restrictions.
> 
> This patch adds functionality that is already possible via at least two
> other means that I know about, both of which involve ptrace(): first, one
> could ptrace attach, and then iterate through syscalls via PTRACE_SYSCALL.
> Unfortunately this is slow, so a faster version would be to install a
> filter that does SECCOMP_RET_TRACE, which triggers a PTRACE_EVENT_SECCOMP.
> Since ptrace allows only one tracer, if the container runtime is that
> tracer, users inside the container (or outside) trying to debug it will not
> be able to use ptrace, which is annoying. It also means that older
> distributions based on Upstart cannot boot inside containers using ptrace,
> since upstart itself uses ptrace to start services.
> 
> The actual implementation of this is fairly small, although getting the
> synchronization right was/is slightly complex.
> 
> Finally, it's worth noting that the classic seccomp TOCTOU of reading
> memory data from the task still applies here, but can be avoided with
> careful design of the userspace handler: if the userspace handler reads all
> of the task memory that is necessary before applying its security policy,
> the tracee's subsequent memory edits will not be read by the tracer.
> 
> v2: * make id a u64; the idea here being that it will never overflow,
>       because 64 is huge (one syscall every nanosecond => wrap every 584
>       years) (Andy)
>     * prevent nesting of user notifications: if someone is already attached
>       the tree in one place, nobody else can attach to the tree (Andy)
>     * notify the listener of signals the tracee receives as well (Andy)
>     * implement poll
> v3: * lockdep fix (Oleg)
>     * drop unnecessary WARN()s (Christian)
>     * rearrange error returns to be more rpetty (Christian)
>     * fix build in !CONFIG_SECCOMP_USER_NOTIFICATION case
> 
> Signed-off-by: Tycho Andersen <ty...@tycho.ws>
> CC: Kees Cook <keesc...@chromium.org>
> CC: Andy Lutomirski <l...@amacapital.net>
> CC: Oleg Nesterov <o...@redhat.com>
> CC: Eric W. Biederman <ebied...@xmission.com>
> CC: "Serge E. Hallyn" <se...@hallyn.com>
> CC: Christian Brauner <christian.brau...@ubuntu.com>
> CC: Tyler Hicks <tyhi...@canonical.com>
> CC: Akihiro Suda <suda.akih...@lab.ntt.co.jp>
> ---
>  arch/Kconfig                                  |   7 +
>  include/linux/seccomp.h                       |   3 +-
>  include/uapi/linux/seccomp.h                  |  18 +-
>  kernel/seccomp.c                              | 398 +++++++++++++++++-
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 195 ++++++++-
>  5 files changed, 615 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 75dd23acf133..1c1ae8d8c8b9 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -401,6 +401,13 @@ config SECCOMP_FILTER
>  
>         See Documentation/prctl/seccomp_filter.txt for details.
>  
> +config SECCOMP_USER_NOTIFICATION
> +     bool "Enable the SECCOMP_RET_USER_NOTIF seccomp action"
> +     depends on SECCOMP_FILTER
> +     help
> +       Enable SECCOMP_RET_USER_NOTIF, a return code which can be used by 
> seccomp
> +       programs to notify a userspace listener that a particular event 
> happened.
> +
>  config HAVE_GCC_PLUGINS
>       bool
>       help
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index c723a5c4e3ff..0fd3e0676a1c 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -5,7 +5,8 @@
>  #include <uapi/linux/seccomp.h>
>  
>  #define SECCOMP_FILTER_FLAG_MASK     (SECCOMP_FILTER_FLAG_TSYNC | \
> -                                      SECCOMP_FILTER_FLAG_LOG)
> +                                      SECCOMP_FILTER_FLAG_LOG | \
> +                                      SECCOMP_FILTER_FLAG_GET_LISTENER)
>  
>  #ifdef CONFIG_SECCOMP
>  
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 2a0bd9dd104d..8160e6cad528 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -17,8 +17,9 @@
>  #define SECCOMP_GET_ACTION_AVAIL     2
>  
>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
> -#define SECCOMP_FILTER_FLAG_TSYNC    1
> -#define SECCOMP_FILTER_FLAG_LOG              2
> +#define SECCOMP_FILTER_FLAG_TSYNC            1
> +#define SECCOMP_FILTER_FLAG_LOG                      2
> +#define SECCOMP_FILTER_FLAG_GET_LISTENER     4
>  
>  /*
>   * All BPF programs must return a 32-bit value.
> @@ -34,6 +35,7 @@
>  #define SECCOMP_RET_KILL      SECCOMP_RET_KILL_THREAD
>  #define SECCOMP_RET_TRAP      0x00030000U /* disallow and force a SIGSYS */
>  #define SECCOMP_RET_ERRNO     0x00050000U /* returns an errno */
> +#define SECCOMP_RET_USER_NOTIF   0x7fc00000U /* notifies userspace */
>  #define SECCOMP_RET_TRACE     0x7ff00000U /* pass to a tracer or disallow */
>  #define SECCOMP_RET_LOG               0x7ffc0000U /* allow after logging */
>  #define SECCOMP_RET_ALLOW     0x7fff0000U /* allow */
> @@ -59,4 +61,16 @@ struct seccomp_data {
>       __u64 args[6];
>  };
>  
> +struct seccomp_notif {
> +     __u64 id;
> +     pid_t pid;
> +     struct seccomp_data data;
> +};
> +
> +struct seccomp_notif_resp {
> +     __u64 id;
> +     __s32 error;
> +     __s64 val;
> +};
> +
>  #endif /* _UAPI_LINUX_SECCOMP_H */
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index dc77548167ef..f69327d5f7c7 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -31,6 +31,7 @@
>  #endif
>  
>  #ifdef CONFIG_SECCOMP_FILTER
> +#include <linux/file.h>
>  #include <linux/filter.h>
>  #include <linux/pid.h>
>  #include <linux/ptrace.h>
> @@ -38,6 +39,52 @@
>  #include <linux/tracehook.h>
>  #include <linux/uaccess.h>
>  
> +#ifdef CONFIG_SECCOMP_USER_NOTIFICATION
> +#include <linux/anon_inodes.h>
> +
> +enum notify_state {
> +     SECCOMP_NOTIFY_INIT,
> +     SECCOMP_NOTIFY_SENT,
> +     SECCOMP_NOTIFY_REPLIED,
> +};
> +
> +struct seccomp_knotif {
> +     /* The pid whose filter triggered the notification */
> +     pid_t pid;
> +
> +     /*
> +      * The "cookie" for this request; this is unique for this filter.
> +      */
> +     u32 id;

Perhaps
         /* The "cookie" for this request; this is unique for this filter. */


Epic patch review :)

thanks,
Tobin.

Reply via email to