Re: [iovisor-dev][PATCHv5 RFC 1/3] BPF: New helper to obtain namespace data from current task.

neirac Thu, 25 Jul 2019 11:53:38 -0700

Hey Yonghong,
I have changed the getname_kernel interface to take the allocation type.
All existing callers keep GFP_KERNEL as the allocation type; after this
change, getname_kernel can be called with GFP_ATOMIC from eBPF helpers.
If this change goes in, I can complete the bpf_get_current_pidns_info helper.
Let me know your thoughts.


>From dad6b281a744afb88660137a6a5f27057a72d7d5 Mon Sep 17 00:00:00 2001
From: Carlos <cneirabus...@gmail.com>
Date: Mon, 22 Jul 2019 17:50:02 -0400
Subject: [PATCH] [bpf-next 1/1] BPF: getname_kernel specify allocation
 type

Add a parameter to getname_kernel that specifies the allocation type, as
currently every allocation done by getname_kernel can sleep. The reason for
this change is that eBPF cannot call functions that could sleep, and future
work will need to call getname_kernel, which currently can sleep on allocation.
This change preserves the GFP_KERNEL allocation type in all existing code that
calls getname_kernel.
---
 fs/coredump.c      |  2 +-
 fs/exec.c          |  2 +-
 fs/fs_parser.c     |  2 +-
 fs/namei.c         | 23 ++++++++++++++---------
 fs/open.c          |  2 +-
 include/linux/fs.h |  2 +-
 init/main.c        |  2 +-
 kernel/umh.c       |  2 +-
 8 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index e42e17e55bfd..5c43a32f7ba2 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -684,7 +684,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
                         * If it doesn't exist, that's fine. If there's some
                         * other problem, we'll catch it at the filp_open().
                         */
-                       do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
+                       do_unlinkat(AT_FDCWD, getname_kernel(cn.corename, 
GFP_KERNEL));
                }
 
                /*
diff --git a/fs/exec.c b/fs/exec.c
index c71cbfe6826a..3811afb37d9a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -882,7 +882,7 @@ static struct file *do_open_execat(int fd, struct filename 
*name, int flags)
 
 struct file *open_exec(const char *name)
 {
-       struct filename *filename = getname_kernel(name);
+       struct filename *filename = getname_kernel(name, GFP_KERNEL);
        struct file *f = ERR_CAST(filename);
 
        if (!IS_ERR(filename)) {
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 0d388faa25d1..2a8f27cbbf39 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -248,7 +248,7 @@ int fs_lookup_param(struct fs_context *fc,
 
        switch (param->type) {
        case fs_value_is_string:
-               f = getname_kernel(param->string);
+               f = getname_kernel(param->string, GFP_KERNEL);
                if (IS_ERR(f))
                        return PTR_ERR(f);
                put_f = true;
diff --git a/fs/namei.c b/fs/namei.c
index 209c51a5226c..0e0a9710bdfe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -122,6 +122,11 @@
  * PATH_MAX includes the nul terminator --RR.
  */
 
+/* [July 2019 neirac] getname_kernel added allocation type parameter as
+ * currently all allocations on getname_kernel could sleep, but ebpf
+ * needs non blocking allocations.
+ */
+
 #define EMBEDDED_NAME_MAX      (PATH_MAX - offsetof(struct filename, iname))
 
 struct filename *
@@ -210,12 +215,12 @@ getname(const char __user * filename)
 }
 
 struct filename *
-getname_kernel(const char * filename)
+getname_kernel(const char * filename, gfp_t flags)
 {
        struct filename *result;
        int len = strlen(filename) + 1;
 
-       result = __getname();
+       result = kmem_cache_alloc(names_cachep, flags);
        if (unlikely(!result))
                return ERR_PTR(-ENOMEM);
 
@@ -225,7 +230,7 @@ getname_kernel(const char * filename)
                const size_t size = offsetof(struct filename, iname[1]);
                struct filename *tmp;
 
-               tmp = kmalloc(size, GFP_KERNEL);
+               tmp = kmalloc(size, flags);
                if (unlikely(!tmp)) {
                        __putname(result);
                        return ERR_PTR(-ENOMEM);
@@ -2408,7 +2413,7 @@ struct dentry *kern_path_locked(const char *name, struct 
path *path)
        struct qstr last;
        int type;
 
-       filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
+       filename = filename_parentat(AT_FDCWD, getname_kernel(name, 
GFP_KERNEL), 0, path,
                                    &last, &type);
        if (IS_ERR(filename))
                return ERR_CAST(filename);
@@ -2429,7 +2434,7 @@ struct dentry *kern_path_locked(const char *name, struct 
path *path)
 
 int kern_path(const char *name, unsigned int flags, struct path *path)
 {
-       return filename_lookup(AT_FDCWD, getname_kernel(name),
+       return filename_lookup(AT_FDCWD, getname_kernel(name, GFP_KERNEL),
                               flags, path, NULL);
 }
 EXPORT_SYMBOL(kern_path);
@@ -2448,7 +2453,7 @@ int vfs_path_lookup(struct dentry *dentry, struct 
vfsmount *mnt,
 {
        struct path root = {.mnt = mnt, .dentry = dentry};
        /* the first argument of filename_lookup() is ignored with root */
-       return filename_lookup(AT_FDCWD, getname_kernel(name),
+       return filename_lookup(AT_FDCWD, getname_kernel(name, GFP_KERNEL),
                               flags , path, &root);
 }
 EXPORT_SYMBOL(vfs_path_lookup);
@@ -2749,7 +2754,7 @@ int
 kern_path_mountpoint(int dfd, const char *name, struct path *path,
                        unsigned int flags)
 {
-       return filename_mountpoint(dfd, getname_kernel(name), path, flags);
+       return filename_mountpoint(dfd, getname_kernel(name, GFP_KERNEL), path, 
flags);
 }
 EXPORT_SYMBOL(kern_path_mountpoint);
 
@@ -3583,7 +3588,7 @@ struct file *do_file_open_root(struct dentry *dentry, 
struct vfsmount *mnt,
        if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
                return ERR_PTR(-ELOOP);
 
-       filename = getname_kernel(name);
+       filename = getname_kernel(name, GFP_KERNEL);
        if (IS_ERR(filename))
                return ERR_CAST(filename);
 
@@ -3672,7 +3677,7 @@ static struct dentry *filename_create(int dfd, struct 
filename *name,
 struct dentry *kern_path_create(int dfd, const char *pathname,
                                struct path *path, unsigned int lookup_flags)
 {
-       return filename_create(dfd, getname_kernel(pathname),
+       return filename_create(dfd, getname_kernel(pathname, GFP_KERNEL),
                                path, lookup_flags);
 }
 EXPORT_SYMBOL(kern_path_create);
diff --git a/fs/open.c b/fs/open.c
index b5b80469b93d..cc0e23a605f3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1030,7 +1030,7 @@ struct file *file_open_name(struct filename *name, int 
flags, umode_t mode)
  */
 struct file *filp_open(const char *filename, int flags, umode_t mode)
 {
-       struct filename *name = getname_kernel(filename);
+       struct filename *name = getname_kernel(filename, GFP_KERNEL);
        struct file *file = ERR_CAST(name);
        
        if (!IS_ERR(name)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75f2ed289a3f..826ab644f9bf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2532,7 +2532,7 @@ extern int filp_close(struct file *, fl_owner_t id);
 
 extern struct filename *getname_flags(const char __user *, int, int *);
 extern struct filename *getname(const char __user *);
-extern struct filename *getname_kernel(const char *);
+extern struct filename *getname_kernel(const char *, gfp_t flags);
 extern void putname(struct filename *name);
 
 extern int finish_open(struct file *file, struct dentry *dentry,
diff --git a/init/main.c b/init/main.c
index ff5803b0841c..29972a5a0ebc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1046,7 +1046,7 @@ static int run_init_process(const char *init_filename)
 {
        argv_init[0] = init_filename;
        pr_info("Run %s as init process\n", init_filename);
-       return do_execve(getname_kernel(init_filename),
+       return do_execve(getname_kernel(init_filename, GFP_KERNEL),
                (const char __user *const __user *)argv_init,
                (const char __user *const __user *)envp_init);
 }
diff --git a/kernel/umh.c b/kernel/umh.c
index 7f255b5a8845..f99e08fc327c 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -109,7 +109,7 @@ static int call_usermodehelper_exec_async(void *data)
                if (!retval)
                        current->flags |= PF_UMH;
        } else
-               retval = do_execve(getname_kernel(sub_info->path),
+               retval = do_execve(getname_kernel(sub_info->path, GFP_KERNEL),
                                   (const char __user *const __user 
*)sub_info->argv,
                                   (const char __user *const __user 
*)sub_info->envp);
 out:
-- 
2.11.0


On Tue, Jul 23, 2019 at 12:55:37PM -0400, Carlos Antonio Neira Bustos wrote:
> Yonghong,
> 
> I totally agree with this new approach, as it isolates the allocation type per
> caller. That way we should not have any regressions in existing code, and at
> this point the only caller using getname_kernel with GFP_ATOMIC should be the
> eBPF helper. I'll also change the __getname call to use the allocation type
> requested through getname_kernel's flags.
> 
> Thanks again for your help.
> 
> 
> 
> On Tue, Jul 23, 2019 at 08:56:03AM -0700, Y Song wrote:
> > On Tue, Jul 23, 2019 at 7:37 AM Carlos Antonio Neira Bustos
> > <cneirabus...@gmail.com> wrote:
> > >
> > > Hey Yonghong,
> > > I also needed to replace the call to __getname, as allocations from the
> > > slab cache could sleep; that makes getname_kernel the only name cache
> > > consumer that does not block.
> > > What do you think?
> > 
> > Maybe the following alternative approach is better to preserve the
> > existing behavior.
> > Change getname_kernel(filename) to getname_kernel(filename, flags) and
> > the *flags* will
> > be used for kmem_cache_alloc and kmalloc. Existing behavior won't
> > change and bpf helper
> > will call getname_kernel(filename, GFP_ATOMIC). More code changes but
> > cleaner and not
> > unnecessarily overusing kernel atomic memory region.
> > 
> > 
> > >
> > >
> > > From 639233cc05e96a81054b049d16d1b9193ae667e9 Mon Sep 17 00:00:00 2001
> > > From: Carlos <cneirabus...@gmail.com>
> > > Date: Mon, 22 Jul 2019 17:50:02 -0400
> > > Subject: [PATCH] [bpf-next 1/1] BPF: getname_kernel don't block on
> > >  allocation.
> > >
> > > > This patch calls kmem_cache_alloc directly to specify GFP_ATOMIC as the
> > > > allocation type. The reason is that eBPF currently cannot call functions
> > > > that could sleep, and future work will need to call kern_path, which
> > > > currently could sleep on allocation.
> > > ---
> > >  fs/namei.c | 8 ++++++--
> > >  1 file changed, 6 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/fs/namei.c b/fs/namei.c
> > > index 209c51a5226c..a4269ca52503 100644
> > > --- a/fs/namei.c
> > > +++ b/fs/namei.c
> > > @@ -122,6 +122,10 @@
> > >   * PATH_MAX includes the nul terminator --RR.
> > >   */
> > >
> > > +/* [July 2019 neirac] getname_kernel changed allocation type to 
> > > GFP_ATOMIC,
> > > + * as ebpf needs to use kern_path.
> > > + */
> > > +
> > >  #define EMBEDDED_NAME_MAX      (PATH_MAX - offsetof(struct filename, 
> > > iname))
> > >
> > >  struct filename *
> > > @@ -215,7 +219,7 @@ getname_kernel(const char * filename)
> > >         struct filename *result;
> > >         int len = strlen(filename) + 1;
> > >
> > > -       result = __getname();
> > > +       result = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> > >         if (unlikely(!result))
> > >                 return ERR_PTR(-ENOMEM);
> > >
> > > @@ -225,7 +229,7 @@ getname_kernel(const char * filename)
> > >                 const size_t size = offsetof(struct filename, iname[1]);
> > >                 struct filename *tmp;
> > >
> > > -               tmp = kmalloc(size, GFP_KERNEL);
> > > +               tmp = kmalloc(size, GFP_ATOMIC);
> > >                 if (unlikely(!tmp)) {
> > >                         __putname(result);
> > >                         return ERR_PTR(-ENOMEM);
> > > --
> > > 2.11.0
> > >
> > >
> > >
> > > On Fri, Jul 19, 2019 at 09:06:54AM -0700, Y Song wrote:
> > > > On Fri, Jul 19, 2019 at 6:02 AM cnb <cneirabus...@gmail.com> wrote:
> > > > >
> > > > > > kern_path is the one. Should I move it and its dependencies to a new
> > > > > > file called bpf_namei.c, where I'll change it to use GFP_ATOMIC, or
> > > > > > create a new function to replace kern_path? What do you think?
> > > >
> > > > I think adding GFP_ATOMIC to function getname_kernel() is reasonable.
> > > >
> > > >                 const size_t size = offsetof(struct filename, iname[1]);
> > > >                 struct filename *tmp;
> > > >
> > > >                 tmp = kmalloc(size, GFP_KERNEL);
> > > >
> > > > /* fs/open.c */
> > > > struct audit_names;
> > > > struct filename {
> > > >         const char              *name;  /* pointer to actual string */
> > > >         const __user char       *uptr;  /* original userland pointer */
> > > >         int                     refcnt;
> > > >         struct audit_names      *aname;
> > > >         const char              iname[];
> > > > };
> > > >
> > > > The size is pretty small. on x64, it should be 25 bytes. If the system
> > > > cannot honor
> > > > this 25 bytes in non-blocking mode, it is probably already in stress,
> > > > bpf subsystem itself may not work reliably as it uses some GFP_ATOMIC
> > > > as well for some map (e.g., update) helpers.
> > > >
> > > > So let us have this patch (adding GFP_ATOMIC) in the first series. We 
> > > > can
> > > > think of alternatives (e.g., separate functions)  if anybody objects.
> > > >
> > > > >
> > > > > El jue., 18 de jul. de 2019 22:10, Y Song <ys114...@gmail.com> 
> > > > > escribió:
> > > > >>
> > > > >> On Thu, Jul 18, 2019 at 6:21 PM carlos antonio neira bustos
> > > > >> <cneirabus...@gmail.com> wrote:
> > > > >> >
> > > > >> > Hi,
> > > > >> > Yes, I'm still interested as I need this capability at $WORK, but 
> > > > >> > haven't had the time to re-write the dentry functions we need, as 
> > > > >> > currently they could sleep.
> > > > >> > I could resume work on this next monday when I get back from 
> > > > >> > vacations.
> > > > >> > I really appreciate your help and guidance on this.
> > > > >>
> > > > >> Great. Which dentry function are you referring to? dput? kern_path
> > > > >> cannot be used as it uses
> > > > >> kmalloc without GFP_ATOMIC. See 
> > > > >> https://github.com/iovisor/bcc/issues/1329.
> > > > >>
> > > > >> >
> > > > >> > Bests
> > > > >> >
> > > > >> >
> > > > >> > El jue., 18 de jul. de 2019 20:29, Y Song <ys114...@gmail.com> 
> > > > >> > escribió:
> > > > >> >>
> > > > >> >> Hi, Carlos,
> > > > >> >>
> > > > > >> >> Are you still interested in upstreaming this patch? It looks like
> > > > > >> >> there is still a desire to make bcc work inside containers.
> > > > >> >>
> > > > >> >> The bpf-next will open next week. If you would like, could you 
> > > > >> >> submit again?
> > > > >> >> I will review the patch on bpf-next as well to make sure we made
> > > > >> >> forward progress.
> > > > >> >>
> > > > >> >> Please let me know. Thanks!
> > > > >> >>
> > > > >> >> Yonghong
> > > > >> >>
> > > > >> >> On Tue, Apr 16, 2019 at 6:31 PM Carlos Antonio Neira Bustos
> > > > >> >> <cneirabus...@gmail.com> wrote:
> > > > >> >> >
> > > > >> >> > As a bpf program cannot sleep, I needed to add a spinlock to 
> > > > >> >> > kern_path, as
> > > > >> >> > it calls getname_kernel() which may sleep.
> > > > >> >> > The inode is accessed directly, as we are just interested in 
> > > > >> >> > the inode's s_dev.
> > > > >> >> > Let me know if this approach is the correct one.
> > > > >> >> > -------------------------------------------------------------------------------
> > > > >> >> > From 35b7bfcbf6524ec807f107302209a0fb07614cc8 Mon Sep 17 
> > > > >> >> > 00:00:00 2001
> > > > >> >> > From: Carlos <cneirabus...@gmail.com>
> > > > >> >> > Date: Tue, 16 Apr 2019 17:10:46 -0400
> > > > >> >> > Subject: [PATCH] [PATCH bpf-next 1/3] BPF: New helper to obtain 
> > > > >> >> > namespace data
> > > > >> >> >   from current task
> > > > >> >> >
> > > > >> >> > This helper obtains the active namespace from current and 
> > > > >> >> > returns pid, tgid,
> > > > >> >> > device and namespace id as seen from that namespace, allowing 
> > > > >> >> > to instrument
> > > > >> >> > a process inside a container.
> > > > >> >> > Device is read from /proc/self/ns/pid, as in the future it's 
> > > > >> >> > possible that
> > > > >> >> > different pid_ns files may belong to different devices, 
> > > > >> >> > according
> > > > >> >> > to the discussion between Eric Biederman and Yonghong in 2017 
> > > > >> >> > linux plumbers
> > > > >> >> > conference.
> > > > >> >> > Currently bpf_get_current_pid_tgid(), is used to do pid 
> > > > >> >> > filtering in bcc's
> > > > >> >> > scripts but this helper returns the pid as seen by the root 
> > > > >> >> > namespace which is
> > > > >> >> > fine when a bcc script is not executed inside a container.
> > > > >> >> > When the process of interest is inside a container, pid 
> > > > >> >> > filtering will not work
> > > > >> >> > if bpf_get_current_pid_tgid() is used. This helper addresses 
> > > > >> >> > this limitation
> > > > >> >> > returning the pid as it's seen by the current namespace where 
> > > > >> >> > the script is
> > > > >> >> > executing.
> > > > >> >> >
> > > > >> >> > This helper has the same use cases as 
> > > > >> >> > bpf_get_current_pid_tgid() as it can be
> > > > >> >> > used to do pid filtering even inside a container.
> > > > >> >> >
> > > > >> >> > For example a bcc script using bpf_get_current_pid_tgid() 
> > > > >> >> > (tools/funccount.py):
> > > > >> >> >
> > > > >> >> >         u32 pid = bpf_get_current_pid_tgid() >> 32;
> > > > >> >> >         if (pid != <pid_arg_passed_in>)
> > > > >> >> >                 return 0;
> > > > >> >> > Could be modified to use bpf_get_current_pidns_info() as 
> > > > >> >> > follows:
> > > > >> >> >
> > > > >> >> >         struct bpf_pidns pidns;
> > > > >> >> >         bpf_get_current_pidns_info(&pidns, sizeof(struct 
> > > > >> >> > bpf_pidns));
> > > > >> >> >         u32 pid = pidns.tgid;
> > > > >> >> >         u32 nsid = pidns.nsid;
> > > > >> >> >         if ((pid != <pid_arg_passed_in>) && (nsid != 
> > > > >> >> > <nsid_arg_passed_in>))
> > > > >> >> >                 return 0;
> > > > >> >> >
> > > > > >> >> > To find out the PID namespace id of a process, you could
> > > > > >> >> > use this command:
> > > > >> >> >
> > > > >> >> > $ ps -h -o pidns -p <pid_of_interest>
> > > > >> >> >
> > > > >> >> > Or this other command:
> > > > >> >> >
> > > > >> >> > $ ls -Li /proc/<pid_of_interest>/ns/pid
> > > > >> >> >
> > > > >> >> > Signed-off-by: Carlos Antonio Neira Bustos 
> > > > >> >> > <cneirabus...@gmail.com>
> > > > >> >> > ---
> > > > >> >> >  include/linux/bpf.h      |  1 +
> > > > >> >> >  include/uapi/linux/bpf.h | 25 ++++++++++++++++++-
> > > > >> >> >  kernel/bpf/core.c        |  1 +
> > > > >> >> >  kernel/bpf/helpers.c     | 65 
> > > > >> >> > ++++++++++++++++++++++++++++++++++++++++++++++++
> > > > >> >> >  kernel/trace/bpf_trace.c |  2 ++
> > > > >> >> >  5 files changed, 93 insertions(+), 1 deletion(-)
> > > > >> >> >
> > > > >> >> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > >> >> > index e4d4c1771ab0..4393f8f088cc 100644
> > > > >> >> > --- a/include/linux/bpf.h
> > > > >> >> > +++ b/include/linux/bpf.h
> > > > >> >> > @@ -987,6 +987,7 @@ extern const struct bpf_func_proto 
> > > > >> >> > bpf_sk_redirect_map_proto;
> > > > >> >> >  extern const struct bpf_func_proto bpf_spin_lock_proto;
> > > > >> >> >  extern const struct bpf_func_proto bpf_spin_unlock_proto;
> > > > >> >> >  extern const struct bpf_func_proto bpf_get_local_storage_proto;
> > > > >> >> > +extern const struct bpf_func_proto 
> > > > >> >> > bpf_get_current_pidns_info_proto;
> > > > >> >> >
> > > > >> >> >  /* Shared helpers among cBPF and eBPF. */
> > > > >> >> >  void bpf_user_rnd_init_once(void);
> > > > >> >> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > >> >> > index 31a27dd337dc..7bf457875c31 100644
> > > > >> >> > --- a/include/uapi/linux/bpf.h
> > > > >> >> > +++ b/include/uapi/linux/bpf.h
> > > > >> >> > @@ -2500,6 +2500,18 @@ union bpf_attr {
> > > > >> >> >   *     Return
> > > > >> >> >   *             0 if iph and th are a valid SYN cookie ACK, or 
> > > > >> >> > a negative error
> > > > >> >> >   *             otherwise.
> > > > >> >> > + *
> > > > >> >> > + * int bpf_get_current_pidns_info(struct bpf_pidns_info 
> > > > >> >> > *pidns, u32 size_of_pidns)
> > > > >> >> > + *     Description
> > > > >> >> > + *             Copies into *pidns* pid, namespace id and tgid 
> > > > >> >> > as seen by the
> > > > >> >> > + *             current namespace and also device from 
> > > > >> >> > /proc/self/ns/pid.
> > > > >> >> > + *             *size_of_pidns* must be the size of *pidns*
> > > > >> >> > + *
> > > > >> >> > + *             This helper is used when pid filtering is 
> > > > >> >> > needed inside a
> > > > >> >> > + *             container as bpf_get_current_tgid() helper 
> > > > >> >> > returns always the
> > > > >> >> > + *             pid id as seen by the root namespace.
> > > > >> >> > + *     Return
> > > > >> >> > + *             0 on success -EINVAL on error.
> > > > >> >> >   */
> > > > >> >> >  #define __BPF_FUNC_MAPPER(FN)          \
> > > > >> >> >         FN(unspec),                     \
> > > > >> >> > @@ -2602,7 +2614,8 @@ union bpf_attr {
> > > > >> >> >         FN(skb_ecn_set_ce),             \
> > > > >> >> >         FN(get_listener_sock),          \
> > > > >> >> >         FN(skc_lookup_tcp),             \
> > > > >> >> > -       FN(tcp_check_syncookie),
> > > > >> >> > +       FN(tcp_check_syncookie),        \
> > > > >> >> > +       FN(get_current_pidns_info),
> > > > >> >> >
> > > > >> >> >  /* integer value in 'imm' field of BPF_CALL instruction 
> > > > >> >> > selects which helper
> > > > >> >> >   * function eBPF program intends to call
> > > > >> >> > @@ -3298,4 +3311,14 @@ struct bpf_line_info {
> > > > >> >> >  struct bpf_spin_lock {
> > > > >> >> >         __u32   val;
> > > > >> >> >  };
> > > > >> >> > +
> > > > >> >> > +/* helper bpf_get_current_pidns_info will store the following
> > > > >> >> > + * data, dev will contain major/minor from /proc/self/pid.
> > > > >> >> > +*/
> > > > >> >> > +struct bpf_pidns_info {
> > > > >> >> > +       __u32 dev;
> > > > >> >> > +       __u32 nsid;
> > > > >> >> > +       __u32 tgid;
> > > > >> >> > +       __u32 pid;
> > > > >> >> > +};
> > > > >> >> >  #endif /* _UAPI__LINUX_BPF_H__ */
> > > > >> >> > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> > > > >> >> > index ace8c22c8b0e..ecbdc72ba459 100644
> > > > >> >> > --- a/kernel/bpf/core.c
> > > > >> >> > +++ b/kernel/bpf/core.c
> > > > >> >> > @@ -2046,6 +2046,7 @@ const struct bpf_func_proto 
> > > > >> >> > bpf_get_current_uid_gid_proto __weak;
> > > > >> >> >  const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> > > > >> >> >  const struct bpf_func_proto bpf_get_current_cgroup_id_proto 
> > > > >> >> > __weak;
> > > > >> >> >  const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> > > > >> >> > +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
> > > > >> >> >
> > > > >> >> >  const struct bpf_func_proto * __weak 
> > > > >> >> > bpf_get_trace_printk_proto(void)
> > > > >> >> >  {
> > > > >> >> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > > > >> >> > index a411fc17d265..2de82d14424d 100644
> > > > >> >> > --- a/kernel/bpf/helpers.c
> > > > >> >> > +++ b/kernel/bpf/helpers.c
> > > > >> >> > @@ -18,6 +18,11 @@
> > > > >> >> >  #include <linux/sched.h>
> > > > >> >> >  #include <linux/uidgid.h>
> > > > >> >> >  #include <linux/filter.h>
> > > > >> >> > +#include <linux/pid_namespace.h>
> > > > >> >> > +#include <linux/major.h>
> > > > >> >> > +#include <linux/stat.h>
> > > > >> >> > +#include <linux/namei.h>
> > > > >> >> > +#include <linux/version.h>
> > > > >> >> >
> > > > >> >> >  /* If kernel subsystem is allowing eBPF programs to call this 
> > > > >> >> > function,
> > > > >> >> >   * inside its own verifier_ops->get_func_proto() callback it 
> > > > >> >> > should return
> > > > >> >> > @@ -317,6 +322,66 @@ void copy_map_value_locked(struct bpf_map 
> > > > >> >> > *map, void *dst, void *src,
> > > > >> >> >         preempt_enable();
> > > > >> >> >  }
> > > > >> >> >
> > > > >> >> > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info 
> > > > >> >> > *, pidns_info, u32,
> > > > >> >> > +        size)
> > > > >> >> > +{
> > > > >> >> > +       const char *pidnspath = "/proc/self/ns/pid";
> > > > >> >> > +       struct pid_namespace *pidns = NULL;
> > > > >> >> > +       DEFINE_SPINLOCK(bpf_spinlock);
> > > > >> >> > +       struct inode *inode;
> > > > >> >> > +       struct kstat ks;
> > > > >> >> > +       struct path kp;
> > > > >> >> > +       pid_t tgid = 0;
> > > > >> >> > +       pid_t pid = 0;
> > > > >> >> > +       int ret;
> > > > >> >> > +
> > > > >> >> > +       if (unlikely(size != sizeof(struct bpf_pidns_info)))
> > > > >> >> > +               goto clear;
> > > > >> >> > +
> > > > >> >> > +       pidns = task_active_pid_ns(current);
> > > > >> >> > +
> > > > >> >> > +       if (unlikely(!pidns))
> > > > >> >> > +               goto clear;
> > > > >> >> > +
> > > > >> >> > +       pidns_info->nsid =  pidns->ns.inum;
> > > > >> >> > +       pid = task_pid_nr_ns(current, pidns);
> > > > >> >> > +
> > > > >> >> > +       if (unlikely(!pid))
> > > > >> >> > +               goto clear;
> > > > >> >> > +
> > > > >> >> > +       tgid = task_tgid_nr_ns(current, pidns);
> > > > >> >> > +
> > > > >> >> > +       if (unlikely(!tgid))
> > > > >> >> > +               goto clear;
> > > > >> >> > +
> > > > >> >> > +       pidns_info->tgid = (u32) tgid;
> > > > >> >> > +       pidns_info->pid = (u32) pid;
> > > > >> >> > +
> > > > >> >> > +       spin_lock(&bpf_spinlock);
> > > > >> >> > +        ret = kern_path(pidnspath, 0, &kp);
> > > > >> >> > +       if (ret)
> > > > >> >> > +               goto clear;
> > > > >> >> > +       inode = d_backing_inode(kp.dentry);
> > > > >> >> > +       pidns_info->dev = inode->i_sb->s_dev;
> > > > >> >> > +       spin_unlock(&bpf_spinlock);
> > > > >> >> > +
> > > > >> >> > +       return 0;
> > > > >> >> > +
> > > > >> >> > +       clear:
> > > > >> >> > +       if (pidns_info)
> > > > >> >> > +               memset((void *)pidns, 0, (size_t) size);
> > > > >> >> > +
> > > > >> >> > +       return -EINVAL;
> > > > >> >> > +}
> > > > >> >> > +
> > > > >> >> > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = 
> > > > >> >> > {
> > > > >> >> > +       .func   = bpf_get_current_pidns_info,
> > > > >> >> > +       .gpl_only       = false,
> > > > >> >> > +       .ret_type       = RET_INTEGER,
> > > > >> >> > +       .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> > > > >> >> > +       .arg2_type      = ARG_CONST_SIZE,
> > > > >> >> > +};
> > > > >> >> > +
> > > > >> >> >  #ifdef CONFIG_CGROUPS
> > > > >> >> >  BPF_CALL_0(bpf_get_current_cgroup_id)
> > > > >> >> >  {
> > > > >> >> > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > > > >> >> > index d64c00afceb5..2ef0de78d4ec 100644
> > > > >> >> > --- a/kernel/trace/bpf_trace.c
> > > > >> >> > +++ b/kernel/trace/bpf_trace.c
> > > > >> >> > @@ -603,6 +603,8 @@ tracing_func_proto(enum bpf_func_id 
> > > > >> >> > func_id, const struct bpf_prog *prog)
> > > > >> >> >         case BPF_FUNC_get_current_cgroup_id:
> > > > >> >> >                 return &bpf_get_current_cgroup_id_proto;
> > > > >> >> >  #endif
> > > > >> >> > +       case BPF_FUNC_get_current_pidns_info:
> > > > >> >> > +               return &bpf_get_current_pidns_info_proto;
> > > > >> >> >         default:
> > > > >> >> >                 return NULL;
> > > > >> >> >         }
> > > > >> >> > --
> > > > >> >> > 2.11.0
> > > > >> >> >
> > > > >
> > > > > 

-=-=-=-=-=-=-=-=-=-=-=-
Links: You receive all messages sent to this group.

View/Reply Online (#1750): https://lists.iovisor.org/g/iovisor-dev/message/1750
Mute This Topic: https://lists.iovisor.org/mt/31207586/21656
Group Owner: iovisor-dev+ow...@lists.iovisor.org
Unsubscribe: https://lists.iovisor.org/g/iovisor-dev/unsub  
[arch...@mail-archive.com]
-=-=-=-=-=-=-=-=-=-=-=-

Re: [iovisor-dev][PATCHv5 RFC 1/3] BPF: New helper to obtain namespace data from current task.

Reply via email to