Re: [PATCH bpf-next v4 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

Martin KaFai Lau Wed, 23 May 2018 22:09:07 -0700

On Wed, May 23, 2018 at 05:18:42PM -0700, Yonghong Song wrote:
> Currently, suppose a userspace application has loaded a bpf program
> and attached it to a tracepoint/kprobe/uprobe, and a bpf
> introspection tool, e.g., bpftool, wants to show which bpf program
> is attached to which tracepoint/kprobe/uprobe. Such attachment
> information will be really useful to understand the overall bpf
> deployment in the system.
> 
> There is a name field (16 bytes) for each program, which could
> be used to encode the attachment point. There are some drawbacks
> to this approach. First, bpftool user (e.g., an admin) may not
> really understand the association between the name and the
> attachment point. Second, if one program is attached to multiple
> places, encoding a proper name which can imply all these
> attachments becomes difficult.
> 
> This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
> Given a pid and fd, if the <pid, fd> is associated with a
> tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
>    . prog_id
>    . tracepoint name, or
>    . k[ret]probe funcname + offset or kernel addr, or
>    . u[ret]probe filename + offset
> to the userspace.
> The user can use "bpftool prog" to find more information about
> bpf program itself with prog_id.
> 
> Signed-off-by: Yonghong Song <y...@fb.com>
> ---
>  include/linux/trace_events.h |  17 +++++++
>  include/uapi/linux/bpf.h     |  26 ++++++++++
>  kernel/bpf/syscall.c         | 115 +++++++++++++++++++++++++++++++++++++++++++
>  kernel/trace/bpf_trace.c     |  48 ++++++++++++++++++
>  kernel/trace/trace_kprobe.c  |  29 +++++++++++
>  kernel/trace/trace_uprobe.c  |  22 +++++++++
>  6 files changed, 257 insertions(+)
> 
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 2bde3ef..d34144a 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, 
> void __user *info);
>  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog 
> *prog);
>  struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> +                         u32 *fd_type, const char **buf,
> +                         u64 *probe_offset, u64 *probe_addr);
>  #else
>  static inline unsigned int trace_call_bpf(struct trace_event_call *call, 
> void *ctx)
>  {
> @@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map 
> *bpf_find_raw_tracepoint(const char *name
>  {
>       return NULL;
>  }
> +static inline int bpf_get_perf_event_info(const struct perf_event *event,
> +                                       u32 *prog_id, u32 *fd_type,
> +                                       const char **buf, u64 *probe_offset,
> +                                       u64 *probe_addr)
> +{
> +     return -EOPNOTSUPP;
> +}
>  #endif
>  
>  enum {
> @@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, 
> int flags);
>  #ifdef CONFIG_KPROBE_EVENTS
>  extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
>  extern void perf_kprobe_destroy(struct perf_event *event);
> +extern int bpf_get_kprobe_info(const struct perf_event *event,
> +                            u32 *fd_type, const char **symbol,
> +                            u64 *probe_offset, u64 *probe_addr,
> +                            bool perf_type_tracepoint);
>  #endif
>  #ifdef CONFIG_UPROBE_EVENTS
>  extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
>  extern void perf_uprobe_destroy(struct perf_event *event);
> +extern int bpf_get_uprobe_info(const struct perf_event *event,
> +                            u32 *fd_type, const char **filename,
> +                            u64 *probe_offset, bool perf_type_tracepoint);
>  #endif
>  extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
>                                    char *filter_str);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c3e502d..0d51946 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -97,6 +97,7 @@ enum bpf_cmd {
>       BPF_RAW_TRACEPOINT_OPEN,
>       BPF_BTF_LOAD,
>       BPF_BTF_GET_FD_BY_ID,
> +     BPF_TASK_FD_QUERY,
>  };
>  
>  enum bpf_map_type {
> @@ -379,6 +380,22 @@ union bpf_attr {
>               __u32           btf_log_size;
>               __u32           btf_log_level;
>       };
> +
> +     struct {
> +             __u32           pid;            /* input: pid */
> +             __u32           fd;             /* input: fd */
> +             __u32           flags;          /* input: flags */
> +             __u32           buf_len;        /* input/output: buf len */
> +             __aligned_u64   buf;            /* input/output:
> +                                              *   tp_name for tracepoint
> +                                              *   symbol for kprobe
> +                                              *   filename for uprobe
> +                                              */
>               __u32           prog_id;        /* output: prog_id */
> +             __u32           fd_type;        /* output: BPF_FD_TYPE_* */
> +             __u64           probe_offset;   /* output: probe_offset */
> +             __u64           probe_addr;     /* output: probe_addr */
> +     } task_fd_query;
>  } __attribute__((aligned(8)));
>  
>  /* The description below is an attempt at providing documentation to eBPF
> @@ -2458,4 +2475,13 @@ struct bpf_fib_lookup {
>       __u8    dmac[6];     /* ETH_ALEN */
>  };
>  
> +enum bpf_task_fd_type {
> +     BPF_FD_TYPE_RAW_TRACEPOINT,     /* tp name */
> +     BPF_FD_TYPE_TRACEPOINT,         /* tp name */
> +     BPF_FD_TYPE_KPROBE,             /* (symbol + offset) or addr */
> +     BPF_FD_TYPE_KRETPROBE,          /* (symbol + offset) or addr */
> +     BPF_FD_TYPE_UPROBE,             /* filename + offset */
> +     BPF_FD_TYPE_URETPROBE,          /* filename + offset */
> +};
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0b4c945..7dd8c86 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -18,7 +18,9 @@
>  #include <linux/vmalloc.h>
>  #include <linux/mmzone.h>
>  #include <linux/anon_inodes.h>
> +#include <linux/fdtable.h>
>  #include <linux/file.h>
> +#include <linux/fs.h>
>  #include <linux/license.h>
>  #include <linux/filter.h>
>  #include <linux/version.h>
> @@ -2102,6 +2104,116 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr 
> *attr)
>       return btf_get_fd_by_id(attr->btf_id);
>  }
>  
> +static int bpf_task_fd_query_copy(const union bpf_attr *attr,
> +                                 union bpf_attr __user *uattr,
> +                                 u32 prog_id, u32 fd_type,
> +                                 const char *buf, u64 probe_offset,
> +                                 u64 probe_addr)
> +{
> +     void __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
> +     u32 len = buf ? strlen(buf) + 1 : 0, input_len;
> +     int err = 0;
> +
> +     if (put_user(len, &uattr->task_fd_query.buf_len))
> +             return -EFAULT;
> +     input_len = attr->task_fd_query.buf_len;
> +     if (input_len && len && ubuf) {
When len is 0 and input_len > 0, ubuf will not be touched (and
so will not be null-terminated).


It may be helpful to note in uapi bpf.h that userspace has to check for
an output buf_len of 0 in addition to checking the syscall return value.
Otherwise, it is reasonable for userspace to assume that ubuf can be used
directly with strlen()/printf()... as long as the syscall does not return
-1/ENOSPC.  I think the comment change could be done in a follow-up patch.

Alternatively:

Always null-terminate ubuf as long as input_len > 0, and make
output_buf_len strlen(buf) instead of strlen(buf) + 1 (i.e. exclude
the null char from output_buf_len), so that the !buf case will have
output_buf_len == 0.
The user can then depend on ENOSPC or input_buf_len <= output_buf_len
to detect truncation.  This convention would be closer to how
snprintf() behaves.

Other than that,

Acked-by: Martin KaFai Lau <ka...@fb.com>

> +             if (input_len < len) {
> +                     err = -ENOSPC;
> +                     len = input_len;
> +             }
> +             if (copy_to_user(ubuf, buf, len))
> +                     return -EFAULT;
> +     }
> +
> +     if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
> +         put_user(fd_type, &uattr->task_fd_query.fd_type) ||
> +         put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
> +         put_user(probe_addr, &uattr->task_fd_query.probe_addr))
> +             return -EFAULT;
> +
> +     return err;
> +}
> +
> +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
> +
> +static int bpf_task_fd_query(const union bpf_attr *attr,
> +                          union bpf_attr __user *uattr)
> +{
> +     pid_t pid = attr->task_fd_query.pid;
> +     u32 fd = attr->task_fd_query.fd;
> +     const struct perf_event *event;
> +     struct files_struct *files;
> +     struct task_struct *task;
> +     struct file *file;
> +     int err;
> +
> +     if (CHECK_ATTR(BPF_TASK_FD_QUERY))
> +             return -EINVAL;
> +
> +     if (!capable(CAP_SYS_ADMIN))
> +             return -EPERM;
> +
> +     if (attr->task_fd_query.flags != 0)
> +             return -EINVAL;
> +
> +     task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
> +     if (!task)
> +             return -ENOENT;
> +
> +     files = get_files_struct(task);
> +     put_task_struct(task);
> +     if (!files)
> +             return -ENOENT;
> +
> +     err = 0;
> +     spin_lock(&files->file_lock);
> +     file = fcheck_files(files, fd);
> +     if (!file)
> +             err = -EBADF;
> +     else
> +             get_file(file);
> +     spin_unlock(&files->file_lock);
> +     put_files_struct(files);
> +
> +     if (err)
> +             goto out;
> +
> +     if (file->f_op == &bpf_raw_tp_fops) {
> +             struct bpf_raw_tracepoint *raw_tp = file->private_data;
> +             struct bpf_raw_event_map *btp = raw_tp->btp;
> +
> +             err = bpf_task_fd_query_copy(attr, uattr,
> +                                          raw_tp->prog->aux->id,
> +                                          BPF_FD_TYPE_RAW_TRACEPOINT,
> +                                          btp->tp->name, 0, 0);
> +             goto put_file;
> +     }
> +
> +     event = perf_get_event(file);
> +     if (!IS_ERR(event)) {
> +             u64 probe_offset, probe_addr;
> +             u32 prog_id, fd_type;
> +             const char *buf;
> +
> +             err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
> +                                           &buf, &probe_offset,
> +                                           &probe_addr);
> +             if (!err)
> +                     err = bpf_task_fd_query_copy(attr, uattr, prog_id,
> +                                                  fd_type, buf,
> +                                                  probe_offset,
> +                                                  probe_addr);
> +             goto put_file;
> +     }
> +
> +     err = -ENOTSUPP;
> +put_file:
> +     fput(file);
> +out:
> +     return err;
> +}
> +
>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, 
> size)
>  {
>       union bpf_attr attr = {};
> @@ -2188,6 +2300,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, 
> uattr, unsigned int, siz
>       case BPF_BTF_GET_FD_BY_ID:
>               err = bpf_btf_get_fd_by_id(&attr);
>               break;
> +     case BPF_TASK_FD_QUERY:
> +             err = bpf_task_fd_query(&attr, uattr);
> +             break;
>       default:
>               err = -EINVAL;
>               break;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ce2cbbf..81fdf2f 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -14,6 +14,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/ctype.h>
>  #include <linux/kprobes.h>
> +#include <linux/syscalls.h>
>  #include <linux/error-injection.h>
>  
>  #include "trace_probe.h"
> @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map 
> *btp, struct bpf_prog *prog)
>       mutex_unlock(&bpf_event_mutex);
>       return err;
>  }
> +
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> +                         u32 *fd_type, const char **buf,
> +                         u64 *probe_offset, u64 *probe_addr)
> +{
> +     bool is_tracepoint, is_syscall_tp;
> +     struct bpf_prog *prog;
> +     int flags, err = 0;
> +
> +     prog = event->prog;
> +     if (!prog)
> +             return -ENOENT;
> +
> +     /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
> +     if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
> +             return -EOPNOTSUPP;
> +
> +     *prog_id = prog->aux->id;
> +     flags = event->tp_event->flags;
> +     is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
> +     is_syscall_tp = is_syscall_trace_event(event->tp_event);
> +
> +     if (is_tracepoint || is_syscall_tp) {
> +             *buf = is_tracepoint ? event->tp_event->tp->name
> +                                  : event->tp_event->name;
> +             *fd_type = BPF_FD_TYPE_TRACEPOINT;
> +             *probe_offset = 0x0;
> +             *probe_addr = 0x0;
> +     } else {
> +             /* kprobe/uprobe */
> +             err = -EOPNOTSUPP;
> +#ifdef CONFIG_KPROBE_EVENTS
> +             if (flags & TRACE_EVENT_FL_KPROBE)
> +                     err = bpf_get_kprobe_info(event, fd_type, buf,
> +                                               probe_offset, probe_addr,
> +                                               event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> +#ifdef CONFIG_UPROBE_EVENTS
> +             if (flags & TRACE_EVENT_FL_UPROBE)
> +                     err = bpf_get_uprobe_info(event, fd_type, buf,
> +                                               probe_offset,
> +                                               event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> +     }
> +
> +     return err;
> +}
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 02aed76..daa8157 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct 
> kretprobe_instance *ri,
>                             head, NULL);
>  }
>  NOKPROBE_SYMBOL(kretprobe_perf_func);
> +
> +int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
> +                     const char **symbol, u64 *probe_offset,
> +                     u64 *probe_addr, bool perf_type_tracepoint)
> +{
> +     const char *pevent = trace_event_name(event->tp_event);
> +     const char *group = event->tp_event->class->system;
> +     struct trace_kprobe *tk;
> +
> +     if (perf_type_tracepoint)
> +             tk = find_trace_kprobe(pevent, group);
> +     else
> +             tk = event->tp_event->data;
> +     if (!tk)
> +             return -EINVAL;
> +
> +     *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
> +                                           : BPF_FD_TYPE_KPROBE;
> +     if (tk->symbol) {
> +             *symbol = tk->symbol;
> +             *probe_offset = tk->rp.kp.offset;
> +             *probe_addr = 0;
> +     } else {
> +             *symbol = NULL;
> +             *probe_offset = 0;
> +             *probe_addr = (unsigned long)tk->rp.kp.addr;
> +     }
> +     return 0;
> +}
>  #endif       /* CONFIG_PERF_EVENTS */
>  
>  /*
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index ac89287..bf89a51 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe 
> *tu, unsigned long func,
>  {
>       __uprobe_perf_func(tu, func, regs, ucb, dsize);
>  }
> +
> +int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
> +                     const char **filename, u64 *probe_offset,
> +                     bool perf_type_tracepoint)
> +{
> +     const char *pevent = trace_event_name(event->tp_event);
> +     const char *group = event->tp_event->class->system;
> +     struct trace_uprobe *tu;
> +
> +     if (perf_type_tracepoint)
> +             tu = find_probe_event(pevent, group);
> +     else
> +             tu = event->tp_event->data;
> +     if (!tu)
> +             return -EINVAL;
> +
> +     *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
> +                                 : BPF_FD_TYPE_UPROBE;
> +     *filename = tu->filename;
> +     *probe_offset = tu->offset;
> +     return 0;
> +}
>  #endif       /* CONFIG_PERF_EVENTS */
>  
>  static int
> -- 
> 2.9.5
>

Re: [PATCH bpf-next v4 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

Reply via email to