On 5/23/18 10:13 AM, Martin KaFai Lau wrote:
On Tue, May 22, 2018 at 09:30:46AM -0700, Yonghong Song wrote:Currently, suppose a userspace application has loaded a bpf program and attached it to a tracepoint/kprobe/uprobe, and a bpf introspection tool, e.g., bpftool, wants to show which bpf program is attached to which tracepoint/kprobe/uprobe. Such attachment information will be really useful to understand the overall bpf deployment in the system. There is a name field (16 bytes) for each program, which could be used to encode the attachment point. There are some drawbacks for this approaches. First, bpftool user (e.g., an admin) may not really understand the association between the name and the attachment point. Second, if one program is attached to multiple places, encoding a proper name which can imply all these attachments becomes difficult. This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY. Given a pid and fd, if the <pid, fd> is associated with a tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return . prog_id . tracepoint name, or . k[ret]probe funcname + offset or kernel addr, or . u[ret]probe filename + offset to the userspace. The user can use "bpftool prog" to find more information about bpf program itself with prog_id.LGTM, some comments inline.Signed-off-by: Yonghong Song <y...@fb.com> --- include/linux/trace_events.h | 16 ++++++ include/uapi/linux/bpf.h | 27 ++++++++++ kernel/bpf/syscall.c | 124 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 48 +++++++++++++++++ kernel/trace/trace_kprobe.c | 29 ++++++++++ kernel/trace/trace_uprobe.c | 22 ++++++++ 6 files changed, 266 insertions(+) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2bde3ef..eab806d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *attach_info, const char **buf, + u64 *probe_offset, u64 *probe_addr);The first arg is 'const struct perf_event *event' while...#else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name { return NULL; } +static inline int bpf_get_perf_event_info(const struct file *file, u32 *prog_id,this one has 'const struct file *file'?
Thanks for catching this. Will correct this in the next revision.
+ u32 *attach_info, const char **buf, + u64 *probe_offset, u64 *probe_addr) +{ + return -EOPNOTSUPP; +} #endifenum {@@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int flags); #ifdef CONFIG_KPROBE_EVENTS extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); +extern int bpf_get_kprobe_info(const struct perf_event *event, + u32 *attach_info, const char **symbol, + u64 *probe_offset, u64 *probe_addr, + bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_uprobe_destroy(struct perf_event *event); +extern int bpf_get_uprobe_info(const struct perf_event *event, + u32 *attach_info, const char **filename, + u64 *probe_offset, bool perf_type_tracepoint); #endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97446bb..a602150 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -97,6 +97,7 @@ enum bpf_cmd { BPF_RAW_TRACEPOINT_OPEN, BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, };enum bpf_map_type {@@ -379,6 +380,22 @@ union bpf_attr { __u32 btf_log_size; __u32 btf_log_level; }; + + struct { + int pid; /* input: pid */ + int fd; /* input: fd */Should fd and pid be always positive? The current fd (like map_fd) in bpf_attr is using __u32.
Will change both pid and fd to __u32. In kernel fd is unsigned, but pid_t is actually an int. The negative pid is often referred to the process group the pid is in. Since here, we are only concerned with actual process pid, make __u32 should be okay.
+ __u32 flags; /* input: flags */ + __u32 buf_len; /* input: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 attach_info; /* output: BPF_ATTACH_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; } __attribute__((aligned(8)));/* The description below is an attempt at providing documentation to eBPF@@ -2458,4 +2475,14 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ };+/* used by <task, fd> based query */+enum {Nit. Instead of a comment, is it better to give this enum a descriptive name?
Yes, will add an enum name bpf_task_fd_info to make it easy for correlation with task_fd_query.
+ BPF_ATTACH_RAW_TRACEPOINT, /* tp name */ + BPF_ATTACH_TRACEPOINT, /* tp name */ + BPF_ATTACH_KPROBE, /* (symbol + offset) or addr */ + BPF_ATTACH_KRETPROBE, /* (symbol + offset) or addr */ + BPF_ATTACH_UPROBE, /* filename + offset */ + BPF_ATTACH_URETPROBE, /* filename + offset */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bfcde94..9356f0e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,7 +18,9 @@ #include <linux/vmalloc.h> #include <linux/mmzone.h> #include <linux/anon_inodes.h> +#include <linux/fdtable.h> #include <linux/file.h> +#include <linux/fs.h> #include <linux/license.h> #include <linux/filter.h> #include <linux/version.h> @@ -2102,6 +2104,125 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) return btf_get_fd_by_id(attr->btf_id); }+static int bpf_task_fd_query_copy(const union bpf_attr *attr,+ union bpf_attr __user *uattr, + u32 prog_id, u32 attach_info, + const char *buf, u64 probe_offset, + u64 probe_addr) +{ + __u64 __user *ubuf;Nit. ubuf is a string instead of an array of __u64?
will change it to void *.
+ int len; + + ubuf = u64_to_user_ptr(attr->task_fd_query.buf); + if (buf) { + len = strlen(buf); + if (attr->task_fd_query.buf_len < len + 1)I think the current convention is to take the min, copy whatever it can to buf and return the real len/size in buf_len. F.e., the prog_ids and prog_cnt in __cgroup_bpf_query(). Should the same be done here or it does not make sense to truncate the string? The user may/may not need the tailing char though if its pretty print has limited width anyway. The user still needs to know what the buf_len should be to retry also but I guess any reasonable buf_len should work?
Make sense, will make buf_len input/output and copy the actually needed length to buf_len and back to user.
+ return -ENOSPC; + if (copy_to_user(ubuf, buf, len + 1)) + return -EFAULT; + } else if (attr->task_fd_query.buf_len) { + /* copy '\0' to ubuf */ + __u8 zero = 0; + + if (copy_to_user(ubuf, &zero, 1)) + return -EFAULT; + } + + if (copy_to_user(&uattr->task_fd_query.prog_id, &prog_id, + sizeof(prog_id)) || + copy_to_user(&uattr->task_fd_query.attach_info, &attach_info, + sizeof(attach_info)) || + copy_to_user(&uattr->task_fd_query.probe_offset, &probe_offset, + sizeof(probe_offset)) || + copy_to_user(&uattr->task_fd_query.probe_addr, &probe_addr, + sizeof(probe_addr)))Nit. put_user() may be able to shorten them.
Indeed, thanks for suggestion.
+ return -EFAULT; + + return 0; +} + +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr + +static int bpf_task_fd_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + pid_t pid = attr->task_fd_query.pid; + int fd = attr->task_fd_query.fd; + const struct perf_event *event; + struct files_struct *files; + struct task_struct *task; + struct file *file; + int err; + + if (CHECK_ATTR(BPF_TASK_FD_QUERY)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->task_fd_query.flags != 0)How flags is used?
flags is not used yet, but for future extension. For example, it could be used to query a specific type of files (raw_tracepoint vs. perf-event based, etc.).
+ return -EINVAL; + + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); + if (!files) + return -ENOENT; + + err = 0; + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (!file) + err = -EBADF; + else + get_file(file); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (err) + goto out; + + if (file->f_op == &bpf_raw_tp_fops) { + struct bpf_raw_tracepoint *raw_tp = file->private_data; + struct bpf_raw_event_map *btp = raw_tp->btp; + + if (!raw_tp->prog) + err = -ENOENT; + else + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->prog->aux->id, + BPF_ATTACH_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + + event = perf_get_event(file); + if (!IS_ERR(event)) { + u64 probe_offset, probe_addr; + u32 prog_id, attach_info; + const char *buf; + + err = bpf_get_perf_event_info(event, &prog_id, &attach_info, + &buf, &probe_offset, + &probe_addr); + if (!err) + err = bpf_task_fd_query_copy(attr, uattr, prog_id, + attach_info, buf, + probe_offset, + probe_addr); + goto put_file; + } + + err = -ENOTSUPP; +put_file: + fput(file); +out: + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -2188,6 +2309,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); break; + case BPF_TASK_FD_QUERY: + err = bpf_task_fd_query(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ce2cbbf..323c80e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,6 +14,7 @@ #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/kprobes.h> +#include <linux/syscalls.h> #include <linux/error-injection.h>#include "trace_probe.h"@@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) mutex_unlock(&bpf_event_mutex); return err; } + +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *attach_info, const char **buf, + u64 *probe_offset, u64 *probe_addr) +{ + bool is_tracepoint, is_syscall_tp; + struct bpf_prog *prog; + int flags, err = 0; + + prog = event->prog; + if (!prog) + return -ENOENT; + + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) + return -EOPNOTSUPP; + + *prog_id = prog->aux->id; + flags = event->tp_event->flags; + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; + is_syscall_tp = is_syscall_trace_event(event->tp_event); + + if (is_tracepoint || is_syscall_tp) { + *buf = is_tracepoint ? event->tp_event->tp->name + : event->tp_event->name; + *attach_info = BPF_ATTACH_TRACEPOINT; + *probe_offset = 0x0; + *probe_addr = 0x0; + } else { + /* kprobe/uprobe */ + err = -EOPNOTSUPP; +#ifdef CONFIG_KPROBE_EVENTS + if (flags & TRACE_EVENT_FL_KPROBE) + err = bpf_get_kprobe_info(event, attach_info, buf, + probe_offset, probe_addr, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (flags & TRACE_EVENT_FL_UPROBE) + err = bpf_get_uprobe_info(event, attach_info, buf, + probe_offset, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif + } + + return err; +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 02aed76..32e9190 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); + +int bpf_get_kprobe_info(const struct perf_event *event, u32 *attach_info, + const char **symbol, u64 *probe_offset, + u64 *probe_addr, bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_kprobe *tk; + + if (perf_type_tracepoint) + tk = find_trace_kprobe(pevent, group); + else + tk = event->tp_event->data; + if (!tk) + return -EINVAL; + + *attach_info = trace_kprobe_is_return(tk) ? BPF_ATTACH_KRETPROBE + : BPF_ATTACH_KPROBE; + if (tk->symbol) { + *symbol = tk->symbol; + *probe_offset = tk->rp.kp.offset; + *probe_addr = 0; + } else { + *symbol = NULL; + *probe_offset = 0; + *probe_addr = (unsigned long)tk->rp.kp.addr; + } + return 0; +} #endif /* CONFIG_PERF_EVENTS *//*diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ac89287..12a3667 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, { __uprobe_perf_func(tu, func, regs, ucb, dsize); } + +int bpf_get_uprobe_info(const struct perf_event *event, u32 *attach_info, + const char **filename, u64 *probe_offset, + bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_uprobe *tu; + + if (perf_type_tracepoint) + tu = find_probe_event(pevent, group); + else + tu = event->tp_event->data; + if (!tu) + return -EINVAL; + + *attach_info = is_ret_probe(tu) ? BPF_ATTACH_URETPROBE + : BPF_ATTACH_UPROBE; + *filename = tu->filename; + *probe_offset = tu->offset; + return 0; +} #endif /* CONFIG_PERF_EVENTS */static int-- 2.9.5