Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map only stores pointers to struct perf_event. The
user-space perf event FDs obtained from the perf_event_open()
syscall are converted to pointers to struct perf_event and
stored in the map.

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 include/linux/bpf.h        |   2 +
 include/linux/perf_event.h |   2 +
 include/uapi/linux/bpf.h   |   1 +
 kernel/bpf/arraymap.c      | 113 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c      |  15 ++++++
 kernel/events/core.c       |  23 +++++++++
 6 files changed, 156 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476..9cf74c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
 #include <uapi/linux/bpf.h>
 #include <linux/workqueue.h>
 #include <linux/file.h>
+#include <linux/perf_event.h>
 
 struct bpf_map;
 
@@ -143,6 +144,7 @@ struct bpf_array {
        union {
                char value[0] __aligned(8);
                struct bpf_prog *prog[0] __aligned(8);
+               struct perf_event *events[0] __aligned(8);
        };
 };
 #define MAX_TAIL_CALL_CNT 32
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809..2ea4067 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,7 @@ extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
@@ -979,6 +980,7 @@ static inline int perf_event_init_task(struct task_struct 
*child)   { return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)     { }
 static inline void perf_event_free_task(struct task_struct *task)      { }
 static inline void perf_event_delayed_put(struct task_struct *task)    { }
+/* stub must be 'static inline' like its neighbors, otherwise every TU that
+ * includes this header without CONFIG_PERF_EVENTS gets a defined-but-unused
+ * static function (and a warning)
+ */
+static inline struct perf_event *perf_event_get(unsigned int fd)	{ return NULL; }
 static inline void perf_event_print_debug(void)                                
{ }
 static inline int perf_event_task_disable(void)                                
{ return -EINVAL; }
 static inline int perf_event_task_enable(void)                         { 
return -EINVAL; }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f9..69a1f6b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
        BPF_MAP_TYPE_HASH,
        BPF_MAP_TYPE_ARRAY,
        BPF_MAP_TYPE_PROG_ARRAY,
+       BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..e97efbc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -255,3 +255,116 @@ static int __init register_prog_array_map(void)
        return 0;
 }
 late_initcall(register_prog_array_map);
+
+static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
+{
+       /* the value written from user space is a perf event fd (u32);
+        * it is converted to a struct perf_event pointer on update
+        */
+       if (attr->value_size != sizeof(u32))
+               return ERR_PTR(-EINVAL);
+
+       return array_map_alloc(attr);
+}
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+       struct bpf_array *array = container_of(map, struct bpf_array, map);
+       struct perf_event *event;
+       int i;
+
+       /* wait for outstanding RCU-protected map accesses to finish */
+       synchronize_rcu();
+       /* drop the reference held on each stored struct perf_event */
+       for (i = 0; i < array->map.max_entries; i++) {
+               event = array->events[i];
+               if (event)
+                       perf_event_release_kernel(event);
+       }
+       kvfree(array);
+}
+
+/* iterating over a perf_event_array map is not supported */
+static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
+                                            void *next_key)
+{
+       return -EINVAL;
+}
+
+/* direct value lookup is not allowed; programs reach the stored
+ * events only through bpf_perf_event_read() (see verifier check)
+ */
+static void *perf_event_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+       return NULL;
+}
+
+/* convert a perf event fd (passed as map value) into a struct perf_event
+ * pointer; on success a reference is held that the caller must drop with
+ * perf_event_release_kernel()
+ */
+static struct perf_event *convert_map_with_perf_event(void *value)
+{
+       struct perf_event *event;
+       u32 fd;
+
+       fd = *(u32 *)value;
+
+       /* use IS_ERR_OR_NULL: the !CONFIG_PERF_EVENTS stub returns NULL,
+        * which a bare IS_ERR() check would miss and then deref below
+        */
+       event = perf_event_get(fd);
+       if (IS_ERR_OR_NULL(event))
+               return NULL;
+
+       /* limit the event type to PERF_TYPE_RAW
+        * and PERF_TYPE_HARDWARE.
+        */
+       if (event->attr.type != PERF_TYPE_RAW &&
+           event->attr.type != PERF_TYPE_HARDWARE) {
+               /* drop the reference taken by perf_event_get(),
+                * otherwise the event refcount leaks
+                */
+               perf_event_release_kernel(event);
+               return NULL;
+       }
+
+       return event;
+}
+
+/* only called from syscall; each slot may be populated at most once */
+static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
+                                           void *value, u64 map_flags)
+{
+       struct bpf_array *array = container_of(map, struct bpf_array, map);
+       struct perf_event *event, *old;
+       u32 index = *(u32 *)key;
+
+       if (map_flags != BPF_ANY)
+               return -EINVAL;
+
+       if (index >= array->map.max_entries)
+               return -E2BIG;
+
+       /* convert the fd to the pointer to struct perf_event;
+        * on success we hold a reference on the event
+        */
+       event = convert_map_with_perf_event(value);
+       if (!event)
+               return -EBADF;
+
+       /* claim the slot atomically: a plain "check then xchg" races with
+        * a concurrent update to the same index and would leak the
+        * reference held by the overwritten event
+        */
+       old = cmpxchg(array->events + index, NULL, event);
+       if (old) {
+               /* slot already populated; drop our reference */
+               perf_event_release_kernel(event);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/* deleting entries is not supported; the stored events are only
+ * released when the whole map is freed
+ */
+static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+       return -EINVAL;
+}
+
+/* lookup/get_next_key/delete are stubs: the map is manipulated from
+ * user space via syscall only, and programs read counters through
+ * bpf_perf_event_read()
+ */
+static const struct bpf_map_ops perf_event_array_ops = {
+       .map_alloc = perf_event_array_map_alloc,
+       .map_free = perf_event_array_map_free,
+       .map_get_next_key = perf_event_array_map_get_next_key,
+       .map_lookup_elem = perf_event_array_map_lookup_elem,
+       .map_update_elem = perf_event_array_map_update_elem,
+       .map_delete_elem = perf_event_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+       .ops = &perf_event_array_ops,
+       .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+/* register the map type with the bpf core at boot */
+static int __init register_perf_event_array_map(void)
+{
+       bpf_register_map_type(&perf_event_array_type);
+       return 0;
+}
+late_initcall(register_perf_event_array_map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866..c70f7e7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -924,6 +924,21 @@ static int check_call(struct verifier_env *env, int 
func_id)
                 */
                return -EINVAL;
 
+       if (map && map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+           func_id != BPF_FUNC_perf_event_read)
+               /* perf_event_array map type needs extra care:
+                * only allow to pass it into bpf_perf_event_read() for now.
+                * bpf_map_update/delete_elem() must only be done via syscall
+                */
+               return -EINVAL;
+
+       if (func_id == BPF_FUNC_perf_event_read &&
+           (!map || map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY))
+               /* don't allow any other map type to be passed into
+                * bpf_perf_event_read(); guard map against NULL, which
+                * the first check above already anticipates
+                */
+               return -EINVAL;
+
        return 0;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae34..08cb467 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8574,6 +8574,29 @@ void perf_event_delayed_put(struct task_struct *task)
                WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
 }
 
+/* resolve a perf event fd to its struct perf_event and take a reference.
+ * Returns ERR_PTR(-EBADF) for a bad fd, ERR_PTR(-EINVAL) for a non-perf
+ * fd. The caller must drop the reference with perf_event_release_kernel().
+ */
+struct perf_event *perf_event_get(unsigned int fd)
+{
+       struct perf_event *event;
+       struct fd f;
+
+       f = fdget(fd);
+
+       if (!f.file)
+               return ERR_PTR(-EBADF);
+
+       /* reject fds that are not perf events */
+       if (f.file->f_op != &perf_fops) {
+               fdput(f);
+               return ERR_PTR(-EINVAL);
+       }
+
+       event = f.file->private_data;
+
+       /* the fd holds a reference, so the count is known non-zero here */
+       atomic_long_inc(&event->refcount);
+       fdput(f);
+
+       return event;
+}
+
 /*
  * inherit a event from parent task to child task:
  */
-- 
1.8.3.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to