[PATCH V5 1/1] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling
This patch adds the flag soft_enable to control the trace data output process when perf sampling. By setting this flag and integrating with ebpf, we can control the data output process and get the samples we are most interested in. The bpf helper bpf_perf_event_control() can control either the perf event on current cpu or all the perf events stored in the maps by checking the third parameter 'flags'. Signed-off-by: Kaixu Xia <xiaka...@huawei.com> --- include/linux/perf_event.h | 1 + include/uapi/linux/bpf.h| 11 include/uapi/linux/perf_event.h | 3 +- kernel/bpf/verifier.c | 3 +- kernel/events/core.c| 13 + kernel/trace/bpf_trace.c| 62 + 6 files changed, 91 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 092a0e8..bb3bf87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -472,6 +472,7 @@ struct perf_event { struct irq_work pending; atomic_tevent_limit; + atomic_tsoft_enable; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 564f1f0..164d2a9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -287,6 +287,17 @@ enum bpf_func_id { * Return: realm if != 0 */ BPF_FUNC_get_route_realm, + + /** +* u64 bpf_perf_event_control(, index, flags) - control perf events in maps +* @map: pointer to PERF_EVENT_ARRAY maps +* @index: the key of perf event +* @flags: bit 0 - if true, dump event data on current cpu +* bit 1 - if true, control all the events in maps +* other bits - reserved +* Return: 0 on success +*/ + BPF_FUNC_perf_event_control, __BPF_FUNC_MAX_ID, }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 2881145..a791b03 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -331,7 +331,8 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid: 1, /* use @clockid for time fields */ 
context_switch : 1, /* context switch data */ - __reserved_1 : 37; + soft_disable : 1, /* output data on samples by default */ + __reserved_1 : 36; union { __u32 wakeup_events;/* wakeup every n events */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d6b97b..ffec14b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -245,6 +245,7 @@ static const struct { } func_limit[] = { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_control}, }; static void print_verifier_state(struct verifier_env *env) @@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_map != bool_func) + if (bool_func && bool_map != bool_func) return -EINVAL; } diff --git a/kernel/events/core.c b/kernel/events/core.c index b11756f..5219635 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } + if (unlikely(!atomic_read(&event->soft_enable))) + return 0; + if (event->overflow_handler) event->overflow_handler(event, data, regs); else @@ -7709,6 +7712,14 @@ static void account_event(struct perf_event *event) account_event_cpu(event, event->cpu); } +static void perf_event_check_dump_flag(struct perf_event *event) +{ + if (event->attr.soft_disable == 1) + atomic_set(&event->soft_enable, 0); + else + atomic_set(&event->soft_enable, 1); +} + /* * Allocate and initialize a event structure */ @@ -7840,6 +7851,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + perf_event_check_dump_flag(event); + return event; err_per_task: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0fe96c7..398ed94 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -215,6 +215,66 @@ const struct bpf_func_proto 
bpf_perf_event_read_proto = {
[PATCH V5 0/1] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling
Previous patch V4 url: https://lkml.org/lkml/2015/10/19/247 This patchset introduces the new perf_event_attr attribute 'soft_disable'. The existing 'disabled' flag doesn't meet the requirements. The cpu_function_call is too much to do from bpf program and we control the perf_event stored in maps like soft_disable, so if the 'disabled' flag is set to true, we can't enable/disable the perf event by bpf programs. changes in V5: - move the bpf helper parameter 'flags' definition to bpf_trace.c and document the flags bits in uapi header. changes in V4: - make the naming more proper; - fix the initial value set of attr->soft_disable bug; - add unlikely() to the check of event->soft_enable; - squash the 2nd patch into 1st patch; changes in V3: - make the flag name and condition check consistent; - check the bpf helper flag only bit 0 and check all other bits are reserved; - use atomic_dec_if_positive() and atomic_inc_unless_negative(); - make bpf_perf_event_dump_control_proto be static; - remove the ioctl PERF_EVENT_IOC_SET_ENABLER and 'enabler' event; - implement the function that controls all the perf events stored in PERF_EVENT_ARRAY maps by setting the parameter 'index' to maps max_entries; changes in V2: - rebase the whole patch set to net-next tree(4b418bf); - remove the added flag perf_sample_disable in bpf_map; - move the added fields in structure perf_event to proper place to avoid cacheline miss; - use counter based flag instead of 0/1 switcher in considering of reentering events; - use a single helper bpf_perf_event_sample_control() to enable/ disable events; - implement a light-weight solution to control the trace data output on current cpu; - create a new ioctl PERF_EVENT_IOC_SET_ENABLER to enable/disable a set of events; Before this patch, $ ./perf record -e cycles -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 527 of event 'cycles' # Event 
count (approx.): 87824857 ... After this patch, $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 22 of event 'cycles' # Event count (approx.): 4213922 ... The bpf program example: struct bpf_map_def SEC("maps") my_cycles_map = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = 32, }; SEC("enter=sys_write") int bpf_prog_1(struct pt_regs *ctx) { bpf_perf_event_control(&my_cycles_map, 0, 3); return 0; } SEC("exit=sys_write%return") int bpf_prog_2(struct pt_regs *ctx) { bpf_perf_event_control(&my_cycles_map, 0, 2); return 0; } When controlling sampling at function level, we have to set a high sample frequency to dump trace data when enabling/disabling the perf event on the current cpu. Kaixu Xia (1): bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling include/linux/perf_event.h | 1 + include/uapi/linux/bpf.h| 11 include/uapi/linux/perf_event.h | 3 +- kernel/bpf/verifier.c | 3 +- kernel/events/core.c| 13 + kernel/trace/bpf_trace.c| 62 + 6 files changed, 91 insertions(+), 2 deletions(-) -- 1.8.3.4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 1/1] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling
This patch adds the flag soft_enable to control the trace data output process when perf sampling. By setting this flag and integrating with ebpf, we can control the data output process and get the samples we are most interested in. The bpf helper bpf_perf_event_control() can control either the perf event on current cpu or all the perf events stored in the maps by checking the third parameter 'flag'. Signed-off-by: Kaixu Xia <xiaka...@huawei.com> --- include/linux/perf_event.h | 1 + include/uapi/linux/bpf.h| 19 +++ include/uapi/linux/perf_event.h | 3 ++- kernel/bpf/verifier.c | 3 ++- kernel/events/core.c| 13 +++ kernel/trace/bpf_trace.c| 51 + 6 files changed, 88 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 092a0e8..bb3bf87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -472,6 +472,7 @@ struct perf_event { struct irq_work pending; atomic_tevent_limit; + atomic_tsoft_enable; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 564f1f0..a2b0d9d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,20 @@ enum bpf_prog_type { #define BPF_NOEXIST1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +/* flags for PERF_EVENT_ARRAY maps*/ +enum { + BPF_EVENT_CTL_BIT_CUR = 0, + BPF_EVENT_CTL_BIT_ALL = 1, + __NR_BPF_EVENT_CTL_BITS, +}; + +#defineBPF_CTL_BIT_FLAG_MASK \ + ((1ULL << __NR_BPF_EVENT_CTL_BITS) - 1) +#defineBPF_CTL_BIT_DUMP_CUR \ + (1ULL << BPF_EVENT_CTL_BIT_CUR) +#defineBPF_CTL_BIT_DUMP_ALL \ + (1ULL << BPF_EVENT_CTL_BIT_ALL) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -287,6 +301,11 @@ enum bpf_func_id { * Return: realm if != 0 */ BPF_FUNC_get_route_realm, + + /** +* u64 bpf_perf_event_control(, index, flag) +*/ + 
BPF_FUNC_perf_event_control, __BPF_FUNC_MAX_ID, }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 2881145..a791b03 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -331,7 +331,8 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid: 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - __reserved_1 : 37; + soft_disable : 1, /* output data on samples by default */ + __reserved_1 : 36; union { __u32 wakeup_events;/* wakeup every n events */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d6b97b..ffec14b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -245,6 +245,7 @@ static const struct { } func_limit[] = { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_control}, }; static void print_verifier_state(struct verifier_env *env) @@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_map != bool_func) + if (bool_func && bool_map != bool_func) return -EINVAL; } diff --git a/kernel/events/core.c b/kernel/events/core.c index b11756f..5219635 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(>pending); } + if (unlikely(!atomic_read(>soft_enable))) + return 0; + if (event->overflow_handler) event->overflow_handler(event, data, regs); else @@ -7709,6 +7712,14 @@ static void account_event(struct perf_event *event) account_event_cpu(event, event->cpu); } +static void perf_event_check_dump_flag(struct perf_event *event) +{ + if (event->attr.soft_disable == 1) + atomic_set(>soft_enable, 0); + else + atomic_set(>soft_enable, 1); +} + /* * Allocate and 
initialize a event structure */ @@ -7840,6
[PATCH V4 0/1] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling
Previous patch V3 url: https://lkml.org/lkml/2015/10/16/101 This patchset introduces the new perf_event_attr attribute 'soft_disable'. The existing 'disabled' flag doesn't meet the requirements. The cpu_function_call is too much to do from bpf program and we control the perf_event stored in maps like soft_disable, so if the 'disabled' flag is set to true, we can't enable/disable the perf event by bpf programs. changes in V4: - make the naming more proper; - fix the initial value set of attr->soft_disable bug; - add unlikely() to the check of event->soft_enable; - squash the 2nd patch into 1st patch; changes in V3: - make the flag name and condition check consistent; - check the bpf helper flag only bit 0 and check all other bits are reserved; - use atomic_dec_if_positive() and atomic_inc_unless_negative(); - make bpf_perf_event_dump_control_proto be static; - remove the ioctl PERF_EVENT_IOC_SET_ENABLER and 'enabler' event; - implement the function that controls all the perf events stored in PERF_EVENT_ARRAY maps by setting the parameter 'index' to maps max_entries; changes in V2: - rebase the whole patch set to net-next tree(4b418bf); - remove the added flag perf_sample_disable in bpf_map; - move the added fields in structure perf_event to proper place to avoid cacheline miss; - use counter based flag instead of 0/1 switcher in considering of reentering events; - use a single helper bpf_perf_event_sample_control() to enable/ disable events; - implement a light-weight solution to control the trace data output on current cpu; - create a new ioctl PERF_EVENT_IOC_SET_ENABLER to enable/disable a set of events; Before this patch, $ ./perf record -e cycles -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 527 of event 'cycles' # Event count (approx.): 87824857 ... 
After this patch, $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 22 of event 'cycles' # Event count (approx.): 4213922 ... The bpf program example: struct bpf_map_def SEC("maps") my_cycles_map = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = 32, }; SEC("enter=sys_write") int bpf_prog_1(struct pt_regs *ctx) { bpf_perf_event_control(&my_cycles_map, 0, 2); return 0; } SEC("exit=sys_write%return") int bpf_prog_2(struct pt_regs *ctx) { bpf_perf_event_control(&my_cycles_map, 0, 3); return 0; } When controlling sampling at function level, we have to set a high sample frequency to dump trace data when enabling/disabling the perf event on the current cpu. Kaixu Xia (1): bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling include/linux/perf_event.h | 1 + include/uapi/linux/bpf.h| 19 +++ include/uapi/linux/perf_event.h | 3 ++- kernel/bpf/verifier.c | 3 ++- kernel/events/core.c| 13 +++ kernel/trace/bpf_trace.c| 51 + 6 files changed, 88 insertions(+), 2 deletions(-) -- 1.8.3.4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3 2/2] bpf: control all the perf events stored in PERF_EVENT_ARRAY maps
This patch implements the function that controls all the perf events stored in PERF_EVENT_ARRAY maps by setting the parameter 'index' to maps max_entries. Signed-off-by: Kaixu Xia <xiaka...@huawei.com> --- kernel/trace/bpf_trace.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3175600..4b385863 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -229,13 +229,30 @@ static u64 bpf_perf_event_dump_control(u64 r1, u64 index, u64 flag, u64 r4, u64 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct perf_event *event; + int i; - if (unlikely(index >= array->map.max_entries)) + if (unlikely(index > array->map.max_entries)) return -E2BIG; if (flag & BIT_FLAG_CHECK) return -EINVAL; + if (index == array->map.max_entries) { + bool dump_control = flag & BIT_DUMP_CTL; + + for (i = 0; i < array->map.max_entries; i++) { + event = (struct perf_event *)array->ptrs[i]; + if (!event) + continue; + + if (dump_control) + atomic_dec_if_positive(&event->dump_enable); + else + atomic_inc_unless_negative(&event->dump_enable); + } + return 0; + } + event = (struct perf_event *)array->ptrs[index]; if (!event) return -ENOENT; @@ -244,7 +261,6 @@ static u64 bpf_perf_event_dump_control(u64 r1, u64 index, u64 flag, u64 r4, u64 atomic_dec_if_positive(&event->dump_enable); else atomic_inc_unless_negative(&event->dump_enable); - return 0; } -- 1.8.3.4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3 0/2] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling
Previous patch V2 url: https://lkml.org/lkml/2015/10/14/347 This patchset introduces the new perf_event_attr attribute 'dump_enable'. The already existed 'disabled' flag doesn't meet the requirements. The cpu_function_call is too much to do from bpf program and we control the perf_event stored in maps like soft_disable, so if the 'disabled' flag is set to true, we can't enable/disable the perf event by bpf programs. changes in V3: - make the flag name and condition check consistent; - check the bpf helper flag only bit 0 and check all other bits are reserved; - use atomic_dec_if_positive() and atomic_inc_unless_negative(); - make bpf_perf_event_dump_control_proto be static; - remove the ioctl PERF_EVENT_IOC_SET_ENABLER and 'enabler' event; - implement the function that controlling all the perf events stored in PERF_EVENT_ARRAY maps by setting the parameter 'index' to maps max_entries; changes in V2: - rebase the whole patch set to net-next tree(4b418bf); - remove the added flag perf_sample_disable in bpf_map; - move the added fields in structure perf_event to proper place to avoid cacheline miss; - use counter based flag instead of 0/1 switcher in considering of reentering events; - use a single helper bpf_perf_event_sample_control() to enable/ disable events; - implement a light-weight solution to control the trace data output on current cpu; - create a new ioctl PERF_EVENT_IOC_SET_ENABLER to enable/disable a set of events; Before this patch, $ ./perf record -e cycles -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 643 of event 'cycles' # Event count (approx.): 128313904 ... 
After this patch, $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 25 of event 'cycles' # Event count (approx.): 5788400 ... The bpf program example: struct bpf_map_def SEC("maps") my_cycles_map = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = 32, }; SEC("enter=sys_write") int bpf_prog_1(struct pt_regs *ctx) { bpf_perf_event_dump_control(&my_cycles_map, 32, 0); return 0; } SEC("exit=sys_write%return") int bpf_prog_2(struct pt_regs *ctx) { bpf_perf_event_dump_control(&my_cycles_map, 32, 1); return 0; } When controlling sampling at function level, we have to set a high sample frequency to dump trace data when enabling/disabling the perf event on the current cpu. Kaixu Xia (2): bpf: control the trace data output on current cpu when perf sampling bpf: control all the perf events stored in PERF_EVENT_ARRAY maps include/linux/perf_event.h | 1 + include/uapi/linux/bpf.h| 5 include/uapi/linux/perf_event.h | 3 ++- kernel/bpf/verifier.c | 3 ++- kernel/events/core.c| 13 + kernel/trace/bpf_trace.c| 60 + 6 files changed, 83 insertions(+), 2 deletions(-) -- 1.8.3.4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3 1/2] bpf: control the trace data output on current cpu when perf sampling
This patch adds the flag dump_enable to control the trace data output process when perf sampling. By setting this flag and integrating with ebpf, we can control the data output process and get the samples we are most interested in. The bpf helper bpf_perf_event_dump_control() can control the perf_event on current cpu. Signed-off-by: Kaixu Xia <xiaka...@huawei.com> --- include/linux/perf_event.h | 1 + include/uapi/linux/bpf.h| 5 + include/uapi/linux/perf_event.h | 3 ++- kernel/bpf/verifier.c | 3 ++- kernel/events/core.c| 13 kernel/trace/bpf_trace.c| 44 + 6 files changed, 67 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 092a0e8..2af527e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -472,6 +472,7 @@ struct perf_event { struct irq_work pending; atomic_tevent_limit; + atomic_tdump_enable; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 564f1f0..ba08034 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -287,6 +287,11 @@ enum bpf_func_id { * Return: realm if != 0 */ BPF_FUNC_get_route_realm, + + /** +* u64 bpf_perf_event_dump_control(, index, flag) +*/ + BPF_FUNC_perf_event_dump_control, __BPF_FUNC_MAX_ID, }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 2881145..f4b8f08 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -331,7 +331,8 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid: 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - __reserved_1 : 37; + dump_enable: 1, /* don't output data on samples */ + __reserved_1 : 36; union { __u32 wakeup_events;/* wakeup every n events */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d6b97b..26b55f2 100644 --- a/kernel/bpf/verifier.c +++ 
b/kernel/bpf/verifier.c @@ -245,6 +245,7 @@ static const struct { } func_limit[] = { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_dump_control}, }; static void print_verifier_state(struct verifier_env *env) @@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_map != bool_func) + if (bool_func && bool_map != bool_func) return -EINVAL; } diff --git a/kernel/events/core.c b/kernel/events/core.c index b11756f..74a16af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(>pending); } + if (!atomic_read(>dump_enable)) + return ret; + if (event->overflow_handler) event->overflow_handler(event, data, regs); else @@ -7709,6 +7712,14 @@ static void account_event(struct perf_event *event) account_event_cpu(event, event->cpu); } +static void perf_event_check_dump_flag(struct perf_event *event) +{ + if (event->attr.dump_enable == 1) + atomic_set(>dump_enable, 1); + else + atomic_set(>dump_enable, 0); +} + /* * Allocate and initialize a event structure */ @@ -7840,6 +7851,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + perf_event_check_dump_flag(event); + return event; err_per_task: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0fe96c7..3175600 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -215,6 +215,48 @@ const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +/* flags for PERF_EVENT_ARRAY maps*/ +enum { + PERF_EVENT_CTL_BIT_DUMP = 0, + _NR_PERF_EVENT_CTL_BITS, +}; + +#defineBIT_FLAG_CHECK GENMASK_ULL(63, _NR_PERF_EVENT_CTL_BITS) +#defineBIT_DUMP_CTLBIT_ULL(PERF_EVENT_CTL_BIT_DUMP) + +static u64 bpf_perf_event_dump_control(u64 r1, u64 
index, u64 flag, u64 r4, u64 r5) +{ + struct bpf_map *map =
[PATCH V2 2/2] bpf: control a set of perf events by creating a new ioctl PERF_EVENT_IOC_SET_ENABLER
This patch creates a new ioctl PERF_EVENT_IOC_SET_ENABLER to let perf to select an event as 'enabler'. So we can set this 'enabler' event to enable/disable a set of events. The event on CPU 0 is treated as the 'enabler' event by default. Signed-off-by: Kaixu Xia <xiaka...@huawei.com> --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 1 + kernel/events/core.c| 42 - kernel/trace/bpf_trace.c| 5 - 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dcbf7d5..bc9fe77 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -473,6 +473,7 @@ struct perf_event { atomic_tevent_limit; atomic_tsample_disable; + atomic_t*p_sample_disable; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index a2b9dd7..3b4fb90 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -393,6 +393,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_SET_ENABLER _IO ('$', 9) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/events/core.c b/kernel/events/core.c index 942351c..03d2594 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4152,6 +4152,7 @@ static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_event_set_sample_enabler(struct perf_event *event, u32 enabler_fd); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -4208,6 +4209,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case 
PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); + case PERF_EVENT_IOC_SET_ENABLER: + return perf_event_set_sample_enabler(event, arg); + default: return -ENOTTY; } @@ -6337,7 +6341,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(>pending); } - if (!atomic_read(>sample_disable)) + if (!atomic_read(event->p_sample_disable)) return ret; if (event->overflow_handler) @@ -6989,6 +6993,35 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return 0; } +static int perf_event_set_sample_enabler(struct perf_event *event, u32 enabler_fd) +{ + int ret; + struct fd enabler; + struct perf_event *enabler_event; + + if (enabler_fd == -1) + return 0; + + ret = perf_fget_light(enabler_fd, ); + if (ret) + return ret; + enabler_event = enabler.file->private_data; + if (event == enabler_event) { + fdput(enabler); + return 0; + } + + /* they must be on the same PMU*/ + if (event->pmu != enabler_event->pmu) { + fdput(enabler); + return -EINVAL; + } + + event->p_sample_disable = _event->sample_disable; + fdput(enabler); + return 0; +} + static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; @@ -7023,6 +7056,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -ENOENT; } +static int perf_event_set_sample_enabler(struct perf_event *event, u32 group_fd) +{ + return -ENOENT; +} + static void perf_event_free_bpf_prog(struct perf_event *event) { } @@ -7718,6 +7756,8 @@ static void perf_event_check_sample_flag(struct perf_event *event) atomic_set(>sample_disable, 0); else atomic_set(>sample_disable, 1); + + event->p_sample_disable = >sample_disable; } /* diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f261333..d012be3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -221,9 +221,12 @@ static u64 bpf_perf_event_sample_control(u64 r1, u64 index, u64 flag, u64 r4, u6 struct bpf_array *array = container_of(map, 
struct bpf_array, map); struct perf_event *event; - if (unlikely(index >= array->map.max_entries)) + if (unlikely(index > array->map.max_entries)) return -E2BIG; +
[PATCH V2 1/2] bpf: control the trace data output on current cpu when perf sampling
This patch adds the flag sample_disable to control the trace data output process when perf sampling. By setting this flag and integrating with ebpf, we can control the data output process and get the samples we are most interested in. The bpf helper bpf_perf_event_sample_control() can control the perf_event on current cpu. Signed-off-by: Kaixu Xia <xiaka...@huawei.com> --- include/linux/perf_event.h | 1 + include/uapi/linux/bpf.h| 5 + include/uapi/linux/perf_event.h | 3 ++- kernel/bpf/verifier.c | 3 ++- kernel/events/core.c| 13 + kernel/trace/bpf_trace.c| 32 6 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 092a0e8..dcbf7d5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -472,6 +472,7 @@ struct perf_event { struct irq_work pending; atomic_tevent_limit; + atomic_tsample_disable; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 564f1f0..e2c99c6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -287,6 +287,11 @@ enum bpf_func_id { * Return: realm if != 0 */ BPF_FUNC_get_route_realm, + + /** +* u64 bpf_perf_event_sample_control(, index, flag) +*/ + BPF_FUNC_perf_event_sample_control, __BPF_FUNC_MAX_ID, }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 2881145..a2b9dd7 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -331,7 +331,8 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid: 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - __reserved_1 : 37; + sample_disable : 1, /* don't output data on samples */ + __reserved_1 : 36; union { __u32 wakeup_events;/* wakeup every n events */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d6b97b..3ffe630 100644 --- a/kernel/bpf/verifier.c +++ 
b/kernel/bpf/verifier.c @@ -245,6 +245,7 @@ static const struct { } func_limit[] = { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_sample_control}, }; static void print_verifier_state(struct verifier_env *env) @@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_map != bool_func) + if (bool_func && bool_map != bool_func) return -EINVAL; } diff --git a/kernel/events/core.c b/kernel/events/core.c index b11756f..942351c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(>pending); } + if (!atomic_read(>sample_disable)) + return ret; + if (event->overflow_handler) event->overflow_handler(event, data, regs); else @@ -7709,6 +7712,14 @@ static void account_event(struct perf_event *event) account_event_cpu(event, event->cpu); } +static void perf_event_check_sample_flag(struct perf_event *event) +{ + if (event->attr.sample_disable == 1) + atomic_set(>sample_disable, 0); + else + atomic_set(>sample_disable, 1); +} + /* * Allocate and initialize a event structure */ @@ -7840,6 +7851,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + perf_event_check_sample_flag(event); + return event; err_per_task: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0fe96c7..f261333 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -215,6 +215,36 @@ const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +static u64 bpf_perf_event_sample_control(u64 r1, u64 index, u64 flag, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if 
(unlikely(index >= array->map.max_entries)) + return
[PATCH V2 0/2] bpf: enable/disable events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling
Previous RFC patch url: https://lkml.org/lkml/2015/10/12/135 changes in V2: - rebase the whole patch set to net-next tree(4b418bf); - remove the added flag perf_sample_disable in bpf_map; - move the added fields in structure perf_event to proper place to avoid cacheline miss; - use a counter-based flag instead of a 0/1 switch to account for re-entrant events; - use a single helper bpf_perf_event_sample_control() to enable/disable events; - implement a light-weight solution to control the trace data output on current cpu; - create a new ioctl PERF_EVENT_IOC_SET_ENABLER to enable/disable a set of events; Before this patch, $ ./perf record -e cycles -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 643 of event 'cycles' # Event count (approx.): 128313904 ... After this patch, $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 25 of event 'cycles' # Event count (approx.): 5788400 ... The bpf program example: struct bpf_map_def SEC("maps") my_cycles_map = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = 32, }; SEC("enter=sys_write") int bpf_prog_1(struct pt_regs *ctx) { bpf_perf_event_sample_control(&my_cycles_map, 32, 0); return 0; } SEC("exit=sys_write%return") int bpf_prog_2(struct pt_regs *ctx) { bpf_perf_event_sample_control(&my_cycles_map, 32, 1); return 0; } When controlling sampling at function level, if the perf user side does not use the PERF_EVENT_IOC_SET_ENABLER ioctl, we must set a high sample frequency in order to dump trace data. 
Kaixu Xia (2): bpf: control the trace data output on current cpu when perf sampling bpf: control a set of perf events by creating a new ioctl PERF_EVENT_IOC_SET_ENABLER include/linux/perf_event.h | 2 ++ include/uapi/linux/bpf.h| 5 include/uapi/linux/perf_event.h | 4 +++- kernel/bpf/verifier.c | 3 ++- kernel/events/core.c| 53 + kernel/trace/bpf_trace.c| 35 +++ 6 files changed, 100 insertions(+), 2 deletions(-) -- 1.8.3.4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 0/2] bpf: enable/disable events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling
In some scenarios we don't want to output trace data during perf sampling, in order to reduce overhead. For example, perf can be run as a daemon that dumps trace data only when necessary, such as when system performance goes down. This patchset adds the helpers bpf_perf_event_sample_enable/disable() to implement this function. By applying these helpers, we can enable/disable the trace data output of events stored in PERF_EVENT_ARRAY maps and get the samples we are most interested in. We also need to make the perf user side able to add normal PMU events from the perf cmdline to PERF_EVENT_ARRAY maps. My colleague He Kuang is doing this work. In the following example, the cycles event will be stored in the PERF_EVENT_ARRAY map. Before this patch, $ ./perf record -e cycles -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 655 of event 'cycles' # Event count (approx.): 129323548 ... After this patch, $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a sleep 1 $ ./perf report --stdio # To display the perf.data header info, please use --header/--header-only option # # # Total Lost Samples: 0 # # Samples: 23 of event 'cycles' # Event count (approx.): 2064170 ... 
The bpf program example: struct bpf_map_def SEC("maps") my_cycles_map = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = 32, }; SEC("enter=sys_write") int bpf_prog_1(struct pt_regs *ctx) { bpf_perf_event_sample_enable(&my_cycles_map); return 0; } SEC("exit=sys_write%return") int bpf_prog_2(struct pt_regs *ctx) { bpf_perf_event_sample_disable(&my_cycles_map); return 0; } Kaixu Xia (2): perf: Add the flag sample_disable not to output data on samples bpf: Implement bpf_perf_event_sample_enable/disable() helpers include/linux/bpf.h| 3 +++ include/linux/perf_event.h | 2 ++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 5 + kernel/bpf/verifier.c | 4 +++- kernel/events/core.c | 3 +++ kernel/trace/bpf_trace.c | 34 ++ 7 files changed, 52 insertions(+), 1 deletion(-) -- 1.8.3.4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 1/2] perf: Add the flag sample_disable not to output data on samples
In some scenarios we don't want to output trace data when sampling to reduce overhead. This patch adds the flag sample_disable to implement this function. By setting this flag and integrating with ebpf, we can control the data output process and get the samples we are most interested in. Signed-off-by: Kaixu Xia <xiaka...@huawei.com> --- include/linux/bpf.h| 1 + include/linux/perf_event.h | 2 ++ kernel/bpf/arraymap.c | 5 + kernel/events/core.c | 3 +++ 4 files changed, 11 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f57d7fe..25e073d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,6 +39,7 @@ struct bpf_map { u32 max_entries; const struct bpf_map_ops *ops; struct work_struct work; + atomic_t perf_sample_disable; }; struct bpf_map_type_list { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 092a0e8..0606d1d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -483,6 +483,8 @@ struct perf_event { perf_overflow_handler_t overflow_handler; void*overflow_handler_context; + atomic_t*sample_disable; + #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 29ace10..4ae82c9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -51,6 +51,9 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->elem_size = elem_size; + if (attr->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY) + atomic_set(>map.perf_sample_disable, 1); + return >map; } @@ -298,6 +301,8 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) perf_event_release_kernel(event); return ERR_PTR(-EINVAL); } + + event->sample_disable = >perf_sample_disable; return event; } diff --git a/kernel/events/core.c b/kernel/events/core.c index b11756f..f6ef45c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event 
*event, irq_work_queue(>pending); } + if ((event->sample_disable) && atomic_read(event->sample_disable)) + return ret; + if (event->overflow_handler) event->overflow_handler(event, data, regs); else -- 1.8.3.4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 2/2] bpf: Implement bpf_perf_event_sample_enable/disable() helpers
The functions bpf_perf_event_sample_enable/disable() can set the flag sample_disable to enable/disable output trace data on samples. Signed-off-by: Kaixu Xia <xiaka...@huawei.com> --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/verifier.c| 4 +++- kernel/trace/bpf_trace.c | 34 ++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 25e073d..09148ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -192,6 +192,8 @@ extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_perf_event_read_proto; +extern const struct bpf_func_proto bpf_perf_event_sample_enable_proto; +extern const struct bpf_func_proto bpf_perf_event_sample_disable_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 92a48e2..5229c550 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -272,6 +272,8 @@ enum bpf_func_id { BPF_FUNC_skb_get_tunnel_key, BPF_FUNC_skb_set_tunnel_key, BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(, index) */ + BPF_FUNC_perf_event_sample_enable, /* u64 bpf_perf_event_enable() */ + BPF_FUNC_perf_event_sample_disable, /* u64 bpf_perf_event_disable() */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b074b23..6428daf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -244,6 +244,8 @@ static const struct { } func_limit[] = { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_sample_enable}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_sample_disable}, }; static void 
print_verifier_state(struct verifier_env *env) @@ -860,7 +862,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_map != bool_func) + if (bool_func && bool_map != bool_func) return -EINVAL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0fe96c7..abe943a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -215,6 +215,36 @@ const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +static u64 bpf_perf_event_sample_enable(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + + atomic_set(>perf_sample_disable, 0); + return 0; +} + +const struct bpf_func_proto bpf_perf_event_sample_enable_proto = { + .func = bpf_perf_event_sample_enable, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, +}; + +static u64 bpf_perf_event_sample_disable(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + + atomic_set(>perf_sample_disable, 1); + return 0; +} + +const struct bpf_func_proto bpf_perf_event_sample_disable_proto = { + .func = bpf_perf_event_sample_disable, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -242,6 +272,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return _get_smp_processor_id_proto; case BPF_FUNC_perf_event_read: return _perf_event_read_proto; + case BPF_FUNC_perf_event_sample_enable: + return _perf_event_sample_enable_proto; + case BPF_FUNC_perf_event_sample_disable: + return _perf_event_sample_disable_proto; default: return NULL; } -- 1.8.3.4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next] bpf: fix build warnings and add function read_trace_pipe()
There are two improvements in this patch: 1. Fix the build warnings; 2. Add function read_trace_pipe() to print the result on the screen; Before this patch, we can get the result through /sys/kernel/de bug/tracing/trace_pipe and get nothing on the screen. By applying this patch, the result can be printed on the screen. $ ./tracex6 ... tracex6-705 [003] d..1 131.428593: : CPU-3 19981414 sshd-683 [000] d..1 131.428727: : CPU-0 221682321 sshd-683 [000] d..1 131.428821: : CPU-0 221808766 sshd-683 [000] d..1 131.428950: : CPU-0 221982984 sshd-683 [000] d..1 131.429045: : CPU-0 222111851 tracex6-705 [003] d..1 131.429168: : CPU-3 20757551 sshd-683 [000] d..1 131.429170: : CPU-0 81240 sshd-683 [000] d..1 131.429261: : CPU-0 222403340 sshd-683 [000] d..1 131.429378: : CPU-0 222561024 ... Signed-off-by: Kaixu Xia xiaka...@huawei.com --- samples/bpf/tracex6_kern.c | 1 + samples/bpf/tracex6_user.c | 22 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index 23d1cff..be479c4 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -1,3 +1,4 @@ +#include linux/ptrace.h #include linux/version.h #include uapi/linux/bpf.h #include bpf_helpers.h diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index 928f05e..8ea4976 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -17,8 +17,7 @@ static void test_bpf_perf_event(void) { int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); int *pmu_fd = malloc(nr_cpus * sizeof(int)); - unsigned long value; - int i; + int status, i; struct perf_event_attr attr_insn_pmu = { .freq = 0, @@ -32,22 +31,26 @@ static void test_bpf_perf_event(void) for (i = 0; i nr_cpus; i++) { pmu_fd[i] = perf_event_open(attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0); - if (pmu_fd[i] 0) + if (pmu_fd[i] 0) { printf(event syscall failed\n); + goto exit; + } bpf_update_elem(map_fd[0], i, pmu_fd[i], BPF_ANY); ioctl(pmu_fd[i], 
PERF_EVENT_IOC_ENABLE, 0); } - system(ls); - system(pwd); - system(sleep 2); + status = system(ls /dev/null); + if (status) + goto exit; + status = system(sleep 2); + if (status) + goto exit; +exit: for (i = 0; i nr_cpus; i++) close(pmu_fd[i]); - - close(map_fd); - + close(map_fd[0]); free(pmu_fd); } @@ -63,6 +66,7 @@ int main(int argc, char **argv) } test_bpf_perf_event(); + read_trace_pipe(); return 0; } -- 1.8.3.4 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 net-next] bpf: s390: Fix build error caused by the struct bpf_array member name changed
There is a build error that 'struct bpf_array' has no member named 'prog' on s390. In commit 2a36f0b, the member 'prog' of struct bpf_array is replaced by 'ptrs'. So this patch fixes it. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- arch/s390/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 9f4bbc0..eeda051 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1032,7 +1032,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i MAX_TAIL_CALL_CNT, 0, 0x2); /* -* prog = array-prog[index]; +* prog = array-ptrs[index]; * if (prog == NULL) * goto out; */ @@ -1041,7 +1041,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT6_DISP_LH(0xeb00, 0x000d, REG_1, BPF_REG_3, REG_0, 3); /* lg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe300, 0x0004, REG_1, BPF_REG_2, - REG_1, offsetof(struct bpf_array, prog)); + REG_1, offsetof(struct bpf_array, ptrs)); /* clgij %r1,0,0x8,label0 */ EMIT6_PCREL_IMM_LABEL(0xec00, 0x007d, REG_1, 0, 0, 0x8); -- 1.8.3.4 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 net-next] bpf: s390: Fix build error caused by the struct bpf_array member name changed
There is a build error that 'struct bpf_array' has no member named 'prog' on s390. In commit 2a36f0b92eb6 (bpf: Make the bpf_prog_array_map more generic), the member 'prog' of struct bpf_array is replaced by 'ptrs'. So this patch fixes it. Fixes: 2a36f0b92eb6 (bpf: Make the bpf_prog_array_map more generic) Reported-by: Wu Fengguang fengguang...@intel.com Signed-off-by: Kaixu Xia xiaka...@huawei.com --- arch/s390/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 9f4bbc0..eeda051 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1032,7 +1032,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i MAX_TAIL_CALL_CNT, 0, 0x2); /* -* prog = array-prog[index]; +* prog = array-ptrs[index]; * if (prog == NULL) * goto out; */ @@ -1041,7 +1041,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT6_DISP_LH(0xeb00, 0x000d, REG_1, BPF_REG_3, REG_0, 3); /* lg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe300, 0x0004, REG_1, BPF_REG_2, - REG_1, offsetof(struct bpf_array, prog)); + REG_1, offsetof(struct bpf_array, ptrs)); /* clgij %r1,0,0x8,label0 */ EMIT6_PCREL_IMM_LABEL(0xec00, 0x007d, REG_1, 0, 0, 0x8); -- 1.8.3.4 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next] bpf: fix the bug 'struct bpf_array' has no member named 'prog' in s390 architecture
'Kbuild test robot' sent me an email about a build error: 'struct bpf_array' has no member named 'prog' on the s390 architecture. This error is caused by commit 2a36f0b92eb638dd023870574eb471b1c56be9ad [656/692] bpf: Make the bpf_prog_array_map more generic. In that commit, the member 'prog' of struct bpf_array was replaced by 'ptrs'. This patch fixes the s390 JIT accordingly. --- arch/s390/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 9f4bbc0..eeda051 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1032,7 +1032,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i MAX_TAIL_CALL_CNT, 0, 0x2); /* -* prog = array->prog[index]; +* prog = array->ptrs[index]; * if (prog == NULL) * goto out; */ @@ -1041,7 +1041,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT6_DISP_LH(0xeb00, 0x000d, REG_1, BPF_REG_3, REG_0, 3); /* lg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe300, 0x0004, REG_1, BPF_REG_2, - REG_1, offsetof(struct bpf_array, prog)); + REG_1, offsetof(struct bpf_array, ptrs)); /* clgij %r1,0,0x8,label0 */ EMIT6_PCREL_IMM_LABEL(0xec00, 0x007d, REG_1, 0, 0, 0x8); -- 1.8.3.4 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v7 2/5] bpf: Make the bpf_prog_array_map more generic
From: Wang Nan wangn...@huawei.com All the map backends are of generic nature. In order to avoid adding much special code into the eBPF core, rewrite part of the bpf_prog_array map code and make it more generic. So the new perf_event_array map type can reuse most of code with bpf_prog_array map and add fewer lines of special code. Signed-off-by: Wang Nan wangn...@huawei.com Signed-off-by: Kaixu Xia xiaka...@huawei.com --- arch/x86/net/bpf_jit_comp.c | 6 ++-- include/linux/bpf.h | 8 +++-- kernel/bpf/arraymap.c | 80 +++-- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c| 2 +- 5 files changed, 60 insertions(+), 38 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ec5214f..70efcd0 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog) * goto out; * if (++tail_call_cnt MAX_TAIL_CALL_CNT) * goto out; - * prog = array-prog[index]; + * prog = array-ptrs[index]; * if (prog == NULL) * goto out; * goto *(prog-bpf_func + prologue_size); @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ - /* prog = array-prog[index]; */ + /* prog = array-ptrs[index]; */ EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ - offsetof(struct bpf_array, prog)); + offsetof(struct bpf_array, ptrs)); EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 139d6d2..d495211 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -24,6 +24,10 @@ struct bpf_map_ops { void *(*map_lookup_elem)(struct bpf_map *map, void *key); int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_map *map, void *key); + + /* funcs called by prog_array and perf_event_array map */ + void *(*map_fd_get_ptr) 
(struct bpf_map *map, int fd); + void (*map_fd_put_ptr) (void *ptr); }; struct bpf_map { @@ -142,13 +146,13 @@ struct bpf_array { bool owner_jited; union { char value[0] __aligned(8); - struct bpf_prog *prog[0] __aligned(8); + void *ptrs[0] __aligned(8); }; }; #define MAX_TAIL_CALL_CNT 32 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); -void bpf_prog_array_map_clear(struct bpf_map *map); +void bpf_fd_array_map_clear(struct bpf_map *map); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cb31229..45df657 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -150,15 +150,15 @@ static int __init register_array_map(void) } late_initcall(register_array_map); -static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { - /* only bpf_prog file descriptors can be stored in prog_array map */ + /* only file descriptors can be stored in this type of map */ if (attr-value_size != sizeof(u32)) return ERR_PTR(-EINVAL); return array_map_alloc(attr); } -static void prog_array_map_free(struct bpf_map *map) +static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map) /* make sure it's empty */ for (i = 0; i array-map.max_entries; i++) - BUG_ON(array-prog[i] != NULL); + BUG_ON(array-ptrs[i] != NULL); kvfree(array); } -static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return NULL; } /* only called from syscall */ -static int prog_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static int fd_array_map_update_elem(struct bpf_map *map, void *key, + void *value, 
u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *prog, *old_prog; + void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) @@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - prog
[PATCH v7 5/5] samples/bpf: example of get selected PMU counter value
This is a simple example and shows how to use the new ability to get the selected Hardware PMU counter value. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- samples/bpf/Makefile | 4 +++ samples/bpf/bpf_helpers.h | 2 ++ samples/bpf/tracex6_kern.c | 26 ++ samples/bpf/tracex6_user.c | 68 ++ 4 files changed, 100 insertions(+) create mode 100644 samples/bpf/tracex6_kern.c create mode 100644 samples/bpf/tracex6_user.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4450fed..63e7d50 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -12,6 +12,7 @@ hostprogs-y += tracex2 hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 +hostprogs-y += tracex6 hostprogs-y += lathist test_verifier-objs := test_verifier.o libbpf.o @@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o tracex3-objs := bpf_load.o libbpf.o tracex3_user.o tracex4-objs := bpf_load.o libbpf.o tracex4_user.o tracex5-objs := bpf_load.o libbpf.o tracex5_user.o +tracex6-objs := bpf_load.o libbpf.o tracex6_user.o lathist-objs := bpf_load.o libbpf.o lathist_user.o # Tell kbuild to always build the programs @@ -37,6 +39,7 @@ always += tracex2_kern.o always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o +always += tracex6_kern.o always += tcbpf1_kern.o always += lathist_kern.o @@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf +HOSTLOADLIBES_tracex6 += -lelf HOSTLOADLIBES_lathist += -lelf # point this to your LLVM backend with bpf support diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index c77c872..3a44d3a 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) = (void *) BPF_FUNC_get_current_uid_gid; static int (*bpf_get_current_comm)(void *buf, int buf_size) = (void *) BPF_FUNC_get_current_comm; +static int 
(*bpf_perf_event_read)(void *map, int index) = + (void *) BPF_FUNC_perf_event_read; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c new file mode 100644 index 000..23d1cff --- /dev/null +++ b/samples/bpf/tracex6_kern.c @@ -0,0 +1,26 @@ +#include linux/version.h +#include uapi/linux/bpf.h +#include bpf_helpers.h + +struct bpf_map_def SEC(maps) my_map = { + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(u32), + .max_entries = 32, +}; + +SEC(kprobe/sys_write) +int bpf_prog1(struct pt_regs *ctx) +{ + u64 count; + u32 key = bpf_get_smp_processor_id(); + char fmt[] = CPU-%d %llu\n; + + count = bpf_perf_event_read(my_map, key); + bpf_trace_printk(fmt, sizeof(fmt), key, count); + + return 0; +} + +char _license[] SEC(license) = GPL; +u32 _version SEC(version) = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c new file mode 100644 index 000..928f05e --- /dev/null +++ b/samples/bpf/tracex6_user.c @@ -0,0 +1,68 @@ +#include stdio.h +#include unistd.h +#include stdlib.h +#include stdbool.h +#include string.h +#include fcntl.h +#include poll.h +#include sys/ioctl.h +#include linux/perf_event.h +#include linux/bpf.h +#include libbpf.h +#include bpf_load.h + +#define SAMPLE_PERIOD 0x7fffULL + +static void test_bpf_perf_event(void) +{ + int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + int *pmu_fd = malloc(nr_cpus * sizeof(int)); + unsigned long value; + int i; + + struct perf_event_attr attr_insn_pmu = { + .freq = 0, + .sample_period = SAMPLE_PERIOD, + .inherit = 0, + .type = PERF_TYPE_HARDWARE, + .read_format = 0, + .sample_type = 0, + .config = 0,/* PMU: cycles */ + }; + + for (i = 0; i nr_cpus; i++) { + pmu_fd[i] = perf_event_open(attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0); + if (pmu_fd[i] 0) + printf(event syscall failed\n); + + bpf_update_elem(map_fd[0], i, 
pmu_fd[i], BPF_ANY); + ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0); + } + + system(ls); + system(pwd); + system(sleep 2); + + for (i = 0; i nr_cpus; i++) + close(pmu_fd[i]); + + close(map_fd); + + free(pmu_fd); +} + +int main(int argc, char **argv) +{ + char filename[256]; + + snprintf(filename, sizeof(filename), %s_kern.o, argv[0]); + + if (load_bpf_file(filename)) { + printf(%s, bpf_log_buf); + return 1; + } + + test_bpf_perf_event(); + + return 0; +} -- 1.8.3.4
[PATCH v7 1/5] perf: add the necessary core perf APIs when accessing events counters in eBPF programs
This patch add three core perf APIs: - perf_event_attrs(): export the struct perf_event_attr from struct perf_event; - perf_event_get(): get the struct perf_event from the given fd; - perf_event_read_local(): read the events counters active on the current CPU; These APIs are needed when accessing events counters in eBPF programs. The API perf_event_read_local() comes from Peter and I add the corresponding SOB. Signed-off-by: Kaixu Xia xiaka...@huawei.com Signed-off-by: Peter Zijlstra a.p.zijls...@chello.nl --- include/linux/perf_event.h | 10 ++ kernel/events/core.c | 78 ++ 2 files changed, 88 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2027809..092a0e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); +extern struct perf_event *perf_event_get(unsigned int fd); +extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); @@ -659,6 +661,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); +extern u64 perf_event_read_local(struct perf_event *event); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -979,6 +982,12 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task){ } +static inline struct perf_event 
*perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) +{ + return ERR_PTR(-EINVAL); +} +static inline u64 perf_event_read_local(struct perf_event *event) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } @@ -1011,6 +1020,7 @@ static inline void perf_event_enable(struct perf_event *event){ } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } +static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) defined(CONFIG_NO_HZ_FULL) diff --git a/kernel/events/core.c b/kernel/events/core.c index d3dae34..e2c6a88 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3212,6 +3212,59 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } +/* + * NMI-safe method to read a local event, that is an event that + * is: + * - either for the current task, or for this CPU + * - does not have inherit set, for inherited task events + * will not be local and we cannot read them atomically + * - must not have a pmu::count method + */ +u64 perf_event_read_local(struct perf_event *event) +{ + unsigned long flags; + u64 val; + + /* +* Disabling interrupts avoids all counter scheduling (context +* switches, timer based rotation and IPIs). 
+*/ + local_irq_save(flags); + + /* If this is a per-task event, it must be for current */ + WARN_ON_ONCE((event-attach_state PERF_ATTACH_TASK) +event-hw.target != current); + + /* If this is a per-CPU event, it must be for this CPU */ + WARN_ON_ONCE(!(event-attach_state PERF_ATTACH_TASK) +event-cpu != smp_processor_id()); + + /* +* It must not be an event with inherit set, we cannot read +* all child counters from atomic context. +*/ + WARN_ON_ONCE(event-attr.inherit); + + /* +* It must not have a pmu::count method, those are not +* NMI safe. +*/ + WARN_ON_ONCE(event-pmu-count); + + /* +* If the event is currently on this CPU, its either a per-task event, +* or local to this CPU. Furthermore it means its ACTIVE (otherwise
[PATCH v7 4/5] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter
According to the perf_event_map_fd and index, the function bpf_perf_event_read() can convert the corresponding map value to the pointer to struct perf_event and return the Hardware PMU counter value. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c| 48 +--- kernel/trace/bpf_trace.c | 31 +++ 4 files changed, 66 insertions(+), 15 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fc1f40..f57d7fe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -190,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_perf_event_read_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1814e8..92a48e2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -271,6 +271,7 @@ enum bpf_func_id { */ BPF_FUNC_skb_get_tunnel_key, BPF_FUNC_skb_set_tunnel_key, + BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(map, index) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd307df..48e1c71 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,14 @@ static const char * const reg_type_str[] = { [CONST_IMM] = imm, }; +static const struct { + int map_type; + int func_id; +} func_limit[] = { + {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, +}; + static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -837,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return err; } +static int 
check_map_func_compatibility(struct bpf_map *map, int func_id) +{ + bool bool_map, bool_func; + int i; + + if (!map) + return 0; + + for (i = 0; i = ARRAY_SIZE(func_limit); i++) { + bool_map = (map-map_type == func_limit[i].map_type); + bool_func = (func_id == func_limit[i].func_id); + /* only when map func pair match it can continue. +* don't allow any other map type to be passed into +* the special func; +*/ + if (bool_map != bool_func) + return -EINVAL; + } + + return 0; +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = env-cur_state; @@ -912,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } - if (map map-map_type == BPF_MAP_TYPE_PROG_ARRAY - func_id != BPF_FUNC_tail_call) - /* prog_array map type needs extra care: -* only allow to pass it into bpf_tail_call() for now. -* bpf_map_delete_elem() can be allowed in the future, -* while bpf_map_update_elem() must only be done via syscall -*/ - return -EINVAL; - - if (func_id == BPF_FUNC_tail_call - map-map_type != BPF_MAP_TYPE_PROG_ARRAY) - /* don't allow any other map type to be passed into -* bpf_tail_call() -*/ - return -EINVAL; + err = check_map_func_compatibility(map, func_id); + if (err) + return err; return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a041a..ef9936d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -158,6 +158,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return bpf_trace_printk_proto; } +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if (unlikely(index = array-map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array-ptrs[index]; + if (!event) + return -ENOENT; + + /* +* we don't know if the function is run successfully 
by the +* return value. It can be judged in other places, such as +* eBPF programs. +*/ + return perf_event_read_local(event); +} + +const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read
[PATCH v7 0/5] bpf: Introduce the new ability of eBPF programs to access hardware PMU counter
This patchset is based on the net-next: git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git commit 9dc20a649609c95ce7c5ac4282656ba627b67d49. Previous patch v6 url: https://lkml.org/lkml/2015/8/4/188 changes in V7: - rebase the whole patch set to net-next tree(9dc20a64); - split out the core perf APIs into Patch 1/5; - change the return type of function perf_event_attrs() from struct perf_event_attr * to const struct perf_event_attr * in Patch 1/5; - rename the function perf_event_read_internal() to perf_event_read_local() and rewrite it in Patch 1/5; - rename the function check_func_limit() to check_map_func_compatibility() and remove the unnecessary passing of a pointer to a pointer in Patch 4/5; changes in V6: - make the Patch 1/4 commit message more meaningful and readable; - remove the unnecessary comment in Patch 2/4 and make it clean; - declare the function perf_event_release_kernel() in include/linux/perf_event.h to fix the build error when CONFIG_PERF_EVENTS isn't configured in Patch 2/4; - add function perf_event_attrs() to get the struct perf_event_attr in Patch 2/4.
- move the related code from kernel/trace/bpf_trace.c to kernel/ events/core.c and add function perf_event_read_internal() to avoid poking inside of the event outside of perf code in Patch 3/4; - generial the func map match-pair with an array in Patch 3/4; changes in V5: - move struct fd_array_map_ops* fd_ops to bpf_map; - move array perf event decrement refcnt function to map_free; - fix the NULL ptr of perf_event_get(); - move bpf_perf_event_read() to kernel/bpf/bpf_trace.c; - get rid of the remaining struct bpf_prog; - move the unnecessay cast on void *; changes in V4: - make the bpf_prog_array_map more generic; - fix the bug of event refcnt leak; - use more useful errno in bpf_perf_event_read(); changes in V3: - collapse V2 patches 1-3 into one; - drop the function map-ops-map_traverse_elem() and release the struct perf_event in map_free; - only allow to access bpf_perf_event_read() from programs; - update the perf_event_array_map elem via xchg(); - pass index directly to bpf_perf_event_read() instead of MAP_KEY; changes in V2: - put atomic_long_inc_not_zero() between fdget() and fdput(); - limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE; - Only read the event counter on current CPU or on current process; - add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the pointer to the struct perf_event; - according to the perf_event_map_fd and key, the function bpf_perf_event_read() can get the Hardware PMU counter value; Patch 5/5 is a simple example and shows how to use this new eBPF programs ability. The PMU counter data can be found in /sys/kernel/debug/tracing/trace(trace_pipe).(the cycles PMU value when 'kprobe/sys_write' sampling) $ cat /sys/kernel/debug/tracing/trace_pipe $ ./tracex6 ... 
syslog-ng-548 [000] d..176.905673: : CPU-0 681765271 syslog-ng-548 [000] d..176.905690: : CPU-0 681787855 syslog-ng-548 [000] d..176.905707: : CPU-0 681810504 syslog-ng-548 [000] d..176.905725: : CPU-0 681834771 syslog-ng-548 [000] d..176.905745: : CPU-0 681859519 syslog-ng-548 [000] d..176.905766: : CPU-0 681890419 syslog-ng-548 [000] d..176.905783: : CPU-0 681914045 syslog-ng-548 [000] d..176.905800: : CPU-0 681935950 syslog-ng-548 [000] d..176.905816: : CPU-0 681958299 ls-690 [005] d..182.241308: : CPU-5 3138451 sh-691 [004] d..182.244570: : CPU-4 7324988 ...-699 [007] d..199.961387: : CPU-7 3194027 ...-695 [003] d..199.961474: : CPU-3 288901 ...-695 [003] d..199.961541: : CPU-3 383145 ...-695 [003] d..199.961591: : CPU-3 450365 ...-695 [003] d..199.961639: : CPU-3 515751 ...-695 [003] d..199.961686: : CPU-3 579047 ... The detail of patches is as follow: Patch 1/5 add the necessary core perf APIs perf_event_attrs(), perf_event_get(),perf_event_read_local() when accessing events counters in eBPF programs Patch 2/5 rewrites part of the bpf_prog_array map code and make it more generic; Patch 3/5 introduces a new bpf map type. This map only stores the pointer to struct perf_event; Patch 4/5 implements function bpf_perf_event_read() that get the selected hardware PMU conuter; Patch 5/5 gives a simple example. Kaixu Xia (4): perf: add the necessary core perf APIs when accessing events counters in eBPF programs bpf: Add new bpf map type to store the pointer to struct perf_event bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter samples/bpf: example of get selected PMU counter value Wang Nan (1): bpf: Make the bpf_prog_array_map more generic arch/x86/net/bpf_jit_comp.c | 6
[PATCH v7 3/5] bpf: Add new bpf map type to store the pointer to struct perf_event
Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'. This map only stores the pointer to struct perf_event. The user space event FDs from perf_event_open() syscall are converted to the pointer to struct perf_event and stored in map. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c| 57 3 files changed, 59 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d495211..4fc1f40 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -10,6 +10,7 @@ #include uapi/linux/bpf.h #include linux/workqueue.h #include linux/file.h +#include linux/perf_event.h struct bpf_map; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2ce13c1..a1814e8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -114,6 +114,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 45df657..29ace10 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -273,3 +273,60 @@ static int __init register_prog_array_map(void) return 0; } late_initcall(register_prog_array_map); + +static void perf_event_array_map_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct perf_event *event; + const struct perf_event_attr *attr; + + event = perf_event_get(fd); + if (IS_ERR(event)) + return event; + + attr = perf_event_attrs(event); + if (IS_ERR(attr)) + return (void *)attr; + + if (attr-type != PERF_TYPE_RAW + attr-type != PERF_TYPE_HARDWARE) { + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); + } + return event; +} + +static void perf_event_fd_array_put_ptr(void *ptr) +{ + struct perf_event *event = ptr; + + perf_event_release_kernel(event); +} + +static const struct 
bpf_map_ops perf_event_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = perf_event_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = perf_event_fd_array_get_ptr, + .map_fd_put_ptr = perf_event_fd_array_put_ptr, +}; + +static struct bpf_map_type_list perf_event_array_type __read_mostly = { + .ops = perf_event_array_ops, + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, +}; + +static int __init register_perf_event_array_map(void) +{ + bpf_register_map_type(perf_event_array_type); + return 0; +} +late_initcall(register_perf_event_array_map); -- 1.8.3.4 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 2/4] bpf: Add new bpf map type to store the pointer to struct perf_event
Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'. This map only stores the pointer to struct perf_event. The user space event FDs from perf_event_open() syscall are converted to the pointer to struct perf_event and stored in map. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- include/linux/bpf.h| 1 + include/linux/perf_event.h | 8 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 57 ++ kernel/events/core.c | 25 5 files changed, 92 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a8ce262..d0b394a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -10,6 +10,7 @@ #include uapi/linux/bpf.h #include linux/workqueue.h #include linux/file.h +#include linux/perf_event.h struct bpf_map; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2027809..81fc99e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); +extern struct perf_event *perf_event_get(unsigned int fd); +extern struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); @@ -979,6 +981,11 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task){ } +static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline struct perf_event_attr *perf_event_attrs(struct perf_event *event) +{ + return ERR_PTR(-EINVAL); +} static inline void 
perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } @@ -1011,6 +1018,7 @@ static inline void perf_event_enable(struct perf_event *event){ } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } +static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) defined(CONFIG_NO_HZ_FULL) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 29ef6f9..69a1f6b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -114,6 +114,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 45df657..b1e98ff 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -273,3 +273,60 @@ static int __init register_prog_array_map(void) return 0; } late_initcall(register_prog_array_map); + +static void perf_event_array_map_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct perf_event *event; + struct perf_event_attr *attr; + + event = perf_event_get(fd); + if (IS_ERR(event)) + return event; + + attr = perf_event_attrs(event); + if (IS_ERR(attr)) + return attr; + + if (attr-type != PERF_TYPE_RAW + attr-type != PERF_TYPE_HARDWARE) { + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); + } + return event; +} + +static void perf_event_fd_array_put_ptr(void *ptr) +{ + struct perf_event *event = ptr; + + perf_event_release_kernel(event); +} + +static const struct bpf_map_ops perf_event_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = 
perf_event_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = perf_event_fd_array_get_ptr, + .map_fd_put_ptr = perf_event_fd_array_put_ptr, +}; + +static struct bpf_map_type_list perf_event_array_type __read_mostly = { + .ops = perf_event_array_ops, + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY
[PATCH v6 3/4] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter
According to the perf_event_map_fd and index, the function bpf_perf_event_read() can convert the corresponding map value to the pointer to struct perf_event and return the Hardware PMU counter value. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- include/linux/bpf.h| 1 + include/linux/perf_event.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 49 -- kernel/events/core.c | 19 ++ kernel/trace/bpf_trace.c | 31 + 6 files changed, 88 insertions(+), 15 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0b394a..db9f781 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -190,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_perf_event_read_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 81fc99e..6f1e448 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -643,6 +643,7 @@ extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct perf_event *perf_event_get(unsigned int fd); extern struct perf_event_attr *perf_event_attrs(struct perf_event *event); +extern u64 perf_event_read_internal(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); @@ -986,6 +987,7 @@ static inline struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } +static inline u64 perf_event_read_internal(struct perf_event *event) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline 
int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69a1f6b..b9b13ce 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -250,6 +250,7 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_get_current_comm, + BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(map, index) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 039d866..45fae14 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,14 @@ static const char * const reg_type_str[] = { [CONST_IMM] = imm, }; +static const struct { + int map_type; + int func_id; +} func_limit[] = { + {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, +}; + static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -833,6 +841,29 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return err; } +static int check_func_limit(struct bpf_map **mapp, int func_id) +{ + struct bpf_map *map = *mapp; + bool bool_map, bool_func; + int i; + + if (!map) + return 0; + + for (i = 0; i = ARRAY_SIZE(func_limit); i++) { + bool_map = (map-map_type == func_limit[i].map_type); + bool_func = (func_id == func_limit[i].func_id); + /* only when map func pair match it can continue. +* don't allow any other map type to be passed into +* the special func; +*/ + if (bool_map != bool_func) + return -EINVAL; + } + + return 0; +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = env-cur_state; @@ -908,21 +939,9 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } - if (map map-map_type == BPF_MAP_TYPE_PROG_ARRAY - func_id != BPF_FUNC_tail_call) - /* prog_array map type needs extra care: -* only allow to pass it into bpf_tail_call() for now. 
-* bpf_map_delete_elem() can be allowed in the future, -* while bpf_map_update_elem() must only be done via syscall -*/ - return -EINVAL; - - if (func_id == BPF_FUNC_tail_call - map-map_type != BPF_MAP_TYPE_PROG_ARRAY) - /* don't allow any other map type to be passed into -* bpf_tail_call
[PATCH v6 0/4] bpf: Introduce the new ability of eBPF programs to access hardware PMU counter
Previous patch v5 url: https://lkml.org/lkml/2015/7/31/299 changes in V6: - make the Patch 1/4 commit message more meaningful and readable; - remove the unnecessary comment in Patch 2/4 and make it clean; - declare the function perf_event_release_kernel() in include/linux/perf_event.h to fix the build error when CONFIG_PERF_EVENTS isn't configured in Patch 2/4; - add function perf_event_attrs() to get the struct perf_event_attr in Patch 2/4; - move the related code from kernel/trace/bpf_trace.c to kernel/events/core.c and add function perf_event_read_internal() to avoid poking inside of the event outside of perf code in Patch 3/4; - generalize the func/map match-pair with an array in Patch 3/4; changes in V5: - move struct fd_array_map_ops* fd_ops to bpf_map; - move array perf event decrement refcnt function to map_free; - fix the NULL ptr of perf_event_get(); - move bpf_perf_event_read() to kernel/bpf/bpf_trace.c; - get rid of the remaining struct bpf_prog; - remove the unnecessary cast on void *; changes in V4: - make the bpf_prog_array_map more generic; - fix the bug of event refcnt leak; - use more useful errno in bpf_perf_event_read(); changes in V3: - collapse V2 patches 1-3 into one; - drop the function map->ops->map_traverse_elem() and release the struct perf_event in map_free; - only allow to access bpf_perf_event_read() from programs; - update the perf_event_array_map elem via xchg(); - pass index directly to bpf_perf_event_read() instead of MAP_KEY; changes in V2: - put atomic_long_inc_not_zero() between fdget() and fdput(); - limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE; - Only read the event counter on current CPU or on current process; - add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the pointer to the struct perf_event; - according to the perf_event_map_fd and key, the function bpf_perf_event_read() can get the Hardware PMU counter value; Patch 4/4 is a simple example that shows how to use this new eBPF program ability.
The PMU counter data can be found in /sys/kernel/debug/tracing/trace(trace_pipe).(the cycles PMU value when 'kprobe/sys_write' sampling) $ cat /sys/kernel/debug/tracing/trace_pipe $ ./tracex6 ... syslog-ng-548 [000] d..176.905673: : CPU-0 681765271 syslog-ng-548 [000] d..176.905690: : CPU-0 681787855 syslog-ng-548 [000] d..176.905707: : CPU-0 681810504 syslog-ng-548 [000] d..176.905725: : CPU-0 681834771 syslog-ng-548 [000] d..176.905745: : CPU-0 681859519 syslog-ng-548 [000] d..176.905766: : CPU-0 681890419 syslog-ng-548 [000] d..176.905783: : CPU-0 681914045 syslog-ng-548 [000] d..176.905800: : CPU-0 681935950 syslog-ng-548 [000] d..176.905816: : CPU-0 681958299 ls-690 [005] d..182.241308: : CPU-5 3138451 sh-691 [004] d..182.244570: : CPU-4 7324988 ...-699 [007] d..199.961387: : CPU-7 3194027 ...-695 [003] d..199.961474: : CPU-3 288901 ...-695 [003] d..199.961541: : CPU-3 383145 ...-695 [003] d..199.961591: : CPU-3 450365 ...-695 [003] d..199.961639: : CPU-3 515751 ...-695 [003] d..199.961686: : CPU-3 579047 ... The detail of patches is as follow: Patch 1/4 rewrites part of the bpf_prog_array map code and make it more generic; Patch 2/4 introduces a new bpf map type. This map only stores the pointer to struct perf_event; Patch 3/4 implements function bpf_perf_event_read() that get the selected hardware PMU conuter; Patch 4/4 gives a simple example. 
Kaixu Xia (3): bpf: Add new bpf map type to store the pointer to struct perf_event bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter samples/bpf: example of get selected PMU counter value Wang Nan (1): bpf: Make the bpf_prog_array_map more generic arch/x86/net/bpf_jit_comp.c | 6 +- include/linux/bpf.h | 10 +++- include/linux/perf_event.h | 10 include/uapi/linux/bpf.h| 2 + kernel/bpf/arraymap.c | 137 ++-- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c| 2 +- kernel/bpf/verifier.c | 49 +++- kernel/events/core.c| 44 ++ kernel/trace/bpf_trace.c| 31 ++ samples/bpf/Makefile| 4 ++ samples/bpf/bpf_helpers.h | 2 + samples/bpf/tracex6_kern.c | 26 + samples/bpf/tracex6_user.c | 68 ++ 14 files changed, 340 insertions(+), 53 deletions(-) create mode 100644 samples/bpf/tracex6_kern.c create mode 100644 samples/bpf/tracex6_user.c -- 1.8.3.4 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http
[PATCH v6 4/4] samples/bpf: example of get selected PMU counter value
This is a simple example and shows how to use the new ability to get the selected Hardware PMU counter value. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- samples/bpf/Makefile | 4 +++ samples/bpf/bpf_helpers.h | 2 ++ samples/bpf/tracex6_kern.c | 26 ++ samples/bpf/tracex6_user.c | 68 ++ 4 files changed, 100 insertions(+) create mode 100644 samples/bpf/tracex6_kern.c create mode 100644 samples/bpf/tracex6_user.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4450fed..63e7d50 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -12,6 +12,7 @@ hostprogs-y += tracex2 hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 +hostprogs-y += tracex6 hostprogs-y += lathist test_verifier-objs := test_verifier.o libbpf.o @@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o tracex3-objs := bpf_load.o libbpf.o tracex3_user.o tracex4-objs := bpf_load.o libbpf.o tracex4_user.o tracex5-objs := bpf_load.o libbpf.o tracex5_user.o +tracex6-objs := bpf_load.o libbpf.o tracex6_user.o lathist-objs := bpf_load.o libbpf.o lathist_user.o # Tell kbuild to always build the programs @@ -37,6 +39,7 @@ always += tracex2_kern.o always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o +always += tracex6_kern.o always += tcbpf1_kern.o always += lathist_kern.o @@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf +HOSTLOADLIBES_tracex6 += -lelf HOSTLOADLIBES_lathist += -lelf # point this to your LLVM backend with bpf support diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index bdf1c16..c8a3594 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) = (void *) BPF_FUNC_get_current_uid_gid; static int (*bpf_get_current_comm)(void *buf, int buf_size) = (void *) BPF_FUNC_get_current_comm; +static int 
(*bpf_perf_event_read)(void *map, int index) = + (void *) BPF_FUNC_perf_event_read; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c new file mode 100644 index 000..23d1cff --- /dev/null +++ b/samples/bpf/tracex6_kern.c @@ -0,0 +1,26 @@ +#include linux/version.h +#include uapi/linux/bpf.h +#include bpf_helpers.h + +struct bpf_map_def SEC(maps) my_map = { + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(u32), + .max_entries = 32, +}; + +SEC(kprobe/sys_write) +int bpf_prog1(struct pt_regs *ctx) +{ + u64 count; + u32 key = bpf_get_smp_processor_id(); + char fmt[] = CPU-%d %llu\n; + + count = bpf_perf_event_read(my_map, key); + bpf_trace_printk(fmt, sizeof(fmt), key, count); + + return 0; +} + +char _license[] SEC(license) = GPL; +u32 _version SEC(version) = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c new file mode 100644 index 000..928f05e --- /dev/null +++ b/samples/bpf/tracex6_user.c @@ -0,0 +1,68 @@ +#include stdio.h +#include unistd.h +#include stdlib.h +#include stdbool.h +#include string.h +#include fcntl.h +#include poll.h +#include sys/ioctl.h +#include linux/perf_event.h +#include linux/bpf.h +#include libbpf.h +#include bpf_load.h + +#define SAMPLE_PERIOD 0x7fffULL + +static void test_bpf_perf_event(void) +{ + int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + int *pmu_fd = malloc(nr_cpus * sizeof(int)); + unsigned long value; + int i; + + struct perf_event_attr attr_insn_pmu = { + .freq = 0, + .sample_period = SAMPLE_PERIOD, + .inherit = 0, + .type = PERF_TYPE_HARDWARE, + .read_format = 0, + .sample_type = 0, + .config = 0,/* PMU: cycles */ + }; + + for (i = 0; i nr_cpus; i++) { + pmu_fd[i] = perf_event_open(attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0); + if (pmu_fd[i] 0) + printf(event syscall failed\n); + + bpf_update_elem(map_fd[0], i, 
pmu_fd[i], BPF_ANY); + ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0); + } + + system(ls); + system(pwd); + system(sleep 2); + + for (i = 0; i nr_cpus; i++) + close(pmu_fd[i]); + + close(map_fd); + + free(pmu_fd); +} + +int main(int argc, char **argv) +{ + char filename[256]; + + snprintf(filename, sizeof(filename), %s_kern.o, argv[0]); + + if (load_bpf_file(filename)) { + printf(%s, bpf_log_buf); + return 1; + } + + test_bpf_perf_event(); + + return 0; +} -- 1.8.3.4
[PATCH v6 1/4] bpf: Make the bpf_prog_array_map more generic
From: Wang Nan wangn...@huawei.com All the map backends are of generic nature. In order to avoid adding much special code into the eBPF core, rewrite part of the bpf_prog_array map code and make it more generic. So the new perf_event_array map type can reuse most of code with bpf_prog_array map and add fewer lines of special code. Signed-off-by: Wang Nan wangn...@huawei.com Signed-off-by: Kaixu Xia xiaka...@huawei.com --- arch/x86/net/bpf_jit_comp.c | 6 ++-- include/linux/bpf.h | 8 +++-- kernel/bpf/arraymap.c | 80 +++-- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c| 2 +- 5 files changed, 60 insertions(+), 38 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 579a8fd..e377f07 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog) * goto out; * if (++tail_call_cnt MAX_TAIL_CALL_CNT) * goto out; - * prog = array-prog[index]; + * prog = array-ptrs[index]; * if (prog == NULL) * goto out; * goto *(prog-bpf_func + prologue_size); @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ - /* prog = array-prog[index]; */ + /* prog = array-ptrs[index]; */ EMIT4(0x48, 0x8D, 0x44, 0xD6);/* lea rax, [rsi + rdx * 8 + 0x50] */ - EMIT1(offsetof(struct bpf_array, prog)); + EMIT1(offsetof(struct bpf_array, ptrs)); EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4383476..a8ce262 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -24,6 +24,10 @@ struct bpf_map_ops { void *(*map_lookup_elem)(struct bpf_map *map, void *key); int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_map *map, void *key); + + /* funcs called by prog_array and perf_event_array map */ + void *(*map_fd_get_ptr) 
(struct bpf_map *map, int fd); + void (*map_fd_put_ptr) (void *ptr); }; struct bpf_map { @@ -142,13 +146,13 @@ struct bpf_array { bool owner_jited; union { char value[0] __aligned(8); - struct bpf_prog *prog[0] __aligned(8); + void *ptrs[0] __aligned(8); }; }; #define MAX_TAIL_CALL_CNT 32 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); -void bpf_prog_array_map_clear(struct bpf_map *map); +void bpf_fd_array_map_clear(struct bpf_map *map); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cb31229..45df657 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -150,15 +150,15 @@ static int __init register_array_map(void) } late_initcall(register_array_map); -static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { - /* only bpf_prog file descriptors can be stored in prog_array map */ + /* only file descriptors can be stored in this type of map */ if (attr-value_size != sizeof(u32)) return ERR_PTR(-EINVAL); return array_map_alloc(attr); } -static void prog_array_map_free(struct bpf_map *map) +static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map) /* make sure it's empty */ for (i = 0; i array-map.max_entries; i++) - BUG_ON(array-prog[i] != NULL); + BUG_ON(array-ptrs[i] != NULL); kvfree(array); } -static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return NULL; } /* only called from syscall */ -static int prog_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static int fd_array_map_update_elem(struct bpf_map *map, void *key, + void *value, 
u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *prog, *old_prog; + void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) @@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - prog = bpf_prog_get(ufd
[PATCH v5 0/4] bpf: Introduce the new ability of eBPF programs to access hardware PMU counter
Previous patch v4 url: https://lkml.org/lkml/2015/7/28/432 changes in V5: - move struct fd_array_map_ops* fd_ops to bpf_map; - move array perf event decrement refcnt function to map_free; - fix the NULL ptr of perf_event_get(); - move bpf_perf_event_read() to kernel/trace/bpf_trace.c; - get rid of the remaining struct bpf_prog; - move the unnecessary cast on void *; changes in V4: - make the bpf_prog_array_map more generic; - fix the bug of event refcnt leak; - use more useful errno in bpf_perf_event_read(); changes in V3: - collapse V2 patches 1-3 into one; - drop the function map->ops->map_traverse_elem() and release the struct perf_event in map_free; - only allow to access bpf_perf_event_read() from programs; - update the perf_event_array_map elem via xchg(); - pass index directly to bpf_perf_event_read() instead of MAP_KEY; changes in V2: - put atomic_long_inc_not_zero() between fdget() and fdput(); - limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE; - Only read the event counter on current CPU or on current process; - add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the pointer to the struct perf_event; - according to the perf_event_map_fd and key, the function bpf_perf_event_read() can get the Hardware PMU counter value; Patch 4/4 is a simple example that shows how to use this new ability of eBPF programs. The PMU counter data can be found in /sys/kernel/debug/tracing/trace (or trace_pipe); it shows the cycles PMU value read at each 'kprobe/sys_write' hit: $ cat /sys/kernel/debug/tracing/trace_pipe $ ./tracex6 ... 
syslog-ng-548 [000] d..176.905673: : CPU-0 681765271 syslog-ng-548 [000] d..176.905690: : CPU-0 681787855 syslog-ng-548 [000] d..176.905707: : CPU-0 681810504 syslog-ng-548 [000] d..176.905725: : CPU-0 681834771 syslog-ng-548 [000] d..176.905745: : CPU-0 681859519 syslog-ng-548 [000] d..176.905766: : CPU-0 681890419 syslog-ng-548 [000] d..176.905783: : CPU-0 681914045 syslog-ng-548 [000] d..176.905800: : CPU-0 681935950 syslog-ng-548 [000] d..176.905816: : CPU-0 681958299 ls-690 [005] d..182.241308: : CPU-5 3138451 sh-691 [004] d..182.244570: : CPU-4 7324988 ...-699 [007] d..199.961387: : CPU-7 3194027 ...-695 [003] d..199.961474: : CPU-3 288901 ...-695 [003] d..199.961541: : CPU-3 383145 ...-695 [003] d..199.961591: : CPU-3 450365 ...-695 [003] d..199.961639: : CPU-3 515751 ...-695 [003] d..199.961686: : CPU-3 579047 ... The details of the patches are as follows: Patch 1/4 rewrites part of the bpf_prog_array map code and makes it more generic; Patch 2/4 introduces a new bpf map type. This map only stores the pointer to struct perf_event; Patch 3/4 implements the function bpf_perf_event_read() that gets the selected hardware PMU counter; Patch 4/4 gives a simple example. 
Kaixu Xia (3): bpf: Add new bpf map type to store the pointer to struct perf_event bpf: Implement function bpf_perf_event_read() that gets the selected hardware PMU counter samples/bpf: example of get selected PMU counter value Wang Nan (1): bpf: Make the bpf_prog_array_map more generic arch/x86/net/bpf_jit_comp.c | 6 +- include/linux/bpf.h | 10 +++- include/linux/perf_event.h | 14 - include/uapi/linux/bpf.h| 2 + kernel/bpf/arraymap.c | 135 ++-- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c| 2 +- kernel/bpf/verifier.c | 56 +- kernel/events/core.c| 27 ++--- kernel/trace/bpf_trace.c| 37 samples/bpf/Makefile| 4 ++ samples/bpf/bpf_helpers.h | 2 + samples/bpf/tracex6_kern.c | 26 + samples/bpf/tracex6_user.c | 68 ++ 14 files changed, 328 insertions(+), 63 deletions(-) create mode 100644 samples/bpf/tracex6_kern.c create mode 100644 samples/bpf/tracex6_user.c -- 1.8.3.4 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 2/4] bpf: Add new bpf map type to store the pointer to struct perf_event
Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'. This map only stores the pointer to struct perf_event. The user space event FDs from perf_event_open() syscall are converted to the pointer to struct perf_event and stored in map. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- include/linux/bpf.h| 1 + include/linux/perf_event.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 55 ++ kernel/events/core.c | 17 ++ 5 files changed, 76 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a8ce262..d0b394a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -10,6 +10,7 @@ #include uapi/linux/bpf.h #include linux/workqueue.h #include linux/file.h +#include linux/perf_event.h struct bpf_map; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2027809..27e05c1 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -641,6 +641,7 @@ extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); +extern struct perf_event *perf_event_get(unsigned int fd); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); @@ -979,6 +980,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task){ } +static struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } diff --git a/include/uapi/linux/bpf.h 
b/include/uapi/linux/bpf.h index 29ef6f9..69a1f6b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -114,6 +114,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 45df657..b7e0b5d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -273,3 +273,58 @@ static int __init register_prog_array_map(void) return 0; } late_initcall(register_prog_array_map); + +static void perf_event_array_map_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct perf_event *event; + + event = perf_event_get(fd); + if (IS_ERR(event)) + return event; + + /* +* prevent some crazy events so we can make our life easier +*/ + if (event-attr.type != PERF_TYPE_RAW + event-attr.type != PERF_TYPE_HARDWARE) { + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); + } + return event; +} + +static void perf_event_fd_array_put_ptr(void *ptr) +{ + struct perf_event *event = ptr; + + perf_event_release_kernel(event); +} + +static const struct bpf_map_ops perf_event_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = perf_event_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = perf_event_fd_array_get_ptr, + .map_fd_put_ptr = perf_event_fd_array_put_ptr, +}; + +static struct bpf_map_type_list perf_event_array_type __read_mostly = { + .ops = perf_event_array_ops, + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, +}; + +static int __init register_perf_event_array_map(void) +{ + bpf_register_map_type(perf_event_array_type); + return 0; +} +late_initcall(register_perf_event_array_map); diff --git a/kernel/events/core.c 
b/kernel/events/core.c index d3dae34..58f0d47 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8574,6 +8574,23 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task-perf_event_ctxp[ctxn]); } +struct perf_event *perf_event_get(unsigned int fd) +{ + int err; + struct fd f; + struct perf_event *event; + + err = perf_fget_light(fd, f); + if (err) + return ERR_PTR(err); + + event = f.file-private_data
[PATCH v5 4/4] samples/bpf: example of get selected PMU counter value
This is a simple example and shows how to use the new ability to get the selected Hardware PMU counter value. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- samples/bpf/Makefile | 4 +++ samples/bpf/bpf_helpers.h | 2 ++ samples/bpf/tracex6_kern.c | 26 ++ samples/bpf/tracex6_user.c | 68 ++ 4 files changed, 100 insertions(+) create mode 100644 samples/bpf/tracex6_kern.c create mode 100644 samples/bpf/tracex6_user.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4450fed..63e7d50 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -12,6 +12,7 @@ hostprogs-y += tracex2 hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 +hostprogs-y += tracex6 hostprogs-y += lathist test_verifier-objs := test_verifier.o libbpf.o @@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o tracex3-objs := bpf_load.o libbpf.o tracex3_user.o tracex4-objs := bpf_load.o libbpf.o tracex4_user.o tracex5-objs := bpf_load.o libbpf.o tracex5_user.o +tracex6-objs := bpf_load.o libbpf.o tracex6_user.o lathist-objs := bpf_load.o libbpf.o lathist_user.o # Tell kbuild to always build the programs @@ -37,6 +39,7 @@ always += tracex2_kern.o always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o +always += tracex6_kern.o always += tcbpf1_kern.o always += lathist_kern.o @@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf +HOSTLOADLIBES_tracex6 += -lelf HOSTLOADLIBES_lathist += -lelf # point this to your LLVM backend with bpf support diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index bdf1c16..c8a3594 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) = (void *) BPF_FUNC_get_current_uid_gid; static int (*bpf_get_current_comm)(void *buf, int buf_size) = (void *) BPF_FUNC_get_current_comm; +static int 
(*bpf_perf_event_read)(void *map, int index) = + (void *) BPF_FUNC_perf_event_read; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c new file mode 100644 index 000..23d1cff --- /dev/null +++ b/samples/bpf/tracex6_kern.c @@ -0,0 +1,26 @@ +#include linux/version.h +#include uapi/linux/bpf.h +#include bpf_helpers.h + +struct bpf_map_def SEC(maps) my_map = { + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(u32), + .max_entries = 32, +}; + +SEC(kprobe/sys_write) +int bpf_prog1(struct pt_regs *ctx) +{ + u64 count; + u32 key = bpf_get_smp_processor_id(); + char fmt[] = CPU-%d %llu\n; + + count = bpf_perf_event_read(my_map, key); + bpf_trace_printk(fmt, sizeof(fmt), key, count); + + return 0; +} + +char _license[] SEC(license) = GPL; +u32 _version SEC(version) = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c new file mode 100644 index 000..928f05e --- /dev/null +++ b/samples/bpf/tracex6_user.c @@ -0,0 +1,68 @@ +#include stdio.h +#include unistd.h +#include stdlib.h +#include stdbool.h +#include string.h +#include fcntl.h +#include poll.h +#include sys/ioctl.h +#include linux/perf_event.h +#include linux/bpf.h +#include libbpf.h +#include bpf_load.h + +#define SAMPLE_PERIOD 0x7fffULL + +static void test_bpf_perf_event(void) +{ + int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + int *pmu_fd = malloc(nr_cpus * sizeof(int)); + unsigned long value; + int i; + + struct perf_event_attr attr_insn_pmu = { + .freq = 0, + .sample_period = SAMPLE_PERIOD, + .inherit = 0, + .type = PERF_TYPE_HARDWARE, + .read_format = 0, + .sample_type = 0, + .config = 0,/* PMU: cycles */ + }; + + for (i = 0; i nr_cpus; i++) { + pmu_fd[i] = perf_event_open(attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0); + if (pmu_fd[i] 0) + printf(event syscall failed\n); + + bpf_update_elem(map_fd[0], i, 
pmu_fd[i], BPF_ANY); + ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0); + } + + system(ls); + system(pwd); + system(sleep 2); + + for (i = 0; i nr_cpus; i++) + close(pmu_fd[i]); + + close(map_fd); + + free(pmu_fd); +} + +int main(int argc, char **argv) +{ + char filename[256]; + + snprintf(filename, sizeof(filename), %s_kern.o, argv[0]); + + if (load_bpf_file(filename)) { + printf(%s, bpf_log_buf); + return 1; + } + + test_bpf_perf_event(); + + return 0; +} -- 1.8.3.4
[PATCH v5 3/4] bpf: Implement function bpf_perf_event_read() that gets the selected hardware PMU counter
According to the perf_event_map_fd and index, the function bpf_perf_event_read() can convert the corresponding map value to the pointer to struct perf_event and return the Hardware PMU counter value. Signed-off-by: Kaixu Xia xiaka...@huawei.com --- include/linux/bpf.h| 1 + include/linux/perf_event.h | 12 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 56 +- kernel/events/core.c | 10 + kernel/trace/bpf_trace.c | 37 ++ 6 files changed, 92 insertions(+), 25 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0b394a..db9f781 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -190,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_perf_event_read_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 27e05c1..c1a3f39 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -662,7 +662,7 @@ extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); - +extern void __perf_event_read(void *info); struct perf_sample_data { /* @@ -863,6 +863,14 @@ static inline u64 __perf_event_count(struct perf_event *event) return local64_read(event-count) + atomic64_read(event-child_count); } +static inline u64 perf_event_count(struct perf_event *event) +{ + if (event-pmu-count) + return event-pmu-count(event); + + return __perf_event_count(event); +} + extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct 
perf_guest_info_callbacks *callbacks); @@ -984,6 +992,8 @@ static struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EIN static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } +static inline void __perf_event_read(void *info) { } +static inline u64 perf_event_count(struct perf_event *event) { return 0; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69a1f6b..b9b13ce 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -250,6 +250,7 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_get_current_comm, + BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(map, index) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 039d866..93b6624 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -833,6 +833,44 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return err; } +static int check_func_limit(struct bpf_map **mapp, int func_id) +{ + struct bpf_map *map = *mapp; + + if (map map-map_type == BPF_MAP_TYPE_PROG_ARRAY + func_id != BPF_FUNC_tail_call) + /* prog_array map type needs extra care: +* only allow to pass it into bpf_tail_call() for now. +* bpf_map_delete_elem() can be allowed in the future, +* while bpf_map_update_elem() must only be done via syscall +*/ + return -EINVAL; + + if (func_id == BPF_FUNC_tail_call + map-map_type != BPF_MAP_TYPE_PROG_ARRAY) + /* don't allow any other map type to be passed into +* bpf_tail_call() +*/ + return -EINVAL; + + if (map map-map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY + func_id != BPF_FUNC_perf_event_read) + /* perf_event_array map type needs extra care: +* only allow to pass it into bpf_perf_event_read() for now. 
+* bpf_map_update/delete_elem() must only be done via syscall +*/ + return -EINVAL; + + if (func_id == BPF_FUNC_perf_event_read + map-map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) + /* don't allow any other map type to be passed into +* bpf_perf_event_read
[PATCH v5 1/4] bpf: Make the bpf_prog_array_map more generic
From: Wang Nan wangn...@huawei.com According to the comments from Daniel, rewrite part of the bpf_prog_array map code and make it more generic. So the new perf_event_array map type can reuse most of code with bpf_prog_array map and add fewer lines of special code. Tested the samples/bpf/tracex5 after this patch: $ sudo ./tracex5 ... dd-1051 [000] d...26.682903: : mmap dd-1051 [000] d...26.698348: : syscall=102 (one of get/set uid/pid/gid) dd-1051 [000] d...26.703892: : read(fd=0, buf=0078c010, size=512) dd-1051 [000] d...26.705847: : write(fd=1, buf=0078c010, size=512) dd-1051 [000] d...26.707914: : read(fd=0, buf=0078c010, size=512) dd-1051 [000] d...26.710988: : write(fd=1, buf=0078c010, size=512) dd-1051 [000] d...26.711865: : read(fd=0, buf=0078c010, size=512) dd-1051 [000] d...26.712704: : write(fd=1, buf=0078c010, size=512) ... Signed-off-by: Wang Nan wangn...@huawei.com Signed-off-by: Kaixu Xia xiaka...@huawei.com --- arch/x86/net/bpf_jit_comp.c | 6 ++-- include/linux/bpf.h | 8 +++-- kernel/bpf/arraymap.c | 80 +++-- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c| 2 +- 5 files changed, 60 insertions(+), 38 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 579a8fd..e377f07 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog) * goto out; * if (++tail_call_cnt MAX_TAIL_CALL_CNT) * goto out; - * prog = array-prog[index]; + * prog = array-ptrs[index]; * if (prog == NULL) * goto out; * goto *(prog-bpf_func + prologue_size); @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ - /* prog = array-prog[index]; */ + /* prog = array-ptrs[index]; */ EMIT4(0x48, 0x8D, 0x44, 0xD6);/* lea rax, [rsi + rdx * 8 + 0x50] */ - EMIT1(offsetof(struct bpf_array, prog)); + EMIT1(offsetof(struct bpf_array, ptrs)); EMIT3(0x48, 0x8B, 0x00); 
/* mov rax, qword ptr [rax] */ /* if (prog == NULL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4383476..a8ce262 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -24,6 +24,10 @@ struct bpf_map_ops { void *(*map_lookup_elem)(struct bpf_map *map, void *key); int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_map *map, void *key); + + /* funcs called by prog_array and perf_event_array map */ + void *(*map_fd_get_ptr) (struct bpf_map *map, int fd); + void (*map_fd_put_ptr) (void *ptr); }; struct bpf_map { @@ -142,13 +146,13 @@ struct bpf_array { bool owner_jited; union { char value[0] __aligned(8); - struct bpf_prog *prog[0] __aligned(8); + void *ptrs[0] __aligned(8); }; }; #define MAX_TAIL_CALL_CNT 32 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); -void bpf_prog_array_map_clear(struct bpf_map *map); +void bpf_fd_array_map_clear(struct bpf_map *map); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cb31229..45df657 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -150,15 +150,15 @@ static int __init register_array_map(void) } late_initcall(register_array_map); -static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { - /* only bpf_prog file descriptors can be stored in prog_array map */ + /* only file descriptors can be stored in this type of map */ if (attr-value_size != sizeof(u32)) return ERR_PTR(-EINVAL); return array_map_alloc(attr); } -static void prog_array_map_free(struct bpf_map *map) +static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map) /* make sure it's 
empty */ for (i = 0; i array-map.max_entries; i++) - BUG_ON(array-prog[i] != NULL); + BUG_ON(array-ptrs[i] != NULL); kvfree(array); } -static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return NULL; } /* only called from