[PATCH V5 1/1] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling

2015-10-20 Thread Kaixu Xia
This patch adds the flag soft_enable to control the trace data
output process during perf sampling. By setting this flag and
integrating with eBPF, we can control the data output process and
get only the samples we are most interested in.

The bpf helper bpf_perf_event_control() can control either the perf
event on the current cpu or all the perf events stored in the maps,
depending on the third parameter 'flags'.
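
For illustration, a minimal sketch of the intended calls from a kprobe
program (the map definition is the one used in the cover letter example;
the flag values follow the bit layout documented in the uapi comment
added below):

	/* on entry: bit 0 set - start dumping on the current cpu,
	 * bit 1 set - apply to all events stored in the map
	 */
	bpf_perf_event_control(&my_cycles_map, 0, 3);

	/* on return: bit 0 clear - stop dumping, again for all events in the map */
	bpf_perf_event_control(&my_cycles_map, 0, 2);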

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 include/linux/perf_event.h  |  1 +
 include/uapi/linux/bpf.h| 11 
 include/uapi/linux/perf_event.h |  3 +-
 kernel/bpf/verifier.c   |  3 +-
 kernel/events/core.c| 13 +
 kernel/trace/bpf_trace.c| 62 +
 6 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 092a0e8..bb3bf87 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -472,6 +472,7 @@ struct perf_event {
struct irq_work pending;
 
 	atomic_t			event_limit;
+	atomic_t			soft_enable;
 
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 564f1f0..164d2a9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -287,6 +287,17 @@ enum bpf_func_id {
 * Return: realm if != 0
 */
BPF_FUNC_get_route_realm,
+
+   /**
+	 * u64 bpf_perf_event_control(&map, index, flags) - control perf events in maps
+	 * @map: pointer to PERF_EVENT_ARRAY maps
+	 * @index: the key of perf event
+	 * @flags: bit 0 - if true, dump event data on current cpu
+	 *         bit 1 - if true, control all the events in maps
+	 *         other bits - reserved
+	 * Return: 0 on success
+	 */
+   BPF_FUNC_perf_event_control,
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 2881145..a791b03 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -331,7 +331,8 @@ struct perf_event_attr {
 				comm_exec      :  1, /* flag comm events that are due to an exec */
 				use_clockid    :  1, /* use @clockid for time fields */
 				context_switch :  1, /* context switch data */
-				__reserved_1   : 37;
+				soft_disable   :  1, /* output data on samples by default */
+				__reserved_1   : 36;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1d6b97b..ffec14b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -245,6 +245,7 @@ static const struct {
 } func_limit[] = {
{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+   {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_control},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
 * don't allow any other map type to be passed into
 * the special func;
 */
-   if (bool_map != bool_func)
+   if (bool_func && bool_map != bool_func)
return -EINVAL;
}
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b11756f..5219635 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
 		irq_work_queue(&event->pending);
 	}
 
+	if (unlikely(!atomic_read(&event->soft_enable)))
+		return 0;
+
if (event->overflow_handler)
event->overflow_handler(event, data, regs);
else
@@ -7709,6 +7712,14 @@ static void account_event(struct perf_event *event)
account_event_cpu(event, event->cpu);
 }
 
+static void perf_event_check_dump_flag(struct perf_event *event)
+{
+	if (event->attr.soft_disable == 1)
+		atomic_set(&event->soft_enable, 0);
+	else
+		atomic_set(&event->soft_enable, 1);
+}
+
 /*
  * Allocate and initialize a event structure
  */
@@ -7840,6 +7851,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
 
+   perf_event_check_dump_flag(event);
+
return event;
 
 err_per_task:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7..398ed94 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -215,6 +215,66 @@ const struct bpf_func_proto bpf_perf_event_read_proto = {

[PATCH V5 0/1] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling

2015-10-20 Thread Kaixu Xia
Previous patch V4 url:
https://lkml.org/lkml/2015/10/19/247

This patchset introduces the new perf_event_attr attribute
'soft_disable'. The existing 'disabled' flag doesn't meet the
requirements: cpu_function_call() is too heavyweight to invoke from a
bpf program, and we want to control the perf events stored in maps in
a soft_disable fashion, so once the 'disabled' flag is set to true we
can't enable/disable the perf event from bpf programs.
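
For reference, a rough userspace sketch of opening a cycles event with the
new bit set (this assumes a uapi perf_event.h rebuilt with this patch;
soft_disable does not exist in stock headers, and in the intended workflow
perf itself sets the bit for events placed into PERF_EVENT_ARRAY maps):

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_soft_disabled_cycles(int cpu)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period = 100000;
		attr.soft_disable = 1;	/* no sample output until a bpf program enables it */

		return syscall(__NR_perf_event_open, &attr, -1 /* pid */, cpu, -1 /* group */, 0);
	}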

changes in V5:
 - move the bpf helper parameter 'flags' defination to bpf_trace.c
   and document the flags bits in uapi header.

changes in V4:
 - make the naming more proper;
 - fix the initial value set of attr->soft_disable bug;
 - add unlikely() to the check of event->soft_enable;
 - squash the 2nd patch into the 1st patch;

changes in V3:
 - make the flag name and condition check consistent;
 - check the bpf helper flag only bit 0 and check all other bits are
   reserved;
 - use atomic_dec_if_positive() and atomic_inc_unless_negative();
 - make bpf_perf_event_dump_control_proto be static;
 - remove the ioctl PERF_EVENT_IOC_SET_ENABLER and 'enabler' event;
 - implement the function that controlling all the perf events
   stored in PERF_EVENT_ARRAY maps by setting the parameter 'index'
   to maps max_entries;

changes in V2:
 - rebase the whole patch set to net-next tree(4b418bf);
 - remove the added flag perf_sample_disable in bpf_map;
 - move the added fields in structure perf_event to proper place
   to avoid cacheline miss;
 - use counter based flag instead of 0/1 switcher in considering
   of reentering events;
 - use a single helper bpf_perf_event_sample_control() to enable/
   disable events;
 - implement a light-weight solution to control the trace data
   output on current cpu;
 - create a new ioctl PERF_EVENT_IOC_SET_ENABLER to enable/disable
   a set of events;

Before this patch,
   $ ./perf record -e cycles -a sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 527  of event 'cycles'
# Event count (approx.): 87824857
...

After this patch,
   $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a 
sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 22  of event 'cycles'
# Event count (approx.): 4213922
...

The bpf program example:

  struct bpf_map_def SEC("maps") my_cycles_map = {
  .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
  .key_size = sizeof(int),
  .value_size = sizeof(u32),
  .max_entries = 32, 
  };

  SEC("enter=sys_write")
  int bpf_prog_1(struct pt_regs *ctx)
  {
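  /* flags = 3: bit 0 set (dump trace data on the current cpu), bit 1 set (apply to all events in the map) */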
  bpf_perf_event_control(&my_cycles_map, 0, 3);
  return 0;
  }

  SEC("exit=sys_write%return")
  int bpf_prog_2(struct pt_regs *ctx)
  {
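  /* flags = 2: bit 0 clear (stop dumping), bit 1 set (apply to all events in the map) */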
  bpf_perf_event_control(&my_cycles_map, 0, 2);
  return 0;
  }

To control sampling at function level, we have to set a high sample
frequency so that trace data is actually captured in the short window
in which the perf event is enabled on the current cpu.

Kaixu Xia (1):
  bpf: control events stored in PERF_EVENT_ARRAY maps trace data output
when perf sampling

 include/linux/perf_event.h  |  1 +
 include/uapi/linux/bpf.h| 11 
 include/uapi/linux/perf_event.h |  3 +-
 kernel/bpf/verifier.c   |  3 +-
 kernel/events/core.c| 13 +
 kernel/trace/bpf_trace.c| 62 +
 6 files changed, 91 insertions(+), 2 deletions(-)

-- 
1.8.3.4



[PATCH V4 1/1] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling

2015-10-19 Thread Kaixu Xia
This patch adds the flag soft_enable to control the trace data
output process during perf sampling. By setting this flag and
integrating with eBPF, we can control the data output process and
get only the samples we are most interested in.

The bpf helper bpf_perf_event_control() can control either the perf
event on the current cpu or all the perf events stored in the maps,
depending on the third parameter 'flag'.

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 include/linux/perf_event.h  |  1 +
 include/uapi/linux/bpf.h| 19 +++
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/bpf/verifier.c   |  3 ++-
 kernel/events/core.c| 13 +++
 kernel/trace/bpf_trace.c| 51 +
 6 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 092a0e8..bb3bf87 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -472,6 +472,7 @@ struct perf_event {
struct irq_work pending;
 
 	atomic_t			event_limit;
+	atomic_t			soft_enable;
 
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 564f1f0..a2b0d9d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -132,6 +132,20 @@ enum bpf_prog_type {
 #define BPF_NOEXIST1 /* create new element if it didn't exist */
 #define BPF_EXIST  2 /* update existing element */
 
+/* flags for PERF_EVENT_ARRAY maps*/
+enum {
+   BPF_EVENT_CTL_BIT_CUR = 0,
+   BPF_EVENT_CTL_BIT_ALL = 1,
+   __NR_BPF_EVENT_CTL_BITS,
+};
+
+#define	BPF_CTL_BIT_FLAG_MASK \
+	((1ULL << __NR_BPF_EVENT_CTL_BITS) - 1)
+#define	BPF_CTL_BIT_DUMP_CUR \
+	(1ULL << BPF_EVENT_CTL_BIT_CUR)
+#define	BPF_CTL_BIT_DUMP_ALL \
+	(1ULL << BPF_EVENT_CTL_BIT_ALL)
+
 union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32   map_type;   /* one of enum bpf_map_type */
@@ -287,6 +301,11 @@ enum bpf_func_id {
 * Return: realm if != 0
 */
BPF_FUNC_get_route_realm,
+
+   /**
+	 * u64 bpf_perf_event_control(&map, index, flag)
+*/
+   BPF_FUNC_perf_event_control,
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 2881145..a791b03 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -331,7 +331,8 @@ struct perf_event_attr {
 				comm_exec      :  1, /* flag comm events that are due to an exec */
 				use_clockid    :  1, /* use @clockid for time fields */
 				context_switch :  1, /* context switch data */
-				__reserved_1   : 37;
+				soft_disable   :  1, /* output data on samples by default */
+				__reserved_1   : 36;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1d6b97b..ffec14b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -245,6 +245,7 @@ static const struct {
 } func_limit[] = {
{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+   {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_control},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
 * don't allow any other map type to be passed into
 * the special func;
 */
-   if (bool_map != bool_func)
+   if (bool_func && bool_map != bool_func)
return -EINVAL;
}
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b11756f..5219635 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
 		irq_work_queue(&event->pending);
 	}
 
+	if (unlikely(!atomic_read(&event->soft_enable)))
+		return 0;
+
if (event->overflow_handler)
event->overflow_handler(event, data, regs);
else
@@ -7709,6 +7712,14 @@ static void account_event(struct perf_event *event)
account_event_cpu(event, event->cpu);
 }
 
+static void perf_event_check_dump_flag(struct perf_event *event)
+{
+	if (event->attr.soft_disable == 1)
+		atomic_set(&event->soft_enable, 0);
+	else
+		atomic_set(&event->soft_enable, 1);
+}
+
 /*
  * Allocate and initialize a event structure
  */
@@ -7840,6 

[PATCH V4 0/1] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling

2015-10-19 Thread Kaixu Xia
Previous patch V3 url:
https://lkml.org/lkml/2015/10/16/101

This patchset introduces the new perf_event_attr attribute
'soft_disable'. The existing 'disabled' flag doesn't meet the
requirements: cpu_function_call() is too heavyweight to invoke from a
bpf program, and we want to control the perf events stored in maps in
a soft_disable fashion, so once the 'disabled' flag is set to true we
can't enable/disable the perf event from bpf programs.

changes in V4:
 - make the naming more proper;
 - fix the initial value set of attr->soft_disable bug;
 - add unlikely() to the check of event->soft_enable;
 - squash the 2nd patch into the 1st patch;

changes in V3:
 - make the flag name and condition check consistent;
 - check the bpf helper flag only bit 0 and check all other bits are
   reserved;
 - use atomic_dec_if_positive() and atomic_inc_unless_negative();
 - make bpf_perf_event_dump_control_proto be static;
 - remove the ioctl PERF_EVENT_IOC_SET_ENABLER and 'enabler' event;
 - implement the function that controlling all the perf events
   stored in PERF_EVENT_ARRAY maps by setting the parameter 'index'
   to maps max_entries;

changes in V2:
 - rebase the whole patch set to net-next tree(4b418bf);
 - remove the added flag perf_sample_disable in bpf_map;
 - move the added fields in structure perf_event to proper place
   to avoid cacheline miss;
 - use counter based flag instead of 0/1 switcher in considering
   of reentering events;
 - use a single helper bpf_perf_event_sample_control() to enable/
   disable events;
 - implement a light-weight solution to control the trace data
   output on current cpu;
 - create a new ioctl PERF_EVENT_IOC_SET_ENABLER to enable/disable
   a set of events;

Before this patch,
   $ ./perf record -e cycles -a sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 527  of event 'cycles'
# Event count (approx.): 87824857
...

After this patch,
   $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a 
sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 22  of event 'cycles'
# Event count (approx.): 4213922
...

The bpf program example:

  struct bpf_map_def SEC("maps") my_cycles_map = {
  .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
  .key_size = sizeof(int),
  .value_size = sizeof(u32),
  .max_entries = 32, 
  };

  SEC("enter=sys_write")
  int bpf_prog_1(struct pt_regs *ctx)
  {
  bpf_perf_event_control(&my_cycles_map, 0, 2);
  return 0;
  }

  SEC("exit=sys_write%return")
  int bpf_prog_2(struct pt_regs *ctx)
  {
  bpf_perf_event_control(&my_cycles_map, 0, 3);
  return 0;
  }

To control sampling at function level, we have to set a high sample
frequency so that trace data is actually captured in the short window
in which the perf event is enabled on the current cpu.

Kaixu Xia (1):
  bpf: control events stored in PERF_EVENT_ARRAY maps trace data output
when perf sampling

 include/linux/perf_event.h  |  1 +
 include/uapi/linux/bpf.h| 19 +++
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/bpf/verifier.c   |  3 ++-
 kernel/events/core.c| 13 +++
 kernel/trace/bpf_trace.c| 51 +
 6 files changed, 88 insertions(+), 2 deletions(-)

-- 
1.8.3.4



[PATCH V3 2/2] bpf: control all the perf events stored in PERF_EVENT_ARRAY maps

2015-10-16 Thread Kaixu Xia
This patch implements the ability to control all the perf events
stored in a PERF_EVENT_ARRAY map by setting the parameter 'index'
to the map's max_entries.
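
A minimal sketch of the intended call from a bpf program (my_cycles_map
and its max_entries of 32 are taken from the cover letter example; flag
bit 0 selects disable/enable as in the helper below):

	/* index == max_entries: apply the dump control to every event in the map */
	bpf_perf_event_dump_control(&my_cycles_map, 32, 0);	/* resume output */
	bpf_perf_event_dump_control(&my_cycles_map, 32, 1);	/* stop output */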

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 kernel/trace/bpf_trace.c | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3175600..4b385863 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -229,13 +229,30 @@ static u64 bpf_perf_event_dump_control(u64 r1, u64 index, 
u64 flag, u64 r4, u64
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct perf_event *event;
+   int i;
 
-	if (unlikely(index >= array->map.max_entries))
+	if (unlikely(index > array->map.max_entries))
 		return -E2BIG;
return -E2BIG;
 
if (flag & BIT_FLAG_CHECK)
return -EINVAL;
 
+   if (index == array->map.max_entries) {
+   bool dump_control = flag & BIT_DUMP_CTL;
+
+   for (i = 0; i < array->map.max_entries; i++) {
+   event = (struct perf_event *)array->ptrs[i];
+   if (!event)
+   continue;
+
+   if (dump_control)
+   atomic_dec_if_positive(>dump_enable);
+   else
+   atomic_inc_unless_negative(>dump_enable);
+   }
+   return 0;
+   }
+
event = (struct perf_event *)array->ptrs[index];
if (!event)
return -ENOENT;
@@ -244,7 +261,6 @@ static u64 bpf_perf_event_dump_control(u64 r1, u64 index, 
u64 flag, u64 r4, u64
atomic_dec_if_positive(>dump_enable);
else
atomic_inc_unless_negative(>dump_enable);
-
return 0;
 }
 
-- 
1.8.3.4



[PATCH V3 0/2] bpf: control events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling

2015-10-16 Thread Kaixu Xia
Previous patch V2 url:
https://lkml.org/lkml/2015/10/14/347

This patchset introduces the new perf_event_attr attribute
'dump_enable'. The existing 'disabled' flag doesn't meet the
requirements: cpu_function_call() is too heavyweight to invoke from a
bpf program, and we want to control the perf events stored in maps in
a soft_disable fashion, so once the 'disabled' flag is set to true we
can't enable/disable the perf event from bpf programs.

changes in V3:
 - make the flag name and condition check consistent;
 - check the bpf helper flag only bit 0 and check all other bits are
   reserved;
 - use atomic_dec_if_positive() and atomic_inc_unless_negative();
 - make bpf_perf_event_dump_control_proto be static;
 - remove the ioctl PERF_EVENT_IOC_SET_ENABLER and 'enabler' event;
 - implement the function that controlling all the perf events
   stored in PERF_EVENT_ARRAY maps by setting the parameter 'index'
   to maps max_entries;

changes in V2:
 - rebase the whole patch set to net-next tree(4b418bf);
 - remove the added flag perf_sample_disable in bpf_map;
 - move the added fields in structure perf_event to proper place
   to avoid cacheline miss;
 - use counter based flag instead of 0/1 switcher in considering
   of reentering events;
 - use a single helper bpf_perf_event_sample_control() to enable/
   disable events;
 - implement a light-weight solution to control the trace data
   output on current cpu;
 - create a new ioctl PERF_EVENT_IOC_SET_ENABLER to enable/disable
   a set of events;

Before this patch,
   $ ./perf record -e cycles -a sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 643  of event 'cycles'
# Event count (approx.): 128313904
...

After this patch,
   $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a 
sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 25  of event 'cycles'
# Event count (approx.): 5788400
...

The bpf program example:

  struct bpf_map_def SEC("maps") my_cycles_map = {
  .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
  .key_size = sizeof(int),
  .value_size = sizeof(u32),
  .max_entries = 32, 
  };

  SEC("enter=sys_write")
  int bpf_prog_1(struct pt_regs *ctx)
  {
  bpf_perf_event_dump_control(&my_cycles_map, 32, 0);
  return 0;
  }

  SEC("exit=sys_write%return")
  int bpf_prog_2(struct pt_regs *ctx)
  {
  bpf_perf_event_dump_control(&my_cycles_map, 32, 1);
  return 0;
  }

To control sampling at function level, we have to set a high sample
frequency so that trace data is actually captured in the short window
in which the perf event is enabled on the current cpu.

Kaixu Xia (2):
  bpf: control the trace data output on current cpu when perf sampling
  bpf: control all the perf events stored in PERF_EVENT_ARRAY maps

 include/linux/perf_event.h  |  1 +
 include/uapi/linux/bpf.h|  5 
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/bpf/verifier.c   |  3 ++-
 kernel/events/core.c| 13 +
 kernel/trace/bpf_trace.c| 60 +
 6 files changed, 83 insertions(+), 2 deletions(-)

-- 
1.8.3.4



[PATCH V3 1/2] bpf: control the trace data output on current cpu when perf sampling

2015-10-16 Thread Kaixu Xia
This patch adds the flag dump_enable to control the trace data
output process during perf sampling. By setting this flag and
integrating with eBPF, we can control the data output process and
get only the samples we are most interested in.

The bpf helper bpf_perf_event_dump_control() can control the
perf_event on the current cpu.

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 include/linux/perf_event.h  |  1 +
 include/uapi/linux/bpf.h|  5 +
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/bpf/verifier.c   |  3 ++-
 kernel/events/core.c| 13 
 kernel/trace/bpf_trace.c| 44 +
 6 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 092a0e8..2af527e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -472,6 +472,7 @@ struct perf_event {
struct irq_work pending;
 
 	atomic_t			event_limit;
+	atomic_t			dump_enable;
 
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 564f1f0..ba08034 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -287,6 +287,11 @@ enum bpf_func_id {
 * Return: realm if != 0
 */
BPF_FUNC_get_route_realm,
+
+   /**
+	 * u64 bpf_perf_event_dump_control(&map, index, flag)
+*/
+   BPF_FUNC_perf_event_dump_control,
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 2881145..f4b8f08 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -331,7 +331,8 @@ struct perf_event_attr {
 				comm_exec      :  1, /* flag comm events that are due to an exec */
 				use_clockid    :  1, /* use @clockid for time fields */
 				context_switch :  1, /* context switch data */
-				__reserved_1   : 37;
+				dump_enable    :  1, /* don't output data on samples */
+				__reserved_1   : 36;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1d6b97b..26b55f2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -245,6 +245,7 @@ static const struct {
 } func_limit[] = {
{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+   {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_dump_control},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
 * don't allow any other map type to be passed into
 * the special func;
 */
-   if (bool_map != bool_func)
+   if (bool_func && bool_map != bool_func)
return -EINVAL;
}
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b11756f..74a16af 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
 		irq_work_queue(&event->pending);
 	}
 
+	if (!atomic_read(&event->dump_enable))
+		return ret;
+
if (event->overflow_handler)
event->overflow_handler(event, data, regs);
else
@@ -7709,6 +7712,14 @@ static void account_event(struct perf_event *event)
account_event_cpu(event, event->cpu);
 }
 
+static void perf_event_check_dump_flag(struct perf_event *event)
+{
+	if (event->attr.dump_enable == 1)
+		atomic_set(&event->dump_enable, 1);
+	else
+		atomic_set(&event->dump_enable, 0);
+}
+
 /*
  * Allocate and initialize a event structure
  */
@@ -7840,6 +7851,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
 
+   perf_event_check_dump_flag(event);
+
return event;
 
 err_per_task:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7..3175600 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -215,6 +215,48 @@ const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type  = ARG_ANYTHING,
 };
 
+/* flags for PERF_EVENT_ARRAY maps*/
+enum {
+   PERF_EVENT_CTL_BIT_DUMP = 0,
+   _NR_PERF_EVENT_CTL_BITS,
+};
+
+#define	BIT_FLAG_CHECK	GENMASK_ULL(63, _NR_PERF_EVENT_CTL_BITS)
+#define	BIT_DUMP_CTL	BIT_ULL(PERF_EVENT_CTL_BIT_DUMP)
+
+static u64 bpf_perf_event_dump_control(u64 r1, u64 index, u64 flag, u64 r4, 
u64 r5)
+{
+   struct bpf_map *map =

[PATCH V2 2/2] bpf: control a set of perf events by creating a new ioctl PERF_EVENT_IOC_SET_ENABLER

2015-10-14 Thread Kaixu Xia
This patch creates a new ioctl PERF_EVENT_IOC_SET_ENABLER that lets
perf select an event as the 'enabler'. We can then use this 'enabler'
event to enable/disable a set of events. The event on CPU 0 is
treated as the 'enabler' event by default.
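
A rough userspace sketch of wiring one event up to follow another as its
'enabler' (the ioctl number mirrors the definition added below; it is not
in stock headers, and error handling is minimal):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/ioctl.h>

	#ifndef PERF_EVENT_IOC_SET_ENABLER
	#define PERF_EVENT_IOC_SET_ENABLER	_IO('$', 9)
	#endif

	/* make 'fd' follow the soft enable/disable state of 'enabler_fd' */
	static int set_enabler(int fd, int enabler_fd)
	{
		if (ioctl(fd, PERF_EVENT_IOC_SET_ENABLER, enabler_fd) < 0) {
			perror("PERF_EVENT_IOC_SET_ENABLER");
			return -1;
		}
		return 0;
	}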

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 include/linux/perf_event.h  |  1 +
 include/uapi/linux/perf_event.h |  1 +
 kernel/events/core.c| 42 -
 kernel/trace/bpf_trace.c|  5 -
 4 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dcbf7d5..bc9fe77 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -473,6 +473,7 @@ struct perf_event {
 
 	atomic_t			event_limit;
 	atomic_t			sample_disable;
+	atomic_t			*p_sample_disable;
 
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index a2b9dd7..3b4fb90 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -393,6 +393,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_FILTER  _IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID  _IOR('$', 7, __u64 *)
 #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
+#define PERF_EVENT_IOC_SET_ENABLER _IO ('$', 9)
 
 enum perf_event_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 942351c..03d2594 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4152,6 +4152,7 @@ static int perf_event_set_output(struct perf_event *event,
 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+static int perf_event_set_sample_enabler(struct perf_event *event, u32 
enabler_fd);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned 
long arg)
 {
@@ -4208,6 +4209,9 @@ static long _perf_ioctl(struct perf_event *event, 
unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_BPF:
return perf_event_set_bpf_prog(event, arg);
 
+   case PERF_EVENT_IOC_SET_ENABLER:
+   return perf_event_set_sample_enabler(event, arg);
+
default:
return -ENOTTY;
}
@@ -6337,7 +6341,7 @@ static int __perf_event_overflow(struct perf_event *event,
 		irq_work_queue(&event->pending);
 	}
 
-	if (!atomic_read(&event->sample_disable))
+	if (!atomic_read(event->p_sample_disable))
return ret;
 
if (event->overflow_handler)
@@ -6989,6 +6993,35 @@ static int perf_event_set_bpf_prog(struct perf_event 
*event, u32 prog_fd)
return 0;
 }
 
+static int perf_event_set_sample_enabler(struct perf_event *event, u32 
enabler_fd)
+{
+   int ret;
+   struct fd enabler;
+   struct perf_event *enabler_event;
+
+   if (enabler_fd == -1)
+   return 0;
+
+	ret = perf_fget_light(enabler_fd, &enabler);
+   if (ret)
+   return ret;
+   enabler_event = enabler.file->private_data;
+   if (event == enabler_event) {
+   fdput(enabler);
+   return 0;
+   }
+
+   /* they must be on the same PMU*/
+   if (event->pmu != enabler_event->pmu) {
+   fdput(enabler);
+   return -EINVAL;
+   }
+
+	event->p_sample_disable = &enabler_event->sample_disable;
+   fdput(enabler);
+   return 0;
+}
+
 static void perf_event_free_bpf_prog(struct perf_event *event)
 {
struct bpf_prog *prog;
@@ -7023,6 +7056,11 @@ static int perf_event_set_bpf_prog(struct perf_event 
*event, u32 prog_fd)
return -ENOENT;
 }
 
+static int perf_event_set_sample_enabler(struct perf_event *event, u32 
group_fd)
+{
+   return -ENOENT;
+}
+
 static void perf_event_free_bpf_prog(struct perf_event *event)
 {
 }
@@ -7718,6 +7756,8 @@ static void perf_event_check_sample_flag(struct 
perf_event *event)
 		atomic_set(&event->sample_disable, 0);
 	else
 		atomic_set(&event->sample_disable, 1);
+
+	event->p_sample_disable = &event->sample_disable;
 }
 
 /*
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f261333..d012be3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -221,9 +221,12 @@ static u64 bpf_perf_event_sample_control(u64 r1, u64 
index, u64 flag, u64 r4, u6
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct perf_event *event;
 
-   if (unlikely(index >= array->map.max_entries))
+   if (unlikely(index > array->map.max_entries))
return -E2BIG;
 
+  

[PATCH V2 1/2] bpf: control the trace data output on current cpu when perf sampling

2015-10-14 Thread Kaixu Xia
This patch adds the flag sample_disable to control the trace data
output process during perf sampling. By setting this flag and
integrating with eBPF, we can control the data output process and
get only the samples we are most interested in.

The bpf helper bpf_perf_event_sample_control() can control the
perf_event on the current cpu.

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 include/linux/perf_event.h  |  1 +
 include/uapi/linux/bpf.h|  5 +
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/bpf/verifier.c   |  3 ++-
 kernel/events/core.c| 13 +
 kernel/trace/bpf_trace.c| 32 
 6 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 092a0e8..dcbf7d5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -472,6 +472,7 @@ struct perf_event {
struct irq_work pending;
 
 	atomic_t			event_limit;
+	atomic_t			sample_disable;
 
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 564f1f0..e2c99c6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -287,6 +287,11 @@ enum bpf_func_id {
 * Return: realm if != 0
 */
BPF_FUNC_get_route_realm,
+
+   /**
+	 * u64 bpf_perf_event_sample_control(&map, index, flag)
+*/
+   BPF_FUNC_perf_event_sample_control,
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 2881145..a2b9dd7 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -331,7 +331,8 @@ struct perf_event_attr {
 				comm_exec      :  1, /* flag comm events that are due to an exec */
 				use_clockid    :  1, /* use @clockid for time fields */
 				context_switch :  1, /* context switch data */
-				__reserved_1   : 37;
+				sample_disable :  1, /* don't output data on samples */
+				__reserved_1   : 36;
 
union {
__u32   wakeup_events;/* wakeup every n events */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1d6b97b..3ffe630 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -245,6 +245,7 @@ static const struct {
 } func_limit[] = {
{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+   {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_sample_control},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
 * don't allow any other map type to be passed into
 * the special func;
 */
-   if (bool_map != bool_func)
+   if (bool_func && bool_map != bool_func)
return -EINVAL;
}
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b11756f..942351c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
 		irq_work_queue(&event->pending);
 	}
 
+	if (!atomic_read(&event->sample_disable))
+		return ret;
+
if (event->overflow_handler)
event->overflow_handler(event, data, regs);
else
@@ -7709,6 +7712,14 @@ static void account_event(struct perf_event *event)
account_event_cpu(event, event->cpu);
 }
 
+static void perf_event_check_sample_flag(struct perf_event *event)
+{
+	if (event->attr.sample_disable == 1)
+		atomic_set(&event->sample_disable, 0);
+	else
+		atomic_set(&event->sample_disable, 1);
+}
+
 /*
  * Allocate and initialize a event structure
  */
@@ -7840,6 +7851,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
 
+   perf_event_check_sample_flag(event);
+
return event;
 
 err_per_task:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7..f261333 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -215,6 +215,36 @@ const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type  = ARG_ANYTHING,
 };
 
+static u64 bpf_perf_event_sample_control(u64 r1, u64 index, u64 flag, u64 r4, 
u64 r5)
+{
+   struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+   struct perf_event *event;
+
+   if (unlikely(index >= array->map.max_entries))
+   return 

[PATCH V2 0/2] bpf: enable/disable events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling

2015-10-14 Thread Kaixu Xia
Previous RFC patch url:
https://lkml.org/lkml/2015/10/12/135

changes in V2:
 - rebase the whole patch set to net-next tree(4b418bf);
 - remove the added flag perf_sample_disable in bpf_map;
 - move the added fields in structure perf_event to proper place
   to avoid cacheline miss;
 - use counter based flag instead of 0/1 switcher in considering
   of reentering events;
 - use a single helper bpf_perf_event_sample_control() to enable/
   disable events;
 - implement a light-weight solution to control the trace data
   output on current cpu;
 - create a new ioctl PERF_EVENT_IOC_SET_ENABLER to enable/disable
   a set of events;

Before this patch,
   $ ./perf record -e cycles -a sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 643  of event 'cycles'
# Event count (approx.): 128313904
...

After this patch,
   $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a 
sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 25  of event 'cycles'
# Event count (approx.): 5788400
...

The bpf program example:

  struct bpf_map_def SEC("maps") my_cycles_map = {
  .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
  .key_size = sizeof(int),
  .value_size = sizeof(u32),
  .max_entries = 32, 
  };

  SEC("enter=sys_write")
  int bpf_prog_1(struct pt_regs *ctx)
  {
  bpf_perf_event_sample_control(&my_cycles_map, 32, 0);
  return 0;
  }

  SEC("exit=sys_write%return")
  int bpf_prog_2(struct pt_regs *ctx)
  {
  bpf_perf_event_sample_control(&my_cycles_map, 32, 1);
  return 0;
  }

To control sampling at function level without using the
PERF_EVENT_IOC_SET_ENABLER ioctl on the perf user side, we must set
a high sample frequency in order to dump trace data.

Kaixu Xia (2):
  bpf: control the trace data output on current cpu when perf sampling
  bpf: control a set of perf events by creating a new ioctl
PERF_EVENT_IOC_SET_ENABLER

 include/linux/perf_event.h  |  2 ++
 include/uapi/linux/bpf.h|  5 
 include/uapi/linux/perf_event.h |  4 +++-
 kernel/bpf/verifier.c   |  3 ++-
 kernel/events/core.c| 53 +
 kernel/trace/bpf_trace.c| 35 +++
 6 files changed, 100 insertions(+), 2 deletions(-)

-- 
1.8.3.4



[RFC PATCH 0/2] bpf: enable/disable events stored in PERF_EVENT_ARRAY maps trace data output when perf sampling

2015-10-12 Thread Kaixu Xia
In some scenarios we don't want to output trace data when perf sampling,
in order to reduce overhead. For example, perf can be run as a daemon to
dump trace data only when necessary, such as when system performance goes down.

This patchset adds the helpers bpf_perf_event_sample_enable/disable() to
implement this function. By applying these helpers, we can enable/disable
events stored in PERF_EVENT_ARRAY maps trace data output and get the
samples we are most interested in.

We also need to let the perf user side add normal PMU events from the
perf cmdline to PERF_EVENT_ARRAY maps. My colleague He Kuang is working
on this. In the following example, the cycles events are stored in the
PERF_EVENT_ARRAY map.

Before this patch,
   $ ./perf record -e cycles -a sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 655  of event 'cycles'
# Event count (approx.): 129323548
...

After this patch,
   $ ./perf record -e pmux=cycles --event perf-bpf.o/my_cycles_map=pmux/ -a 
sleep 1
   $ ./perf report --stdio
# To display the perf.data header info, please use 
--header/--header-only option
#
#
# Total Lost Samples: 0
#
# Samples: 23  of event 'cycles'
# Event count (approx.): 2064170
...

The bpf program example:

  struct bpf_map_def SEC("maps") my_cycles_map = {
  .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
  .key_size = sizeof(int),
  .value_size = sizeof(u32),
  .max_entries = 32, 
  };

  SEC("enter=sys_write")
  int bpf_prog_1(struct pt_regs *ctx)
  {
  bpf_perf_event_sample_enable(&my_cycles_map);
  return 0;
  }

  SEC("exit=sys_write%return")
  int bpf_prog_2(struct pt_regs *ctx)
  {
  bpf_perf_event_sample_disable(&my_cycles_map);
      return 0;
  }


Kaixu Xia (2):
  perf: Add the flag sample_disable not to output data on samples
  bpf: Implement bpf_perf_event_sample_enable/disable() helpers

 include/linux/bpf.h|  3 +++
 include/linux/perf_event.h |  2 ++
 include/uapi/linux/bpf.h   |  2 ++
 kernel/bpf/arraymap.c  |  5 +
 kernel/bpf/verifier.c  |  4 +++-
 kernel/events/core.c   |  3 +++
 kernel/trace/bpf_trace.c   | 34 ++
 7 files changed, 52 insertions(+), 1 deletion(-)

-- 
1.8.3.4



[RFC PATCH 1/2] perf: Add the flag sample_disable not to output data on samples

2015-10-12 Thread Kaixu Xia
In some scenarios we don't want to output trace data when sampling
to reduce overhead. This patch adds the flag sample_disable to
implement this function. By setting this flag and integrating with
ebpf, we can control the data output process and get the samples we
are most interested in.

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 include/linux/bpf.h| 1 +
 include/linux/perf_event.h | 2 ++
 kernel/bpf/arraymap.c  | 5 +
 kernel/events/core.c   | 3 +++
 4 files changed, 11 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f57d7fe..25e073d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@ struct bpf_map {
u32 max_entries;
const struct bpf_map_ops *ops;
struct work_struct work;
+   atomic_t perf_sample_disable;
 };
 
 struct bpf_map_type_list {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 092a0e8..0606d1d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -483,6 +483,8 @@ struct perf_event {
perf_overflow_handler_t overflow_handler;
void*overflow_handler_context;
 
+	atomic_t			*sample_disable;
+
 #ifdef CONFIG_EVENT_TRACING
struct trace_event_call *tp_event;
struct event_filter *filter;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace10..4ae82c9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -51,6 +51,9 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
array->elem_size = elem_size;
 
+   if (attr->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+		atomic_set(&array->map.perf_sample_disable, 1);
+
return >map;
 }
 
@@ -298,6 +301,8 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map 
*map, int fd)
perf_event_release_kernel(event);
return ERR_PTR(-EINVAL);
}
+
+	event->sample_disable = &map->perf_sample_disable;
return event;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b11756f..f6ef45c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
 		irq_work_queue(&event->pending);
}
 
+   if ((event->sample_disable) && atomic_read(event->sample_disable))
+   return ret;
+
if (event->overflow_handler)
event->overflow_handler(event, data, regs);
else
-- 
1.8.3.4



[RFC PATCH 2/2] bpf: Implement bpf_perf_event_sample_enable/disable() helpers

2015-10-12 Thread Kaixu Xia
The functions bpf_perf_event_sample_enable/disable() can set the
flag sample_disable to enable/disable output trace data on samples.

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 include/linux/bpf.h  |  2 ++
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/verifier.c|  4 +++-
 kernel/trace/bpf_trace.c | 34 ++
 4 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 25e073d..09148ff 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -192,6 +192,8 @@ extern const struct bpf_func_proto 
bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
 extern const struct bpf_func_proto bpf_perf_event_read_proto;
+extern const struct bpf_func_proto bpf_perf_event_sample_enable_proto;
+extern const struct bpf_func_proto bpf_perf_event_sample_disable_proto;
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 92a48e2..5229c550 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -272,6 +272,8 @@ enum bpf_func_id {
BPF_FUNC_skb_get_tunnel_key,
BPF_FUNC_skb_set_tunnel_key,
 	BPF_FUNC_perf_event_read,	/* u64 bpf_perf_event_read(&map, index) */
+	BPF_FUNC_perf_event_sample_enable,	/* u64 bpf_perf_event_enable() */
+	BPF_FUNC_perf_event_sample_disable,	/* u64 bpf_perf_event_disable() */
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b074b23..6428daf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -244,6 +244,8 @@ static const struct {
 } func_limit[] = {
{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+   {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_sample_enable},
+   {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_sample_disable},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -860,7 +862,7 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
 * don't allow any other map type to be passed into
 * the special func;
 */
-   if (bool_map != bool_func)
+   if (bool_func && bool_map != bool_func)
return -EINVAL;
}
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7..abe943a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -215,6 +215,36 @@ const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type  = ARG_ANYTHING,
 };
 
+static u64 bpf_perf_event_sample_enable(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+
+	atomic_set(&map->perf_sample_disable, 0);
+   return 0;
+}
+
+const struct bpf_func_proto bpf_perf_event_sample_enable_proto = {
+   .func   = bpf_perf_event_sample_enable,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_CONST_MAP_PTR,
+};
+
+static u64 bpf_perf_event_sample_disable(u64 r1, u64 r2, u64 r3, u64 r4, u64 
r5)
+{
+   struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+
+	atomic_set(&map->perf_sample_disable, 1);
+   return 0;
+}
+
+const struct bpf_func_proto bpf_perf_event_sample_disable_proto = {
+   .func   = bpf_perf_event_sample_disable,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_CONST_MAP_PTR,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id 
func_id)
 {
switch (func_id) {
@@ -242,6 +272,10 @@ static const struct bpf_func_proto 
*kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	case BPF_FUNC_perf_event_sample_enable:
+		return &bpf_perf_event_sample_enable_proto;
+	case BPF_FUNC_perf_event_sample_disable:
+		return &bpf_perf_event_sample_disable_proto;
default:
return NULL;
}
-- 
1.8.3.4



[PATCH net-next] bpf: fix build warnings and add function read_trace_pipe()

2015-08-12 Thread Kaixu Xia
There are two improvements in this patch:
 1. Fix the build warnings;
 2. Add the function read_trace_pipe() to print the result on
the screen;

Before this patch, we could only get the result through
/sys/kernel/debug/tracing/trace_pipe and saw nothing on the screen.
By applying this patch, the result is printed on the screen.
  $ ./tracex6
...
 tracex6-705   [003] d..1   131.428593: : CPU-3   19981414
sshd-683   [000] d..1   131.428727: : CPU-0   221682321
sshd-683   [000] d..1   131.428821: : CPU-0   221808766
sshd-683   [000] d..1   131.428950: : CPU-0   221982984
sshd-683   [000] d..1   131.429045: : CPU-0   222111851
 tracex6-705   [003] d..1   131.429168: : CPU-3   20757551
sshd-683   [000] d..1   131.429170: : CPU-0   81240
sshd-683   [000] d..1   131.429261: : CPU-0   222403340
sshd-683   [000] d..1   131.429378: : CPU-0   222561024
...
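
For reference, a minimal sketch of what a read_trace_pipe() style loop does
(the real helper lives in the samples' bpf_load.c; the debugfs path below is
the usual mount point and is an assumption about the local setup):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	static void read_trace_pipe(void)
	{
		int fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY);
		char buf[4096];
		ssize_t n;

		if (fd < 0)
			return;
		/* trace_pipe blocks until new trace data arrives; echo it to stdout */
		while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}
		close(fd);
	}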

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 samples/bpf/tracex6_kern.c |  1 +
 samples/bpf/tracex6_user.c | 22 +-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
index 23d1cff..be479c4 100644
--- a/samples/bpf/tracex6_kern.c
+++ b/samples/bpf/tracex6_kern.c
@@ -1,3 +1,4 @@
+#include <linux/ptrace.h>
 #include <linux/version.h>
 #include <uapi/linux/bpf.h>
 #include "bpf_helpers.h"
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
index 928f05e..8ea4976 100644
--- a/samples/bpf/tracex6_user.c
+++ b/samples/bpf/tracex6_user.c
@@ -17,8 +17,7 @@ static void test_bpf_perf_event(void)
 {
int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
int *pmu_fd = malloc(nr_cpus * sizeof(int));
-   unsigned long value;
-   int i;
+   int status, i;
 
struct perf_event_attr attr_insn_pmu = {
.freq = 0,
@@ -32,22 +31,26 @@ static void test_bpf_perf_event(void)
 
 	for (i = 0; i < nr_cpus; i++) {
 		pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
-		if (pmu_fd[i] < 0)
+		if (pmu_fd[i] < 0) {
 			printf("event syscall failed\n");
+			goto exit;
+		}
 
 		bpf_update_elem(map_fd[0], &i, &pmu_fd[i], BPF_ANY);
ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
}
 
-	system("ls");
-	system("pwd");
-	system("sleep 2");
+	status = system("ls > /dev/null");
+	if (status)
+		goto exit;
+	status = system("sleep 2");
+	if (status)
+		goto exit;
 
+exit:
 	for (i = 0; i < nr_cpus; i++)
 		close(pmu_fd[i]);
-
-	close(map_fd);
-
+	close(map_fd[0]);
 	free(pmu_fd);
 }
 
@@ -63,6 +66,7 @@ int main(int argc, char **argv)
}
 
test_bpf_perf_event();
+   read_trace_pipe();
 
return 0;
 }
-- 
1.8.3.4



[PATCH v2 net-next] bpf: s390: Fix build error caused by the struct bpf_array member name changed

2015-08-11 Thread Kaixu Xia
There is a build error that 'struct bpf_array' has no member
named 'prog' on s390. In commit 2a36f0b, the member 'prog' of
struct bpf_array is replaced by 'ptrs'. So this patch fixes it.

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 arch/s390/net/bpf_jit_comp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 9f4bbc0..eeda051 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1032,7 +1032,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
  MAX_TAIL_CALL_CNT, 0, 0x2);
 
/*
-	 * prog = array->prog[index];
+	 * prog = array->ptrs[index];
 * if (prog == NULL)
 * goto out;
 */
@@ -1041,7 +1041,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
EMIT6_DISP_LH(0xeb00, 0x000d, REG_1, BPF_REG_3, REG_0, 3);
/* lg %r1,prog(%b2,%r1) */
EMIT6_DISP_LH(0xe300, 0x0004, REG_1, BPF_REG_2,
- REG_1, offsetof(struct bpf_array, prog));
+ REG_1, offsetof(struct bpf_array, ptrs));
/* clgij %r1,0,0x8,label0 */
EMIT6_PCREL_IMM_LABEL(0xec00, 0x007d, REG_1, 0, 0, 0x8);
 
-- 
1.8.3.4



[PATCH v3 net-next] bpf: s390: Fix build error caused by the struct bpf_array member name changed

2015-08-11 Thread Kaixu Xia
There is a build error that 'struct bpf_array' has no member
named 'prog' on s390. In commit 2a36f0b92eb6 (bpf: Make the
bpf_prog_array_map more generic), the member 'prog' of struct
bpf_array is replaced by 'ptrs'. So this patch fixes it.

Fixes: 2a36f0b92eb6 (bpf: Make the bpf_prog_array_map more generic)
Reported-by: Wu Fengguang fengguang...@intel.com
Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 arch/s390/net/bpf_jit_comp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 9f4bbc0..eeda051 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1032,7 +1032,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
  MAX_TAIL_CALL_CNT, 0, 0x2);
 
/*
-	 * prog = array->prog[index];
+	 * prog = array->ptrs[index];
 * if (prog == NULL)
 * goto out;
 */
@@ -1041,7 +1041,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
EMIT6_DISP_LH(0xeb00, 0x000d, REG_1, BPF_REG_3, REG_0, 3);
/* lg %r1,prog(%b2,%r1) */
EMIT6_DISP_LH(0xe300, 0x0004, REG_1, BPF_REG_2,
- REG_1, offsetof(struct bpf_array, prog));
+ REG_1, offsetof(struct bpf_array, ptrs));
/* clgij %r1,0,0x8,label0 */
EMIT6_PCREL_IMM_LABEL(0xec00, 0x007d, REG_1, 0, 0, 0x8);
 
-- 
1.8.3.4



[PATCH net-next] bpf: fix the bug 'struct bpf_array' has no member named 'prog' in s390 architecture

2015-08-10 Thread Kaixu Xia
The kbuild test robot sent me an email about a build error:
'struct bpf_array' has no member named 'prog' on the s390
architecture. This error is caused by commit
2a36f0b92eb638dd023870574eb471b1c56be9ad ("bpf: Make the
bpf_prog_array_map more generic"). In that patch, the member 'prog'
of struct bpf_array was replaced by 'ptrs'. So this patch fixes it.
---
 arch/s390/net/bpf_jit_comp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 9f4bbc0..eeda051 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1032,7 +1032,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
  MAX_TAIL_CALL_CNT, 0, 0x2);
 
/*
-	 * prog = array->prog[index];
+	 * prog = array->ptrs[index];
 * if (prog == NULL)
 * goto out;
 */
@@ -1041,7 +1041,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
EMIT6_DISP_LH(0xeb00, 0x000d, REG_1, BPF_REG_3, REG_0, 3);
/* lg %r1,prog(%b2,%r1) */
EMIT6_DISP_LH(0xe300, 0x0004, REG_1, BPF_REG_2,
- REG_1, offsetof(struct bpf_array, prog));
+ REG_1, offsetof(struct bpf_array, ptrs));
/* clgij %r1,0,0x8,label0 */
EMIT6_PCREL_IMM_LABEL(0xec00, 0x007d, REG_1, 0, 0, 0x8);
 
-- 
1.8.3.4



[PATCH v7 2/5] bpf: Make the bpf_prog_array_map more generic

2015-08-06 Thread Kaixu Xia
From: Wang Nan wangn...@huawei.com

All the map backends are of a generic nature. In order to avoid
adding much special code into the eBPF core, rewrite part of the
bpf_prog_array map code and make it more generic, so that the new
perf_event_array map type can reuse most of the bpf_prog_array map
code and only needs a few lines of special code.
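
As a schematic illustration (not the exact kernel code), a new fd-backed
map type then only has to supply the two fd hooks and can share everything
else with the generic fd array code; the hook names below follow the
perf_event_array map added later in this series:

	static const struct bpf_map_ops perf_event_array_ops = {
		.map_alloc = fd_array_map_alloc,
		.map_free = fd_array_map_free,
		.map_get_next_key = array_map_get_next_key,
		.map_lookup_elem = fd_array_map_lookup_elem,
		.map_update_elem = fd_array_map_update_elem,
		.map_delete_elem = fd_array_map_delete_elem,
		/* translate a perf event fd into a struct perf_event pointer and back */
		.map_fd_get_ptr = perf_event_fd_array_get_ptr,
		.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	};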

Signed-off-by: Wang Nan wangn...@huawei.com
Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 arch/x86/net/bpf_jit_comp.c |  6 ++--
 include/linux/bpf.h |  8 +++--
 kernel/bpf/arraymap.c   | 80 +++--
 kernel/bpf/core.c   |  2 +-
 kernel/bpf/syscall.c|  2 +-
 5 files changed, 60 insertions(+), 38 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ec5214f..70efcd0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
  * goto out;
  *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
  *     goto out;
- *   prog = array->prog[index];
+ *   prog = array->ptrs[index];
  *   if (prog == NULL)
  *     goto out;
  *   goto *(prog->bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
EMIT3(0x83, 0xC0, 0x01);  /* add eax, 1 */
EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], 
eax */
 
-	/* prog = array->prog[index]; */
+	/* prog = array->ptrs[index]; */
EMIT4_off32(0x48, 0x8D, 0x84, 0xD6,   /* lea rax, [rsi + rdx * 8 + 
offsetof(...)] */
-   offsetof(struct bpf_array, prog));
+   offsetof(struct bpf_array, ptrs));
EMIT3(0x48, 0x8B, 0x00);  /* mov rax, qword ptr [rax] */
 
/* if (prog == NULL)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 139d6d2..d495211 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -24,6 +24,10 @@ struct bpf_map_ops {
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 
flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
+
+   /* funcs called by prog_array and perf_event_array map */
+   void *(*map_fd_get_ptr) (struct bpf_map *map, int fd);
+   void (*map_fd_put_ptr) (void *ptr);
 };
 
 struct bpf_map {
@@ -142,13 +146,13 @@ struct bpf_array {
bool owner_jited;
union {
char value[0] __aligned(8);
-   struct bpf_prog *prog[0] __aligned(8);
+   void *ptrs[0] __aligned(8);
};
 };
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
-void bpf_prog_array_map_clear(struct bpf_map *map);
+void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog 
*fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..45df657 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
 }
 late_initcall(register_array_map);
 
-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
 {
-   /* only bpf_prog file descriptors can be stored in prog_array map */
+   /* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
return array_map_alloc(attr);
 }
 
-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
 {
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
 
/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
-   BUG_ON(array->prog[i] != NULL);
+   BUG_ON(array->ptrs[i] != NULL);
kvfree(array);
 }
 
-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
return NULL;
 }
 
 /* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+   void *value, u64 map_flags)
 {
struct bpf_array *array = container_of(map, struct bpf_array, map);
-   struct bpf_prog *prog, *old_prog;
+   void *new_ptr, *old_ptr;
u32 index = *(u32 *)key, ufd;
 
if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map 
*map, void *key,
return -E2BIG;
 
ufd = *(u32 *)value;
-   prog

[PATCH v7 5/5] samples/bpf: example of get selected PMU counter value

2015-08-06 Thread Kaixu Xia
This is a simple example that shows how to use the new ability
to get the selected Hardware PMU counter value.

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 samples/bpf/Makefile   |  4 +++
 samples/bpf/bpf_helpers.h  |  2 ++
 samples/bpf/tracex6_kern.c | 26 ++
 samples/bpf/tracex6_user.c | 68 ++
 4 files changed, 100 insertions(+)
 create mode 100644 samples/bpf/tracex6_kern.c
 create mode 100644 samples/bpf/tracex6_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4450fed..63e7d50 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex2
 hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
+hostprogs-y += tracex6
 hostprogs-y += lathist
 
 test_verifier-objs := test_verifier.o libbpf.o
@@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
+tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
 lathist-objs := bpf_load.o libbpf.o lathist_user.o
 
 # Tell kbuild to always build the programs
@@ -37,6 +39,7 @@ always += tracex2_kern.o
 always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
+always += tracex6_kern.o
 always += tcbpf1_kern.o
 always += lathist_kern.o
 
@@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
+HOSTLOADLIBES_tracex6 += -lelf
 HOSTLOADLIBES_lathist += -lelf
 
 # point this to your LLVM backend with bpf support
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index c77c872..3a44d3a 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
(void *) BPF_FUNC_get_current_uid_gid;
 static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
+static int (*bpf_perf_event_read)(void *map, int index) =
+   (void *) BPF_FUNC_perf_event_read;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
new file mode 100644
index 000..23d1cff
--- /dev/null
+++ b/samples/bpf/tracex6_kern.c
@@ -0,0 +1,26 @@
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+   .key_size = sizeof(int),
+   .value_size = sizeof(u32),
+   .max_entries = 32,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   u64 count;
+   u32 key = bpf_get_smp_processor_id();
+   char fmt[] = "CPU-%d   %llu\n";
+
+   count = bpf_perf_event_read(&my_map, key);
+   bpf_trace_printk(fmt, sizeof(fmt), key, count);
+
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 000..928f05e
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define SAMPLE_PERIOD  0x7fffULL
+
+static void test_bpf_perf_event(void)
+{
+   int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+   int *pmu_fd = malloc(nr_cpus * sizeof(int));
+   unsigned long value;
+   int i;
+
+   struct perf_event_attr attr_insn_pmu = {
+   .freq = 0,
+   .sample_period = SAMPLE_PERIOD,
+   .inherit = 0,
+   .type = PERF_TYPE_HARDWARE,
+   .read_format = 0,
+   .sample_type = 0,
+   .config = 0,/* PMU: cycles */
+   };
+
+   for (i = 0; i < nr_cpus; i++) {
+   pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
+   if (pmu_fd[i] < 0)
+   printf("event syscall failed\n");
+
+   bpf_update_elem(map_fd[0], &i, &pmu_fd[i], BPF_ANY);
+   ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+   }
+
+   system("ls");
+   system("pwd");
+   system("sleep 2");
+
+   for (i = 0; i < nr_cpus; i++)
+   close(pmu_fd[i]);
+
+   close(map_fd);
+
+   free(pmu_fd);
+}
+
+int main(int argc, char **argv)
+{
+   char filename[256];
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   test_bpf_perf_event();
+
+   return 0;
+}
-- 
1.8.3.4

[PATCH v7 1/5] perf: add the necessary core perf APIs when accessing events counters in eBPF programs

2015-08-06 Thread Kaixu Xia
This patch adds three core perf APIs:
 - perf_event_attrs(): export the struct perf_event_attr from struct
   perf_event;
 - perf_event_get(): get the struct perf_event from the given fd;
 - perf_event_read_local(): read the counter of an event that is
   active on the current CPU;
These APIs are needed when accessing event counters in eBPF programs.

The API perf_event_read_local() comes from Peter and I add the
corresponding SOB. 
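
As a rough illustration (not part of this patch; the function name and the
simplified error handling are made up), a kernel-side consumer could combine
the three APIs like this: resolve an fd to an event, sanity-check its
attributes, then read the counter from the local context:

static u64 example_read_counter_from_fd(unsigned int fd)
{
	struct perf_event *event;
	const struct perf_event_attr *attr;
	u64 val = 0;

	event = perf_event_get(fd);	/* resolves the fd, takes a reference */
	if (IS_ERR(event))
		return 0;

	attr = perf_event_attrs(event);
	/* inherited events cannot be read from atomic context */
	if (!IS_ERR(attr) && !attr->inherit)
		val = perf_event_read_local(event); /* caller must be on the event's CPU/task */

	perf_event_release_kernel(event);	/* drop the reference again */
	return val;
}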

Signed-off-by: Kaixu Xia xiaka...@huawei.com
Signed-off-by: Peter Zijlstra a.p.zijls...@chello.nl
---
 include/linux/perf_event.h | 10 ++
 kernel/events/core.c   | 78 ++
 2 files changed, 88 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809..092a0e8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
+extern const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
@@ -659,6 +661,7 @@ perf_event_create_kernel_counter(struct perf_event_attr 
*attr,
void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
+extern u64 perf_event_read_local(struct perf_event *event);
 extern u64 perf_event_read_value(struct perf_event *event,
 u64 *enabled, u64 *running);
 
@@ -979,6 +982,12 @@ static inline int perf_event_init_task(struct task_struct 
*child)  { return 0; }
 static inline void perf_event_exit_task(struct task_struct *child) { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
+static inline struct perf_event *perf_event_get(unsigned int fd)   { 
return ERR_PTR(-EINVAL); }
+static inline const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event)
+{
+   return ERR_PTR(-EINVAL);
+}
+static inline u64 perf_event_read_local(struct perf_event *event)  { 
return -EINVAL; }
 static inline void perf_event_print_debug(void)
{ }
 static inline int perf_event_task_disable(void)
{ return -EINVAL; }
 static inline int perf_event_task_enable(void) { 
return -EINVAL; }
@@ -1011,6 +1020,7 @@ static inline void perf_event_enable(struct perf_event 
*event){ }
 static inline void perf_event_disable(struct perf_event *event)
{ }
 static inline int __perf_event_disable(void *info) { 
return -1; }
 static inline void perf_event_task_tick(void)  { }
+static inline int perf_event_release_kernel(struct perf_event *event)  { 
return 0; }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS)  defined(CONFIG_NO_HZ_FULL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae34..e2c6a88 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3212,6 +3212,59 @@ static inline u64 perf_event_count(struct perf_event 
*event)
return __perf_event_count(event);
 }
 
+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ *   - either for the current task, or for this CPU
+ *   - does not have inherit set, for inherited task events
+ * will not be local and we cannot read them atomically
+ *   - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+   unsigned long flags;
+   u64 val;
+
+   /*
+* Disabling interrupts avoids all counter scheduling (context
+* switches, timer based rotation and IPIs).
+*/
+   local_irq_save(flags);
+
+   /* If this is a per-task event, it must be for current */
+   WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+    event->hw.target != current);
+
+   /* If this is a per-CPU event, it must be for this CPU */
+   WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+    event->cpu != smp_processor_id());
+
+   /*
+* It must not be an event with inherit set, we cannot read
+* all child counters from atomic context.
+*/
+   WARN_ON_ONCE(event->attr.inherit);
+
+   /*
+* It must not have a pmu::count method, those are not
+* NMI safe.
+*/
+   WARN_ON_ONCE(event->pmu->count);
+
+   /*
+* If the event is currently on this CPU, it's either a per-task event,
+* or local to this CPU. Furthermore it means it's ACTIVE (otherwise

[PATCH v7 4/5] bpf: Implement function bpf_perf_event_read() that gets the selected hardware PMU counter

2015-08-06 Thread Kaixu Xia
Given the perf event map fd and an index, the function
bpf_perf_event_read() converts the corresponding map value to a
pointer to struct perf_event and returns the hardware PMU counter
value.
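
As a quick illustration of the calling convention from the program side (a
hypothetical fragment using the helper declarations from
samples/bpf/bpf_helpers.h, not something added by this patch):

struct bpf_map_def SEC("maps") counters = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(u32),
	.max_entries = 32,	/* one slot per possible CPU */
};

SEC("kprobe/sys_write")
int bpf_prog(struct pt_regs *ctx)
{
	u32 key = bpf_get_smp_processor_id();
	u64 count;

	/* 'key' indexes the map; the value stored there was a perf event
	 * fd, already converted by the kernel to a struct perf_event
	 * pointer at map update time.
	 */
	count = bpf_perf_event_read(&counters, key);

	/* count may also encode -E2BIG (bad index) or -ENOENT (empty
	 * slot); as noted in the helper, the program has to recognise
	 * those values on its own.
	 */
	return 0;
}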

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 include/linux/bpf.h  |  1 +
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/verifier.c| 48 +---
 kernel/trace/bpf_trace.c | 31 +++
 4 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4fc1f40..f57d7fe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,6 +190,7 @@ extern const struct bpf_func_proto 
bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
+extern const struct bpf_func_proto bpf_perf_event_read_proto;
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a1814e8..92a48e2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -271,6 +271,7 @@ enum bpf_func_id {
 */
BPF_FUNC_skb_get_tunnel_key,
BPF_FUNC_skb_set_tunnel_key,
+   BPF_FUNC_perf_event_read,   /* u64 bpf_perf_event_read(&map, index) */
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cd307df..48e1c71 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = imm,
 };
 
+static const struct {
+   int map_type;
+   int func_id;
+} func_limit[] = {
+   {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+   {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
 static void print_verifier_state(struct verifier_env *env)
 {
enum bpf_reg_type t;
@@ -837,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 
regno,
return err;
 }
 
+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+   bool bool_map, bool_func;
+   int i;
+
+   if (!map)
+   return 0;
+
+   for (i = 0; i <= ARRAY_SIZE(func_limit); i++) {
+   bool_map = (map->map_type == func_limit[i].map_type);
+   bool_func = (func_id == func_limit[i].func_id);
+   /* only when map & func pair match it can continue.
+* don't allow any other map type to be passed into
+* the special func;
+*/
+   if (bool_map != bool_func)
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
struct verifier_state *state = env-cur_state;
@@ -912,21 +942,9 @@ static int check_call(struct verifier_env *env, int 
func_id)
return -EINVAL;
}
 
-   if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
-   func_id != BPF_FUNC_tail_call)
-   /* prog_array map type needs extra care:
-* only allow to pass it into bpf_tail_call() for now.
-* bpf_map_delete_elem() can be allowed in the future,
-* while bpf_map_update_elem() must only be done via syscall
-*/
-   return -EINVAL;
-
-   if (func_id == BPF_FUNC_tail_call &&
-   map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
-   /* don't allow any other map type to be passed into
-* bpf_tail_call()
-*/
-   return -EINVAL;
+   err = check_map_func_compatibility(map, func_id);
+   if (err)
+   return err;
 
return 0;
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041a..ef9936d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -158,6 +158,35 @@ const struct bpf_func_proto 
*bpf_get_trace_printk_proto(void)
return bpf_trace_printk_proto;
 }
 
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+   struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+   struct perf_event *event;
+
+   if (unlikely(index >= array->map.max_entries))
+   return -E2BIG;
+
+   event = (struct perf_event *)array->ptrs[index];
+   if (!event)
+   return -ENOENT;
+
+   /*
+* we don't know if the function is run successfully by the
+* return value. It can be judged in other places, such as
+* eBPF programs.
+*/
+   return perf_event_read_local(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+   .func   = bpf_perf_event_read

[PATCH v7 0/5] bpf: Introduce the new ability of eBPF programs to access hardware PMU counter

2015-08-06 Thread Kaixu Xia
This patchset is based on the net-next tree:
 git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
commit 9dc20a649609c95ce7c5ac4282656ba627b67d49.

Previous patch v6 url:
https://lkml.org/lkml/2015/8/4/188

changes in V7:
 - rebase the whole patch set to net-next tree(9dc20a64);
 - split out the core perf APIs into Patch 1/5;
 - change the return value of function perf_event_attrs()
   from struct perf_event * to const struct perf_event * in
   Patch 1/5;
 - rename the function perf_event_read_internal() to perf_event_
   read_local() and rewrite it in Patch 1/5;
 - rename the function check_func_limit() to check_map_func
   _compatibility() and remove the unnecessary pass pointer to
   a pointer in Patch 4/5;

changes in V6: 
 - make the Patch 1/4 commit message more meaningful and readable;
 - remove the unnecessary comment in Patch 2/4 and make it clean;
 - declare the function perf_event_release_kernel() in include/
   linux/perf_event.h to fix the build error when CONFIG_PERF_EVENTS
   isn't configured in Patch 2/4;
 - add function perf_event_attrs() to get the struct perf_event_attr
   in Patch 2/4. 
 - move the related code from kernel/trace/bpf_trace.c to kernel/
   events/core.c and add function perf_event_read_internal() to
   avoid poking inside of the event outside of perf code in Patch 3/4;
 - generalize the func & map match-pair with an array in Patch 3/4;

changes in V5: 
 - move struct fd_array_map_ops* fd_ops to bpf_map;
 - move array perf event decrement refcnt function to
   map_free;
 - fix the NULL ptr of perf_event_get();
 - move bpf_perf_event_read() to kernel/bpf/bpf_trace.c;
 - get rid of the remaining struct bpf_prog;
 - remove the unnecessary cast on void *;

changes in V4: 
 - make the bpf_prog_array_map more generic;
 - fix the bug of event refcnt leak;
 - use more useful errno in bpf_perf_event_read();

changes in V3: 
 - collapse V2 patches 1-3 into one;
 - drop the function map-ops-map_traverse_elem() and release
   the struct perf_event in map_free;
 - only allow to access bpf_perf_event_read() from programs;
 - update the perf_event_array_map elem via xchg();
 - pass index directly to bpf_perf_event_read() instead of
   MAP_KEY;

changes in V2:
 - put atomic_long_inc_not_zero() between fdget() and fdput();
 - limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE;
 - Only read the event counter on current CPU or on current
   process;
 - add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the
   pointer to the struct perf_event;
 - according to the perf_event_map_fd and key, the function
   bpf_perf_event_read() can get the Hardware PMU counter value;

Patch 5/5 is a simple example that shows how to use this new eBPF
program ability. The PMU counter data can be found in
/sys/kernel/debug/tracing/trace (or trace_pipe); the output below shows
the cycles PMU counter value sampled on 'kprobe/sys_write'.

  $ cat /sys/kernel/debug/tracing/trace_pipe
  $ ./tracex6
   ...
   syslog-ng-548   [000] d..176.905673: : CPU-0   681765271
   syslog-ng-548   [000] d..176.905690: : CPU-0   681787855
   syslog-ng-548   [000] d..176.905707: : CPU-0   681810504
   syslog-ng-548   [000] d..176.905725: : CPU-0   681834771
   syslog-ng-548   [000] d..176.905745: : CPU-0   681859519
   syslog-ng-548   [000] d..176.905766: : CPU-0   681890419
   syslog-ng-548   [000] d..176.905783: : CPU-0   681914045
   syslog-ng-548   [000] d..176.905800: : CPU-0   681935950
   syslog-ng-548   [000] d..176.905816: : CPU-0   681958299
  ls-690   [005] d..182.241308: : CPU-5   3138451
  sh-691   [004] d..182.244570: : CPU-4   7324988
   ...-699   [007] d..199.961387: : CPU-7   3194027
   ...-695   [003] d..199.961474: : CPU-3   288901
   ...-695   [003] d..199.961541: : CPU-3   383145
   ...-695   [003] d..199.961591: : CPU-3   450365
   ...-695   [003] d..199.961639: : CPU-3   515751
   ...-695   [003] d..199.961686: : CPU-3   579047
   ...

The details of the patches are as follows:

Patch 1/5 adds the necessary core perf APIs perf_event_attrs(),
perf_event_get() and perf_event_read_local() needed when accessing
event counters in eBPF programs;

Patch 2/5 rewrites part of the bpf_prog_array map code and makes it
more generic;

Patch 3/5 introduces a new bpf map type. This map only stores the
pointer to struct perf_event;

Patch 4/5 implements function bpf_perf_event_read() that gets the
selected hardware PMU counter;

Patch 5/5 gives a simple example.

Kaixu Xia (4):
  perf: add the necessary core perf APIs when accessing events counters
in eBPF programs
  bpf: Add new bpf map type to store the pointer to struct perf_event
  bpf: Implement function bpf_perf_event_read() that gets the selected
hardware PMU counter
  samples/bpf: example of get selected PMU counter value

Wang Nan (1):
  bpf: Make the bpf_prog_array_map more generic

 arch/x86/net/bpf_jit_comp.c |   6

[PATCH v7 3/5] bpf: Add new bpf map type to store the pointer to struct perf_event

2015-08-06 Thread Kaixu Xia
Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map only stores the pointer to struct perf_event. The
user space event FDs from the perf_event_open() syscall are
converted to pointers to struct perf_event and stored in the map.
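
From user space the flow is roughly the following (a sketch, not part of the
patch; it assumes the perf_event_open() and bpf_update_elem() wrappers used
by the programs under samples/bpf, and the function name is made up):

/* Wire one hardware cycles counter on 'cpu' into slot 'key' of a
 * BPF_MAP_TYPE_PERF_EVENT_ARRAY map; the kernel converts the fd into a
 * struct perf_event pointer when the element is updated.
 */
static int attach_cycles_counter(int map_fd, int cpu, int key)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
	};
	int fd = perf_event_open(&attr, -1 /* any pid */, cpu, -1 /* no group */, 0);

	if (fd < 0)
		return -1;

	bpf_update_elem(map_fd, &key, &fd, BPF_ANY);	/* fd -> perf_event in the kernel */
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);		/* start counting */
	return fd;
}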

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 include/linux/bpf.h  |  1 +
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/arraymap.c| 57 
 3 files changed, 59 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d495211..4fc1f40 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
 #include uapi/linux/bpf.h
 #include linux/workqueue.h
 #include linux/file.h
+#include linux/perf_event.h
 
 struct bpf_map;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2ce13c1..a1814e8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
+   BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 45df657..29ace10 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -273,3 +273,60 @@ static int __init register_prog_array_map(void)
return 0;
 }
 late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+   bpf_fd_array_map_clear(map);
+   fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+   struct perf_event *event;
+   const struct perf_event_attr *attr;
+
+   event = perf_event_get(fd);
+   if (IS_ERR(event))
+   return event;
+
+   attr = perf_event_attrs(event);
+   if (IS_ERR(attr))
+   return (void *)attr;
+
+   if (attr->type != PERF_TYPE_RAW &&
+   attr->type != PERF_TYPE_HARDWARE) {
+   perf_event_release_kernel(event);
+   return ERR_PTR(-EINVAL);
+   }
+   return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+   struct perf_event *event = ptr;
+
+   perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+   .map_alloc = fd_array_map_alloc,
+   .map_free = perf_event_array_map_free,
+   .map_get_next_key = array_map_get_next_key,
+   .map_lookup_elem = fd_array_map_lookup_elem,
+   .map_update_elem = fd_array_map_update_elem,
+   .map_delete_elem = fd_array_map_delete_elem,
+   .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+   .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+   .ops = perf_event_array_ops,
+   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+   bpf_register_map_type(perf_event_array_type);
+   return 0;
+}
+late_initcall(register_perf_event_array_map);
-- 
1.8.3.4



[PATCH v6 2/4] bpf: Add new bpf map type to store the pointer to struct perf_event

2015-08-04 Thread Kaixu Xia
Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map only stores the pointer to struct perf_event. The
user space event FDs from perf_event_open() syscall are converted
to the pointer to struct perf_event and stored in map.

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 include/linux/bpf.h|  1 +
 include/linux/perf_event.h |  8 +++
 include/uapi/linux/bpf.h   |  1 +
 kernel/bpf/arraymap.c  | 57 ++
 kernel/events/core.c   | 25 
 5 files changed, 92 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a8ce262..d0b394a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
 #include uapi/linux/bpf.h
 #include linux/workqueue.h
 #include linux/file.h
+#include linux/perf_event.h
 
 struct bpf_map;
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809..81fc99e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
+extern struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
@@ -979,6 +981,11 @@ static inline int perf_event_init_task(struct task_struct 
*child)  { return 0; }
 static inline void perf_event_exit_task(struct task_struct *child) { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
+static inline struct perf_event *perf_event_get(unsigned int fd)   { 
return ERR_PTR(-EINVAL); }
+static inline struct perf_event_attr *perf_event_attrs(struct perf_event 
*event)
+{
+   return ERR_PTR(-EINVAL);
+}
 static inline void perf_event_print_debug(void)
{ }
 static inline int perf_event_task_disable(void)
{ return -EINVAL; }
 static inline int perf_event_task_enable(void) { 
return -EINVAL; }
@@ -1011,6 +1018,7 @@ static inline void perf_event_enable(struct perf_event 
*event){ }
 static inline void perf_event_disable(struct perf_event *event)
{ }
 static inline int __perf_event_disable(void *info) { 
return -1; }
 static inline void perf_event_task_tick(void)  { }
+static inline int perf_event_release_kernel(struct perf_event *event)  { 
return 0; }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS)  defined(CONFIG_NO_HZ_FULL)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f9..69a1f6b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
+   BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 45df657..b1e98ff 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -273,3 +273,60 @@ static int __init register_prog_array_map(void)
return 0;
 }
 late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+   bpf_fd_array_map_clear(map);
+   fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+   struct perf_event *event;
+   struct perf_event_attr *attr;
+
+   event = perf_event_get(fd);
+   if (IS_ERR(event))
+   return event;
+
+   attr = perf_event_attrs(event);
+   if (IS_ERR(attr))
+   return attr;
+
+   if (attr->type != PERF_TYPE_RAW &&
+   attr->type != PERF_TYPE_HARDWARE) {
+   perf_event_release_kernel(event);
+   return ERR_PTR(-EINVAL);
+   }
+   return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+   struct perf_event *event = ptr;
+
+   perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+   .map_alloc = fd_array_map_alloc,
+   .map_free = perf_event_array_map_free,
+   .map_get_next_key = array_map_get_next_key,
+   .map_lookup_elem = fd_array_map_lookup_elem,
+   .map_update_elem = fd_array_map_update_elem,
+   .map_delete_elem = fd_array_map_delete_elem,
+   .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+   .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+   .ops = perf_event_array_ops,
+   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY

[PATCH v6 3/4] bpf: Implement function bpf_perf_event_read() that gets the selected hardware PMU counter

2015-08-04 Thread Kaixu Xia
According to the perf_event_map_fd and index, the function
bpf_perf_event_read() can convert the corresponding map
value to the pointer to struct perf_event and return the
Hardware PMU counter value.

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 include/linux/bpf.h|  1 +
 include/linux/perf_event.h |  2 ++
 include/uapi/linux/bpf.h   |  1 +
 kernel/bpf/verifier.c  | 49 --
 kernel/events/core.c   | 19 ++
 kernel/trace/bpf_trace.c   | 31 +
 6 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0b394a..db9f781 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,6 +190,7 @@ extern const struct bpf_func_proto 
bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
+extern const struct bpf_func_proto bpf_perf_event_read_proto;
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 81fc99e..6f1e448 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -643,6 +643,7 @@ extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
 extern struct perf_event *perf_event_get(unsigned int fd);
 extern struct perf_event_attr *perf_event_attrs(struct perf_event *event);
+extern u64 perf_event_read_internal(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
@@ -986,6 +987,7 @@ static inline struct perf_event_attr 
*perf_event_attrs(struct perf_event *event)
 {
return ERR_PTR(-EINVAL);
 }
+static inline u64 perf_event_read_internal(struct perf_event *event)   { 
return -EINVAL; }
 static inline void perf_event_print_debug(void)
{ }
 static inline int perf_event_task_disable(void)
{ return -EINVAL; }
 static inline int perf_event_task_enable(void) { 
return -EINVAL; }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69a1f6b..b9b13ce 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -250,6 +250,7 @@ enum bpf_func_id {
 * Return: 0 on success
 */
BPF_FUNC_get_current_comm,
+   BPF_FUNC_perf_event_read,   /* u64 bpf_perf_event_read(map, index) 
*/
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866..45fae14 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = imm,
 };
 
+static const struct {
+   int map_type;
+   int func_id;
+} func_limit[] = {
+   {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+   {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
 static void print_verifier_state(struct verifier_env *env)
 {
enum bpf_reg_type t;
@@ -833,6 +841,29 @@ static int check_func_arg(struct verifier_env *env, u32 
regno,
return err;
 }
 
+static int check_func_limit(struct bpf_map **mapp, int func_id)
+{
+   struct bpf_map *map = *mapp;
+   bool bool_map, bool_func;
+   int i;
+
+   if (!map)
+   return 0;
+
+   for (i = 0; i <= ARRAY_SIZE(func_limit); i++) {
+   bool_map = (map->map_type == func_limit[i].map_type);
+   bool_func = (func_id == func_limit[i].func_id);
+   /* only when map & func pair match it can continue.
+* don't allow any other map type to be passed into
+* the special func;
+*/
+   if (bool_map != bool_func)
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
struct verifier_state *state = env-cur_state;
@@ -908,21 +939,9 @@ static int check_call(struct verifier_env *env, int 
func_id)
return -EINVAL;
}
 
-   if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
-   func_id != BPF_FUNC_tail_call)
-   /* prog_array map type needs extra care:
-* only allow to pass it into bpf_tail_call() for now.
-* bpf_map_delete_elem() can be allowed in the future,
-* while bpf_map_update_elem() must only be done via syscall
-*/
-   return -EINVAL;
-
-   if (func_id == BPF_FUNC_tail_call &&
-   map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
-   /* don't allow any other map type to be passed into
-* bpf_tail_call

[PATCH v6 0/4] bpf: Introduce the new ability of eBPF programs to access hardware PMU counter

2015-08-04 Thread Kaixu Xia
Previous patch v5 url:
https://lkml.org/lkml/2015/7/31/299

changes in V6: 
 - make the Patch 1/4 commit message more meaningful and readable;
 - remove the unnecessary comment in Patch 2/4 and make it clean;
 - declare the function perf_event_release_kernel() in include/
   linux/perf_event.h to fix the build error when CONFIG_PERF_EVENTS
   isn't configured in Patch 2/4;
 - add function perf_event_attrs() to get the struct perf_event_attr
   in Patch 2/4. 
 - move the related code from kernel/trace/bpf_trace.c to kernel/
   events/core.c and add function perf_event_read_internal() to
   avoid poking inside of the event outside of perf code in Patch 3/4;
 - generalize the func & map match-pair with an array in Patch 3/4;

changes in V5: 
 - move struct fd_array_map_ops* fd_ops to bpf_map;
 - move array perf event decrement refcnt function to
   map_free;
 - fix the NULL ptr of perf_event_get();
 - move bpf_perf_event_read() to kernel/bpf/bpf_trace.c;
 - get rid of the remaining struct bpf_prog;
 - remove the unnecessary cast on void *;

changes in V4: 
 - make the bpf_prog_array_map more generic;
 - fix the bug of event refcnt leak;
 - use more useful errno in bpf_perf_event_read();

changes in V3: 
 - collapse V2 patches 1-3 into one;
 - drop the function map-ops-map_traverse_elem() and release
   the struct perf_event in map_free;
 - only allow to access bpf_perf_event_read() from programs;
 - update the perf_event_array_map elem via xchg();
 - pass index directly to bpf_perf_event_read() instead of
   MAP_KEY;

changes in V2:
 - put atomic_long_inc_not_zero() between fdget() and fdput();
 - limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE;
 - Only read the event counter on current CPU or on current
   process;
 - add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the
   pointer to the struct perf_event;
 - according to the perf_event_map_fd and key, the function
   bpf_perf_event_read() can get the Hardware PMU counter value;

Patch 4/4 is a simple example and shows how to use this new eBPF
programs ability. The PMU counter data can be found in
/sys/kernel/debug/tracing/trace(trace_pipe).(the cycles PMU
value when 'kprobe/sys_write' sampling)

  $ cat /sys/kernel/debug/tracing/trace_pipe
  $ ./tracex6
   ...
   syslog-ng-548   [000] d..176.905673: : CPU-0   681765271
   syslog-ng-548   [000] d..176.905690: : CPU-0   681787855
   syslog-ng-548   [000] d..176.905707: : CPU-0   681810504
   syslog-ng-548   [000] d..176.905725: : CPU-0   681834771
   syslog-ng-548   [000] d..176.905745: : CPU-0   681859519
   syslog-ng-548   [000] d..176.905766: : CPU-0   681890419
   syslog-ng-548   [000] d..176.905783: : CPU-0   681914045
   syslog-ng-548   [000] d..176.905800: : CPU-0   681935950
   syslog-ng-548   [000] d..176.905816: : CPU-0   681958299
  ls-690   [005] d..182.241308: : CPU-5   3138451
  sh-691   [004] d..182.244570: : CPU-4   7324988
   ...-699   [007] d..199.961387: : CPU-7   3194027
   ...-695   [003] d..199.961474: : CPU-3   288901
   ...-695   [003] d..199.961541: : CPU-3   383145
   ...-695   [003] d..199.961591: : CPU-3   450365
   ...-695   [003] d..199.961639: : CPU-3   515751
   ...-695   [003] d..199.961686: : CPU-3   579047
   ...

The details of the patches are as follows:

Patch 1/4 rewrites part of the bpf_prog_array map code and makes it
more generic;

Patch 2/4 introduces a new bpf map type. This map only stores the
pointer to struct perf_event;

Patch 3/4 implements function bpf_perf_event_read() that gets the
selected hardware PMU counter;

Patch 4/4 gives a simple example.

Kaixu Xia (3):
  bpf: Add new bpf map type to store the pointer to struct perf_event
  bpf: Implement function bpf_perf_event_read() that gets the selected
hardware PMU counter
  samples/bpf: example of get selected PMU counter value

Wang Nan (1):
  bpf: Make the bpf_prog_array_map more generic

 arch/x86/net/bpf_jit_comp.c |   6 +-
 include/linux/bpf.h |  10 +++-
 include/linux/perf_event.h  |  10 
 include/uapi/linux/bpf.h|   2 +
 kernel/bpf/arraymap.c   | 137 ++--
 kernel/bpf/core.c   |   2 +-
 kernel/bpf/syscall.c|   2 +-
 kernel/bpf/verifier.c   |  49 +++-
 kernel/events/core.c|  44 ++
 kernel/trace/bpf_trace.c|  31 ++
 samples/bpf/Makefile|   4 ++
 samples/bpf/bpf_helpers.h   |   2 +
 samples/bpf/tracex6_kern.c  |  26 +
 samples/bpf/tracex6_user.c  |  68 ++
 14 files changed, 340 insertions(+), 53 deletions(-)
 create mode 100644 samples/bpf/tracex6_kern.c
 create mode 100644 samples/bpf/tracex6_user.c

-- 
1.8.3.4


[PATCH v6 4/4] samples/bpf: example of get selected PMU counter value

2015-08-04 Thread Kaixu Xia
This is a simple example and shows how to use the new ability
to get the selected Hardware PMU counter value.

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 samples/bpf/Makefile   |  4 +++
 samples/bpf/bpf_helpers.h  |  2 ++
 samples/bpf/tracex6_kern.c | 26 ++
 samples/bpf/tracex6_user.c | 68 ++
 4 files changed, 100 insertions(+)
 create mode 100644 samples/bpf/tracex6_kern.c
 create mode 100644 samples/bpf/tracex6_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4450fed..63e7d50 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex2
 hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
+hostprogs-y += tracex6
 hostprogs-y += lathist
 
 test_verifier-objs := test_verifier.o libbpf.o
@@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
+tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
 lathist-objs := bpf_load.o libbpf.o lathist_user.o
 
 # Tell kbuild to always build the programs
@@ -37,6 +39,7 @@ always += tracex2_kern.o
 always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
+always += tracex6_kern.o
 always += tcbpf1_kern.o
 always += lathist_kern.o
 
@@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
+HOSTLOADLIBES_tracex6 += -lelf
 HOSTLOADLIBES_lathist += -lelf
 
 # point this to your LLVM backend with bpf support
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index bdf1c16..c8a3594 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
(void *) BPF_FUNC_get_current_uid_gid;
 static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
+static int (*bpf_perf_event_read)(void *map, int index) =
+   (void *) BPF_FUNC_perf_event_read;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
new file mode 100644
index 000..23d1cff
--- /dev/null
+++ b/samples/bpf/tracex6_kern.c
@@ -0,0 +1,26 @@
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+   .key_size = sizeof(int),
+   .value_size = sizeof(u32),
+   .max_entries = 32,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   u64 count;
+   u32 key = bpf_get_smp_processor_id();
+   char fmt[] = "CPU-%d   %llu\n";
+
+   count = bpf_perf_event_read(&my_map, key);
+   bpf_trace_printk(fmt, sizeof(fmt), key, count);
+
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 000..928f05e
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define SAMPLE_PERIOD  0x7fffULL
+
+static void test_bpf_perf_event(void)
+{
+   int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+   int *pmu_fd = malloc(nr_cpus * sizeof(int));
+   unsigned long value;
+   int i;
+
+   struct perf_event_attr attr_insn_pmu = {
+   .freq = 0,
+   .sample_period = SAMPLE_PERIOD,
+   .inherit = 0,
+   .type = PERF_TYPE_HARDWARE,
+   .read_format = 0,
+   .sample_type = 0,
+   .config = 0,/* PMU: cycles */
+   };
+
+   for (i = 0; i < nr_cpus; i++) {
+   pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
+   if (pmu_fd[i] < 0)
+   printf("event syscall failed\n");
+
+   bpf_update_elem(map_fd[0], &i, &pmu_fd[i], BPF_ANY);
+   ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+   }
+
+   system("ls");
+   system("pwd");
+   system("sleep 2");
+
+   for (i = 0; i < nr_cpus; i++)
+   close(pmu_fd[i]);
+
+   close(map_fd);
+
+   free(pmu_fd);
+}
+
+int main(int argc, char **argv)
+{
+   char filename[256];
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   test_bpf_perf_event();
+
+   return 0;
+}
-- 
1.8.3.4

[PATCH v6 1/4] bpf: Make the bpf_prog_array_map more generic

2015-08-04 Thread Kaixu Xia
From: Wang Nan wangn...@huawei.com

All the map backends are of generic nature. In order to avoid
adding much special code into the eBPF core, rewrite part of
the bpf_prog_array map code and make it more generic. So the
new perf_event_array map type can reuse most of code with
bpf_prog_array map and add fewer lines of special code.

Signed-off-by: Wang Nan wangn...@huawei.com
Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 arch/x86/net/bpf_jit_comp.c |  6 ++--
 include/linux/bpf.h |  8 +++--
 kernel/bpf/arraymap.c   | 80 +++--
 kernel/bpf/core.c   |  2 +-
 kernel/bpf/syscall.c|  2 +-
 5 files changed, 60 insertions(+), 38 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 579a8fd..e377f07 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
  * goto out;
  *   if (++tail_call_cnt  MAX_TAIL_CALL_CNT)
  * goto out;
- *   prog = array-prog[index];
+ *   prog = array-ptrs[index];
  *   if (prog == NULL)
  * goto out;
  *   goto *(prog-bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
EMIT3(0x83, 0xC0, 0x01);  /* add eax, 1 */
EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], 
eax */
 
-   /* prog = array-prog[index]; */
+   /* prog = array-ptrs[index]; */
EMIT4(0x48, 0x8D, 0x44, 0xD6);/* lea rax, [rsi + rdx * 8 + 
0x50] */
-   EMIT1(offsetof(struct bpf_array, prog));
+   EMIT1(offsetof(struct bpf_array, ptrs));
EMIT3(0x48, 0x8B, 0x00);  /* mov rax, qword ptr [rax] */
 
/* if (prog == NULL)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476..a8ce262 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -24,6 +24,10 @@ struct bpf_map_ops {
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 
flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
+
+   /* funcs called by prog_array and perf_event_array map */
+   void *(*map_fd_get_ptr) (struct bpf_map *map, int fd);
+   void (*map_fd_put_ptr) (void *ptr);
 };
 
 struct bpf_map {
@@ -142,13 +146,13 @@ struct bpf_array {
bool owner_jited;
union {
char value[0] __aligned(8);
-   struct bpf_prog *prog[0] __aligned(8);
+   void *ptrs[0] __aligned(8);
};
 };
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
-void bpf_prog_array_map_clear(struct bpf_map *map);
+void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog 
*fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..45df657 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
 }
 late_initcall(register_array_map);
 
-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
 {
-   /* only bpf_prog file descriptors can be stored in prog_array map */
+   /* only file descriptors can be stored in this type of map */
if (attr-value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
return array_map_alloc(attr);
 }
 
-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
 {
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
 
/* make sure it's empty */
for (i = 0; i  array-map.max_entries; i++)
-   BUG_ON(array-prog[i] != NULL);
+   BUG_ON(array-ptrs[i] != NULL);
kvfree(array);
 }
 
-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
return NULL;
 }
 
 /* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+   void *value, u64 map_flags)
 {
struct bpf_array *array = container_of(map, struct bpf_array, map);
-   struct bpf_prog *prog, *old_prog;
+   void *new_ptr, *old_ptr;
u32 index = *(u32 *)key, ufd;
 
if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map 
*map, void *key,
return -E2BIG;
 
ufd = *(u32 *)value;
-   prog = bpf_prog_get(ufd

[PATCH v5 0/4] bpf: Introduce the new ability of eBPF programs to access hardware PMU counter

2015-07-31 Thread Kaixu Xia
Previous patch v4 url:
https://lkml.org/lkml/2015/7/28/432

changes in V5: 
 - move struct fd_array_map_ops* fd_ops to bpf_map;
 - move array perf event decrement refcnt function to
   map_free;
 - fix the NULL ptr of perf_event_get();
 - move bpf_perf_event_read() to kernel/bpf/bpf_trace.c;
 - get rid of the remaining struct bpf_prog;
 - remove the unnecessary cast on void *;

changes in V4: 
 - make the bpf_prog_array_map more generic;
 - fix the bug of event refcnt leak;
 - use more useful errno in bpf_perf_event_read();

changes in V3: 
 - collapse V2 patches 1-3 into one;
 - drop the function map-ops-map_traverse_elem() and release
   the struct perf_event in map_free;
 - only allow to access bpf_perf_event_read() from programs;
 - update the perf_event_array_map elem via xchg();
 - pass index directly to bpf_perf_event_read() instead of
   MAP_KEY;

changes in V2: 
 - put atomic_long_inc_not_zero() between fdget() and fdput();
 - limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE;
 - Only read the event counter on current CPU or on current
   process;
 - add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the
   pointer to the struct perf_event;
 - according to the perf_event_map_fd and key, the function
   bpf_perf_event_read() can get the Hardware PMU counter value;

Patch 4/4 is a simple example and shows how to use this new eBPF
programs ability. The PMU counter data can be found in
/sys/kernel/debug/tracing/trace(trace_pipe).(the cycles PMU
value when 'kprobe/sys_write' sampling)

  $ cat /sys/kernel/debug/tracing/trace_pipe
  $ ./tracex6
   ...
   syslog-ng-548   [000] d..176.905673: : CPU-0   681765271
   syslog-ng-548   [000] d..176.905690: : CPU-0   681787855
   syslog-ng-548   [000] d..176.905707: : CPU-0   681810504
   syslog-ng-548   [000] d..176.905725: : CPU-0   681834771
   syslog-ng-548   [000] d..176.905745: : CPU-0   681859519
   syslog-ng-548   [000] d..176.905766: : CPU-0   681890419
   syslog-ng-548   [000] d..176.905783: : CPU-0   681914045
   syslog-ng-548   [000] d..176.905800: : CPU-0   681935950
   syslog-ng-548   [000] d..176.905816: : CPU-0   681958299
  ls-690   [005] d..182.241308: : CPU-5   3138451
  sh-691   [004] d..182.244570: : CPU-4   7324988
   ...-699   [007] d..199.961387: : CPU-7   3194027
   ...-695   [003] d..199.961474: : CPU-3   288901
   ...-695   [003] d..199.961541: : CPU-3   383145
   ...-695   [003] d..199.961591: : CPU-3   450365
   ...-695   [003] d..199.961639: : CPU-3   515751
   ...-695   [003] d..199.961686: : CPU-3   579047
   ...

The details of the patches are as follows:

Patch 1/4 rewrites part of the bpf_prog_array map code and makes it
more generic;

Patch 2/4 introduces a new bpf map type. This map only stores the
pointer to struct perf_event;

Patch 3/4 implements function bpf_perf_event_read() that gets the
selected hardware PMU counter;

Patch 4/4 gives a simple example.

Kaixu Xia (3):
  bpf: Add new bpf map type to store the pointer to struct perf_event
  bpf: Implement function bpf_perf_event_read() that gets the selected
hardware PMU counter
  samples/bpf: example of get selected PMU counter value

Wang Nan (1):
  bpf: Make the bpf_prog_array_map more generic

 arch/x86/net/bpf_jit_comp.c |   6 +-
 include/linux/bpf.h |  10 +++-
 include/linux/perf_event.h  |  14 -
 include/uapi/linux/bpf.h|   2 +
 kernel/bpf/arraymap.c   | 135 ++--
 kernel/bpf/core.c   |   2 +-
 kernel/bpf/syscall.c|   2 +-
 kernel/bpf/verifier.c   |  56 +-
 kernel/events/core.c|  27 ++---
 kernel/trace/bpf_trace.c|  37 
 samples/bpf/Makefile|   4 ++
 samples/bpf/bpf_helpers.h   |   2 +
 samples/bpf/tracex6_kern.c  |  26 +
 samples/bpf/tracex6_user.c  |  68 ++
 14 files changed, 328 insertions(+), 63 deletions(-)
 create mode 100644 samples/bpf/tracex6_kern.c
 create mode 100644 samples/bpf/tracex6_user.c

-- 
1.8.3.4



[PATCH v5 2/4] bpf: Add new bpf map type to store the pointer to struct perf_event

2015-07-31 Thread Kaixu Xia
Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map only stores the pointer to struct perf_event. The
user space event FDs from perf_event_open() syscall are converted
to the pointer to struct perf_event and stored in map.

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 include/linux/bpf.h|  1 +
 include/linux/perf_event.h |  2 ++
 include/uapi/linux/bpf.h   |  1 +
 kernel/bpf/arraymap.c  | 55 ++
 kernel/events/core.c   | 17 ++
 5 files changed, 76 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a8ce262..d0b394a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
 #include uapi/linux/bpf.h
 #include linux/workqueue.h
 #include linux/file.h
+#include linux/perf_event.h
 
 struct bpf_map;
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809..27e05c1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,7 @@ extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
@@ -979,6 +980,7 @@ static inline int perf_event_init_task(struct task_struct 
*child)   { return 0; }
 static inline void perf_event_exit_task(struct task_struct *child) { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
+static struct perf_event *perf_event_get(unsigned int fd)  { 
return ERR_PTR(-EINVAL); }
 static inline void perf_event_print_debug(void)
{ }
 static inline int perf_event_task_disable(void)
{ return -EINVAL; }
 static inline int perf_event_task_enable(void) { 
return -EINVAL; }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f9..69a1f6b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
+   BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 45df657..b7e0b5d 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -273,3 +273,58 @@ static int __init register_prog_array_map(void)
return 0;
 }
 late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+   bpf_fd_array_map_clear(map);
+   fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+   struct perf_event *event;
+
+   event = perf_event_get(fd);
+   if (IS_ERR(event))
+   return event;
+
+   /*
+* prevent some crazy events so we can make our life easier
+*/
+   if (event->attr.type != PERF_TYPE_RAW &&
+   event->attr.type != PERF_TYPE_HARDWARE) {
+   perf_event_release_kernel(event);
+   return ERR_PTR(-EINVAL);
+   }
+   return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+   struct perf_event *event = ptr;
+
+   perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+   .map_alloc = fd_array_map_alloc,
+   .map_free = perf_event_array_map_free,
+   .map_get_next_key = array_map_get_next_key,
+   .map_lookup_elem = fd_array_map_lookup_elem,
+   .map_update_elem = fd_array_map_update_elem,
+   .map_delete_elem = fd_array_map_delete_elem,
+   .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+   .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+   .ops = perf_event_array_ops,
+   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+   bpf_register_map_type(perf_event_array_type);
+   return 0;
+}
+late_initcall(register_perf_event_array_map);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae34..58f0d47 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8574,6 +8574,23 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task-perf_event_ctxp[ctxn]);
 }
 
+struct perf_event *perf_event_get(unsigned int fd)
+{
+   int err;
+   struct fd f;
+   struct perf_event *event;
+
+   err = perf_fget_light(fd, &f);
+   if (err)
+   return ERR_PTR(err);
+
+   event = f.file->private_data

[PATCH v5 4/4] samples/bpf: example of get selected PMU counter value

2015-07-31 Thread Kaixu Xia
This is a simple example and shows how to use the new ability
to get the selected Hardware PMU counter value.

Signed-off-by: Kaixu Xia xiaka...@huawei.com
---
 samples/bpf/Makefile   |  4 +++
 samples/bpf/bpf_helpers.h  |  2 ++
 samples/bpf/tracex6_kern.c | 26 ++
 samples/bpf/tracex6_user.c | 68 ++
 4 files changed, 100 insertions(+)
 create mode 100644 samples/bpf/tracex6_kern.c
 create mode 100644 samples/bpf/tracex6_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4450fed..63e7d50 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex2
 hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
+hostprogs-y += tracex6
 hostprogs-y += lathist
 
 test_verifier-objs := test_verifier.o libbpf.o
@@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
+tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
 lathist-objs := bpf_load.o libbpf.o lathist_user.o
 
 # Tell kbuild to always build the programs
@@ -37,6 +39,7 @@ always += tracex2_kern.o
 always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
+always += tracex6_kern.o
 always += tcbpf1_kern.o
 always += lathist_kern.o
 
@@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
+HOSTLOADLIBES_tracex6 += -lelf
 HOSTLOADLIBES_lathist += -lelf
 
 # point this to your LLVM backend with bpf support
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index bdf1c16..c8a3594 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
(void *) BPF_FUNC_get_current_uid_gid;
 static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
+static int (*bpf_perf_event_read)(void *map, int index) =
+   (void *) BPF_FUNC_perf_event_read;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
new file mode 100644
index 000..23d1cff
--- /dev/null
+++ b/samples/bpf/tracex6_kern.c
@@ -0,0 +1,26 @@
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+   .key_size = sizeof(int),
+   .value_size = sizeof(u32),
+   .max_entries = 32,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   u64 count;
+   u32 key = bpf_get_smp_processor_id();
+   char fmt[] = "CPU-%d   %llu\n";
+
+   count = bpf_perf_event_read(&my_map, key);
+   bpf_trace_printk(fmt, sizeof(fmt), key, count);
+
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 000..928f05e
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define SAMPLE_PERIOD  0x7fffULL
+
+static void test_bpf_perf_event(void)
+{
+   int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+   int *pmu_fd = malloc(nr_cpus * sizeof(int));
+   unsigned long value;
+   int i;
+
+   struct perf_event_attr attr_insn_pmu = {
+   .freq = 0,
+   .sample_period = SAMPLE_PERIOD,
+   .inherit = 0,
+   .type = PERF_TYPE_HARDWARE,
+   .read_format = 0,
+   .sample_type = 0,
+   .config = 0,/* PMU: cycles */
+   };
+
+   for (i = 0; i < nr_cpus; i++) {
+   pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/,
+   i/*cpu*/, -1/*group_fd*/, 0);
+   if (pmu_fd[i] < 0)
+   printf("event syscall failed\n");
+
+   bpf_update_elem(map_fd[0], &i, &pmu_fd[i], BPF_ANY);
+   ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+   }
+
+   system("ls");
+   system("pwd");
+   system("sleep 2");
+
+   for (i = 0; i < nr_cpus; i++)
+   close(pmu_fd[i]);
+
+   close(map_fd[0]);
+
+   free(pmu_fd);
+}
+
+int main(int argc, char **argv)
+{
+   char filename[256];
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   test_bpf_perf_event();
+
+   return 0;
+}
-- 
1.8.3.4

[PATCH v5 3/4] bpf: Implement function bpf_perf_event_read() that gets the selected hardware PMU counter

2015-07-31 Thread Kaixu Xia
Given a perf_event map fd and an index into the map, the function
bpf_perf_event_read() converts the corresponding map value into a
pointer to struct perf_event and returns the hardware PMU counter
value.
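
The kernel/trace/bpf_trace.c part of the patch is not quoted in this excerpt;
roughly, the helper is expected to have the following shape (a simplified
sketch based on the description above, not the exact patch text):

static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
{
	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct perf_event *event;

	if (index >= array->map.max_entries)
		return -E2BIG;

	event = (struct perf_event *)array->ptrs[index];
	if (!event)
		return -ENOENT;

	/* only a counter that is active on the local CPU can be read safely */
	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return -EINVAL;
	if (event->oncpu != raw_smp_processor_id() &&
	    event->cpu != raw_smp_processor_id())
		return -EINVAL;

	__perf_event_read(event);
	return perf_event_count(event);
}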

Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 include/linux/bpf.h|  1 +
 include/linux/perf_event.h | 12 +-
 include/uapi/linux/bpf.h   |  1 +
 kernel/bpf/verifier.c  | 56 +-
 kernel/events/core.c   | 10 +
 kernel/trace/bpf_trace.c   | 37 ++
 6 files changed, 92 insertions(+), 25 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0b394a..db9f781 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
+extern const struct bpf_func_proto bpf_perf_event_read_proto;
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 27e05c1..c1a3f39 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -662,7 +662,7 @@ extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
 extern u64 perf_event_read_value(struct perf_event *event,
 u64 *enabled, u64 *running);
-
+extern void __perf_event_read(void *info);
 
 struct perf_sample_data {
/*
@@ -863,6 +863,14 @@ static inline u64 __perf_event_count(struct perf_event *event)
	return local64_read(&event->count) + atomic64_read(&event->child_count);
 }
 
+static inline u64 perf_event_count(struct perf_event *event)
+{
+   if (event->pmu->count)
+   return event->pmu->count(event);
+
+   return __perf_event_count(event);
+}
+
 extern void perf_event_mmap(struct vm_area_struct *vma);
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -984,6 +992,8 @@ static struct perf_event *perf_event_get(unsigned int fd)  { return ERR_PTR(-EINVAL); }
 static inline void perf_event_print_debug(void)   { }
 static inline int perf_event_task_disable(void)   { return -EINVAL; }
 static inline int perf_event_task_enable(void)    { return -EINVAL; }
+static inline void __perf_event_read(void *info)  { }
+static inline u64 perf_event_count(struct perf_event *event)  { return 0; }
 static inline int perf_event_refresh(struct perf_event *event, int refresh)
 {
return -EINVAL;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69a1f6b..b9b13ce 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -250,6 +250,7 @@ enum bpf_func_id {
 * Return: 0 on success
 */
BPF_FUNC_get_current_comm,
+   BPF_FUNC_perf_event_read,   /* u64 bpf_perf_event_read(&map, index) */
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866..93b6624 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -833,6 +833,44 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return err;
 }
 
+static int check_func_limit(struct bpf_map **mapp, int func_id)
+{
+   struct bpf_map *map = *mapp;
+
+   if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+   func_id != BPF_FUNC_tail_call)
+   /* prog_array map type needs extra care:
+* only allow to pass it into bpf_tail_call() for now.
+* bpf_map_delete_elem() can be allowed in the future,
+* while bpf_map_update_elem() must only be done via syscall
+*/
+   return -EINVAL;
+
+   if (func_id == BPF_FUNC_tail_call &&
+   map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+   /* don't allow any other map type to be passed into
+* bpf_tail_call()
+*/
+   return -EINVAL;
+
+   if (map && map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+   func_id != BPF_FUNC_perf_event_read)
+   /* perf_event_array map type needs extra care:
+* only allow to pass it into bpf_perf_event_read() for now.
+* bpf_map_update/delete_elem() must only be done via syscall
+*/
+   return -EINVAL;
+
+   if (func_id == BPF_FUNC_perf_event_read &&
+   map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+   /* don't allow any other map type to be passed into
+* bpf_perf_event_read

[PATCH v5 1/4] bpf: Make the bpf_prog_array_map more generic

2015-07-31 Thread Kaixu Xia
From: Wang Nan <wangn...@huawei.com>

Following Daniel's review comments, rewrite part of the
bpf_prog_array map code to make it more generic, so that the
new perf_event_array map type can reuse most of the
bpf_prog_array map code and only needs a few lines of
special-purpose code.
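
Concretely, the generic fd_array code only deals with opaque pointers, and
each map type supplies a pair of fd-translation callbacks. For the prog_array
case this is expected to look roughly as follows (an illustrative sketch; the
real bodies are in the part of the arraymap.c diff that is cut off below):

static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	/* only programs compatible with this array may be stored */
	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}
	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put_rcu(ptr);
}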

Tested the samples/bpf/tracex5 after this patch:
$ sudo ./tracex5
...
dd-1051  [000] d...26.682903: : mmap
dd-1051  [000] d...26.698348: : syscall=102 (one of get/set uid/pid/gid)
dd-1051  [000] d...26.703892: : read(fd=0, buf=0078c010, size=512)
dd-1051  [000] d...26.705847: : write(fd=1, buf=0078c010, size=512)
dd-1051  [000] d...26.707914: : read(fd=0, buf=0078c010, size=512)
dd-1051  [000] d...26.710988: : write(fd=1, buf=0078c010, size=512)
dd-1051  [000] d...26.711865: : read(fd=0, buf=0078c010, size=512)
dd-1051  [000] d...26.712704: : write(fd=1, buf=0078c010, size=512)
...

Signed-off-by: Wang Nan <wangn...@huawei.com>
Signed-off-by: Kaixu Xia <xiaka...@huawei.com>
---
 arch/x86/net/bpf_jit_comp.c |  6 ++--
 include/linux/bpf.h |  8 +++--
 kernel/bpf/arraymap.c   | 80 +++--
 kernel/bpf/core.c   |  2 +-
 kernel/bpf/syscall.c|  2 +-
 5 files changed, 60 insertions(+), 38 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 579a8fd..e377f07 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
  * goto out;
 *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
  * goto out;
- *   prog = array->prog[index];
+ *   prog = array->ptrs[index];
  *   if (prog == NULL)
  * goto out;
 *   goto *(prog->bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
EMIT3(0x83, 0xC0, 0x01);  /* add eax, 1 */
EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
 
-   /* prog = array->prog[index]; */
+   /* prog = array->ptrs[index]; */
EMIT4(0x48, 0x8D, 0x44, 0xD6);/* lea rax, [rsi + rdx * 8 + 0x50] */
-   EMIT1(offsetof(struct bpf_array, prog));
+   EMIT1(offsetof(struct bpf_array, ptrs));
EMIT3(0x48, 0x8B, 0x00);  /* mov rax, qword ptr [rax] */
 
/* if (prog == NULL)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476..a8ce262 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -24,6 +24,10 @@ struct bpf_map_ops {
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
+
+   /* funcs called by prog_array and perf_event_array map */
+   void *(*map_fd_get_ptr) (struct bpf_map *map, int fd);
+   void (*map_fd_put_ptr) (void *ptr);
 };
 
 struct bpf_map {
@@ -142,13 +146,13 @@ struct bpf_array {
bool owner_jited;
union {
char value[0] __aligned(8);
-   struct bpf_prog *prog[0] __aligned(8);
+   void *ptrs[0] __aligned(8);
};
 };
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
-void bpf_prog_array_map_clear(struct bpf_map *map);
+void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..45df657 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
 }
 late_initcall(register_array_map);
 
-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
 {
-   /* only bpf_prog file descriptors can be stored in prog_array map */
+   /* only file descriptors can be stored in this type of map */
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
return array_map_alloc(attr);
 }
 
-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
 {
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
 
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
-   BUG_ON(array->prog[i] != NULL);
+   BUG_ON(array->ptrs[i] != NULL);
kvfree(array);
 }
 
-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
return NULL;
 }
 
 /* only called from