From: Peter Zijlstra
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, the same aux_offset and aux_size should be passed to the mmap() call
and, if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer are also charged against the user's
mlock rlimit plus the perf_event_mlock_kb allowance.
Signed-off-by: Alexander Shishkin
---
include/linux/perf_event.h | 17 +
include/uapi/linux/perf_event.h | 16 +
kernel/events/core.c| 140 +---
kernel/events/internal.h| 23 +++
kernel/events/ring_buffer.c | 97 ++--
5 files changed, 264 insertions(+), 29 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 893a0d0798..344058c71d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -263,6 +263,18 @@ struct pmu {
* flush branch stack on context-switches (needed in cpu-wide mode)
*/
void (*flush_branch_stack) (void);
+
+ /*
+* Set up pmu-private data structures for an AUX area
+*/
+ void *(*setup_aux) (int cpu, void **pages,
+int nr_pages, bool overwrite);
+ /* optional */
+
+ /*
+* Free pmu-private AUX data structures
+*/
+ void (*free_aux)(void *aux); /* optional */
};
/**
@@ -782,6 +794,11 @@ static inline bool has_branch_stack(struct perf_event *event)
return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
}
+static inline bool has_aux(struct perf_event *event)
+{
+ return event->pmu->setup_aux;
+}
+
extern int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size);
extern void perf_output_end(struct perf_output_handle *handle);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index f7d18c2cb7..7e0967c0f5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -497,6 +497,22 @@ struct perf_event_mmap_page {
__u64 data_tail; /* user-space written tail */
__u64 data_offset;/* where the buffer starts */
__u64 data_size; /* data buffer size */
+
+ /*
+* AUX area is defined by aux_{offset,size} fields that should be set
+* by the userspace, so that
+*
+* aux_offset >= data_offset + data_size
+*
+* prior to mmap()ing it. Size of the mmap()ed area should be aux_size.
+*
+* Ring buffer pointers aux_{head,tail} have the same semantics as
+* data_{head,tail} and same ordering rules apply.
+*/
+ __u64 aux_head;
+ __u64 aux_tail;
+ __u64 aux_offset;
+ __u64 aux_size;
};
#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 23bacb8682..86b0577229 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4116,6 +4116,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+ if (vma->vm_pgoff)
+ atomic_inc(&event->rb->aux_mmap_count);
}
/*
@@ -4135,6 +4137,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
+ /*
+* rb->aux_mmap_count will always drop before rb->mmap_count and
+* event->mmap_count, so it is ok to use event->mmap_mutex to
+* serialize with perf_mmap here.
+*/
+ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
+ atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+
+ rb_free_aux(rb);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
atomic_dec(&rb->mmap_count);
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4208,7 +4224,7 @@ out_put:
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
- .close = perf_mmap_close,
+ .close = perf_mmap_close, /* non-mergeable */