Several use cases for AUX data require different parts of the kernel code
to be able to configure hardware to produce AUX data and to collect it when
it is needed. These are: event sampling (including a piece of AUX data in a
perf event sample, so that the user can get, for example, instruction
traces leading up to a certain event such as a breakpoint or a hardware
event), process core dumps (providing the user with a history of a process'
instruction flow leading up to a crash), system crash dumps, and storing
AUX data in pstore across reboots (to facilitate post-mortem investigation
of a system crash).

Luckily, there is already an API for in-kernel perf events, which has
several users. This proposal extends that API to allow in-kernel users to
allocate AUX buffers for kernel counters. Such users will call
rb_alloc_kernel() to allocate what they need and later copy the data out to
other backends, e.g. a sample in another event's ring buffer or a core dump
file. These buffers are never mapped to userspace.

There are no additional constraints or requirements on the pmu drivers.

A typical user of this interface will first create a kernel counter with a
call to perf_event_create_kernel_counter() and then allocate a ring buffer
for it with rb_alloc_kernel(). Data can then be copied out of the AUX
buffer using rb_output_aux(), which takes a callback that writes chunks of
AUX data to the desired destination, such as perf_output_copy() or
dump_emit(). The caller must use perf_event_disable() to make sure that the
counter is not active while it copies the data out.

Signed-off-by: Alexander Shishkin <alexander.shish...@linux.intel.com>
---
 kernel/events/internal.h    | 19 +++++++++++
 kernel/events/ring_buffer.c | 83 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index a7ce82b670..4ae300ee02 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -56,6 +56,9 @@ struct ring_buffer {
        void                            *data_pages[0];
 };
 
+typedef unsigned long (*aux_copyfn)(void *data, const void *src,
+                                   unsigned long len);
+
 extern void rb_free(struct ring_buffer *rb);
 extern void ring_buffer_unaccount(struct ring_buffer *rb, bool aux);
 
@@ -82,6 +85,11 @@ extern void perf_event_wakeup(struct perf_event *event);
 extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
                        pgoff_t pgoff, int nr_pages, long watermark, int flags);
 extern void rb_free_aux(struct ring_buffer *rb);
+extern long rb_output_aux(struct ring_buffer *rb, unsigned long from,
+                         unsigned long to, aux_copyfn copyfn, void *data);
+extern int rb_alloc_kernel(struct perf_event *event, int nr_pages,
+                          int aux_nr_pages);
+extern void rb_free_kernel(struct ring_buffer *rb, struct perf_event *event);
 extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
 extern void ring_buffer_put(struct ring_buffer *rb);
 
@@ -126,6 +134,17 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
        return rb->aux_nr_pages << PAGE_SHIFT;
 }
 
+static inline bool kernel_rb_event(struct perf_event *event)
+{
+       /*
+        * True when @event has a ring buffer (event->rb is set) while its
+        * rb_entry list node is empty, i.e. the event is not on any ring
+        * buffer's wakeup list. That combination means the buffer was
+        * attached by rb_alloc_kernel() and not ring_buffer_attach(); it
+        * is the only case in which both conditions hold at the same time.
+        */
+       return event->rb && list_empty(&event->rb_entry);
+}
+
 #define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...)       \
 {                                                                      \
        unsigned long size, written;                                    \
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 484ce09d96..9244a4fa9b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -578,6 +578,40 @@ void ring_buffer_unaccount(struct ring_buffer *rb, bool aux)
        free_uid(rb->mmap_user);
 }
 
+/*
+ * Copy out AUX data from a ring_buffer using a supplied callback.
+ *
+ * @from and @to are byte offsets into the AUX area; both are masked with
+ * (aux_nr_pages << PAGE_SHIFT) - 1 before use. The data is handed to
+ * @copyfn, along with the opaque @data cookie, in chunks that never cross
+ * a page boundary. When @to is not above @from after masking, copying runs
+ * to the end of the AUX area and wraps around to its start until @from
+ * reaches @to, so @from == @to copies the whole area.
+ *
+ * Returns the number of bytes copied, or -EFAULT as soon as @copyfn
+ * returns a non-zero remainder.
+ *
+ * NOTE(review): the masking presumably relies on the AUX area size being
+ * a power of two -- confirm against rb_alloc_aux() callers.
+ */
+long rb_output_aux(struct ring_buffer *rb, unsigned long from,
+                  unsigned long to, aux_copyfn copyfn, void *data)
+{
+       unsigned long tocopy, remainder, len = 0;
+       void *addr;
+
+       from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+       to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+
+       do {
+               tocopy = PAGE_SIZE - offset_in_page(from);
+               if (to > from)
+                       tocopy = min(tocopy, to - from);
+               if (!tocopy)
+                       break;
+
+               addr = rb->aux_pages[from >> PAGE_SHIFT];
+               addr += offset_in_page(from);
+
+               remainder = copyfn(data, addr, tocopy);
+               if (remainder)
+                       return -EFAULT;
+
+               len += tocopy;
+               from += tocopy;
+               from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+       } while (to != from);
+
+       return len;
+}
+
 #define PERF_AUX_GFP   (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
 static struct page *rb_alloc_aux_page(int node, int order)
@@ -749,6 +783,55 @@ void rb_free_aux(struct ring_buffer *rb)
        }
 }
 
+/*
+ * Allocate a ring_buffer for a kernel event and attach it to this event.
+ * This ring_buffer will not participate in mmap operations and set_output,
+ * so ring_buffer_attach() and related complications do not apply.
+ *
+ * @nr_pages is passed to rb_alloc() and @aux_nr_pages to rb_alloc_aux();
+ * the AUX page offset handed to rb_alloc_aux() is @nr_pages + 1.
+ *
+ * Returns 0 on success. On failure returns the error from rb_alloc(), or
+ * the error from rb_alloc_aux() after freeing the ring_buffer again.
+ *
+ * The matching teardown is rb_free_kernel(), which warns once if the
+ * buffer's refcount is not 1, resets aux_mmap_count to 0, clears
+ * event->rb and then frees the AUX area followed by the ring_buffer.
+ */
+int rb_alloc_kernel(struct perf_event *event, int nr_pages, int aux_nr_pages)
+{
+       struct ring_buffer *rb;
+       int ret, pgoff = nr_pages + 1;
+
+       /*
+        * Use overwrite mode (!RING_BUFFER_WRITABLE) for both data and aux
+        * areas as we don't want wakeups or interrupts.
+        */
+       rb = rb_alloc(NULL, nr_pages, 0, event->cpu, 0);
+       if (IS_ERR(rb))
+               return PTR_ERR(rb);
+
+       ret = rb_alloc_aux(rb, event, pgoff, aux_nr_pages, 0, 0);
+       if (ret) {
+               rb_free(rb);
+               return ret;
+       }
+
+       /*
+        * These buffers never get mmapped; so the only use of the
+        * aux_mmap_count is to enable AUX transactions
+        * (see perf_aux_output_begin()).
+        */
+       atomic_set(&rb->aux_mmap_count, 1);
+
+       /*
+        * Kernel counters don't need ring buffer wakeups, therefore we don't
+        * use ring_buffer_attach() here and event->rb_entry stays empty.
+        */
+       rcu_assign_pointer(event->rb, rb);
+
+       return 0;
+}
+
+void rb_free_kernel(struct ring_buffer *rb, struct perf_event *event)
+{
+       WARN_ON_ONCE(atomic_read(&rb->refcount) != 1);
+       atomic_set(&rb->aux_mmap_count, 0);
+       rcu_assign_pointer(event->rb, NULL);
+       rb_free_aux(rb);
+       rb_free(rb);
+}
+
 #ifndef CONFIG_PERF_USE_VMALLOC
 
 /*
-- 
2.9.3

Reply via email to