On Thu, Jun 06, 2013 at 01:58:06PM +0800, Yan, Zheng wrote:
> From: "Yan, Zheng" <zheng.z....@intel.com>
> 
> If perf event buffer is in overwrite mode, the kernel only updates
> the data head when it overwrites old samples. The program that owns
> the buffer needs to periodically check the buffer and update a
> variable that tracks the data tail. If the program fails to do this
> in time, the data tail can be overwritten by new samples. The program
> has to rewind the buffer because it does not know where the first
> valid sample is.
> 
> This patch makes the kernel update the data tail when it overwrites
> old events. So the program that owns the event buffer can always
> read the latest samples. This is convenient for programs that use
> perf to do branch tracing. One use case is GDB branch tracing:
> (http://sourceware.org/ml/gdb-patches/2012-06/msg00172.html)
> It uses the perf interface to read BTS, but only cares about the
> branches before the ptrace event.
> 
> I added code to perf_output_{begin/end} to count how many cycles
> are spent by sample output, then ran "perf record" to profile kernel
> compilation 10 times on IvyBridge-EP. (perf record -a make -j 60)
> The first number is scaled to 1000, the rest numbers are scaled by
> the same factor.
> 
>         before   overwrite mode      after   overwrite mode
> AVG      1000        999             1046        1044
> STDEV    19.4        19.5            17.1        17.9

OK, so I was sure I replied to this email; but apparently I didn't :/

So it's still adding about 5% overhead to the regular case; this is sad.

What does something like the below do?

---
 include/linux/perf_event.h  |  2 +
 kernel/events/core.c        | 60 ++++++++++++++++++++++++------
 kernel/events/internal.h    |  2 +
 kernel/events/ring_buffer.c | 90 +++++++++++++++++++++++++++------------------
 4 files changed, 106 insertions(+), 48 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8873f82..bcce98a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -753,6 +753,8 @@ static inline bool has_branch_stack(struct perf_event 
*event)
 
 extern int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_event *event, unsigned int size);
+extern int perf_output_begin_overwrite(struct perf_output_handle *handle,
+                            struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
 extern unsigned int perf_output_copy(struct perf_output_handle *handle,
                             const void *buf, unsigned int len);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1833bc5..4d674e9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3878,6 +3878,8 @@ static const struct vm_operations_struct perf_mmap_vmops 
= {
        .page_mkwrite   = perf_mmap_fault,
 };
 
+static void perf_event_set_overflow(struct perf_event *event, struct 
ring_buffer *rb);
+
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct perf_event *event = file->private_data;
@@ -3985,6 +3987,7 @@ static int perf_mmap(struct file *file, struct 
vm_area_struct *vma)
        vma->vm_mm->pinned_vm += extra;
 
        ring_buffer_attach(event, rb);
+       perf_event_set_overflow(event, rb);
        rcu_assign_pointer(event->rb, rb);
 
        perf_event_update_userpage(event);
@@ -4595,9 +4598,12 @@ void perf_prepare_sample(struct perf_event_header 
*header,
        }
 }
 
-static void perf_event_output(struct perf_event *event,
-                               struct perf_sample_data *data,
-                               struct pt_regs *regs)
+static __always_inline void 
+__perf_event_output(struct perf_event *event,
+                   struct perf_sample_data *data,
+                   struct pt_regs *regs,
+                   int (*output_begin)(struct perf_output_handle *, 
+                                       struct perf_event *, unsigned int))
 {
        struct perf_output_handle handle;
        struct perf_event_header header;
@@ -4607,7 +4613,7 @@ static void perf_event_output(struct perf_event *event,
 
        perf_prepare_sample(&header, data, event, regs);
 
-       if (perf_output_begin(&handle, event, header.size))
+       if (output_begin(&handle, event, header.size))
                goto exit;
 
        perf_output_sample(&handle, &header, data, event);
@@ -4618,6 +4624,33 @@ static void perf_event_output(struct perf_event *event,
        rcu_read_unlock();
 }
 
+static void perf_event_output(struct perf_event *event,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs)
+{
+       __perf_event_output(event, data, regs, perf_output_begin);
+}
+
+static void perf_event_output_overwrite(struct perf_event *event,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs)
+{
+       __perf_event_output(event, data, regs, perf_output_begin_overwrite);
+}
+
+static void 
+perf_event_set_overflow(struct perf_event *event, struct ring_buffer *rb)
+{
+       if (event->overflow_handler != perf_event_output &&
+           event->overflow_handler != perf_event_output_overwrite)
+               return;
+
+       if (rb->overwrite)
+               event->overflow_handler = perf_event_output_overwrite;
+       else
+               event->overflow_handler = perf_event_output;
+}
+
 /*
  * read event_id
  */
@@ -5183,10 +5216,7 @@ static int __perf_event_overflow(struct perf_event 
*event,
                irq_work_queue(&event->pending);
        }
 
-       if (event->overflow_handler)
-               event->overflow_handler(event, data, regs);
-       else
-               perf_event_output(event, data, regs);
+       event->overflow_handler(event, data, regs);
 
        if (event->fasync && event->pending_kill) {
                event->pending_wakeup = 1;
@@ -6501,8 +6531,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                context = parent_event->overflow_handler_context;
        }
 
-       event->overflow_handler = overflow_handler;
-       event->overflow_handler_context = context;
+       if (overflow_handler) {
+               event->overflow_handler = overflow_handler;
+               event->overflow_handler_context = context;
+       } else {
+               event->overflow_handler = perf_event_output;
+               event->overflow_handler_context = NULL;
+       }
 
        perf_event__state_init(event);
 
@@ -6736,9 +6771,10 @@ perf_event_set_output(struct perf_event *event, struct 
perf_event *output_event)
        if (old_rb)
                ring_buffer_detach(event, old_rb);
 
-       if (rb)
+       if (rb) {
                ring_buffer_attach(event, rb);
-
+               perf_event_set_overflow(event, rb);
+       }
        rcu_assign_pointer(event->rb, rb);
 
        if (old_rb) {
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca65997..c4e4610 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -20,6 +20,8 @@ struct ring_buffer {
 
        atomic_t                        poll;           /* POLL_ for wakeups */
 
+       local_t                         tail;           /* read position     */
+       local_t                         next_tail;      /* next read position */
        local_t                         head;           /* write position    */
        local_t                         nest;           /* nested writers    */
        local_t                         events;         /* event limit       */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144..5887044 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -15,28 +15,9 @@
 
 #include "internal.h"
 
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
-                             unsigned long offset, unsigned long head)
+static bool perf_output_space(unsigned long tail, unsigned long offset,
+                             unsigned long head, unsigned long mask)
 {
-       unsigned long sz = perf_data_size(rb);
-       unsigned long mask = sz - 1;
-
-       /*
-        * check if user-writable
-        * overwrite : over-write its own tail
-        * !overwrite: buffer possibly drops events.
-        */
-       if (rb->overwrite)
-               return true;
-
-       /*
-        * verify that payload is not bigger than buffer
-        * otherwise masking logic may fail to detect
-        * the "not enough space" condition
-        */
-       if ((head - offset) > sz)
-               return false;
-
        offset = (offset - tail) & mask;
        head   = (head   - tail) & mask;
 
@@ -109,11 +90,11 @@ static void perf_output_put_handle(struct 
perf_output_handle *handle)
        preempt_enable();
 }
 
-int perf_output_begin(struct perf_output_handle *handle,
-                     struct perf_event *event, unsigned int size)
+static __always_inline int __perf_output_begin(struct perf_output_handle 
*handle,
+                     struct perf_event *event, unsigned int size, bool 
overwrite)
 {
        struct ring_buffer *rb;
-       unsigned long tail, offset, head;
+       unsigned long tail, offset, head, max_size;
        int have_lost;
        struct perf_sample_data sample_data;
        struct {
@@ -136,7 +117,8 @@ int perf_output_begin(struct perf_output_handle *handle,
        handle->rb      = rb;
        handle->event   = event;
 
-       if (!rb->nr_pages)
+       max_size = perf_data_size(rb);
+       if (size > max_size)
                goto out;
 
        have_lost = local_read(&rb->lost);
@@ -149,19 +131,43 @@ int perf_output_begin(struct perf_output_handle *handle,
 
        perf_output_get_handle(handle);
 
-       do {
+       if (overwrite) {
+               do {
+                       tail = local_read(&rb->tail);
+                       offset = local_read(&rb->head);
+                       head = offset + size;
+                       if (unlikely(!perf_output_space(tail, offset, head,
+                                                       max_size - 1))) {
+                               tail = local_read(&rb->next_tail);
+                               local_set(&rb->tail, tail);
+                               rb->user_page->data_tail = tail;
+                       }
+               } while (local_cmpxchg(&rb->head, offset, head) != offset);
+
                /*
-                * Userspace could choose to issue a mb() before updating the
-                * tail pointer. So that all reads will be completed before the
-                * write is issued.
+                * Save the start of next event when half of the buffer
+                * has been filled. Later when the event buffer overflows,
+                * update the tail pointer to point to it.
                 */
-               tail = ACCESS_ONCE(rb->user_page->data_tail);
-               smp_rmb();
-               offset = head = local_read(&rb->head);
-               head += size;
-               if (unlikely(!perf_output_space(rb, tail, offset, head)))
-                       goto fail;
-       } while (local_cmpxchg(&rb->head, offset, head) != offset);
+               if (tail == local_read(&rb->next_tail) &&
+                   head - tail >= (max_size >> 1))
+                       local_cmpxchg(&rb->next_tail, tail, head);
+       } else {
+               do {
+                       /*
+                        * Userspace could choose to issue a mb() before
+                        * updating the tail pointer. So that all reads will
+                        * be completed before the write is issued.
+                        */
+                       tail = ACCESS_ONCE(rb->user_page->data_tail);
+                       smp_rmb();
+                       offset = local_read(&rb->head);
+                       head = offset + size;
+                       if (unlikely(!perf_output_space(tail, offset, head,
+                                                       max_size - 1)))
+                               goto fail;
+               } while (local_cmpxchg(&rb->head, offset, head) != offset);
+       }
 
        if (head - local_read(&rb->wakeup) > rb->watermark)
                local_add(rb->watermark, &rb->wakeup);
@@ -194,6 +200,18 @@ int perf_output_begin(struct perf_output_handle *handle,
        return -ENOSPC;
 }
 
+int perf_output_begin(struct perf_output_handle *handle,
+                     struct perf_event *event, unsigned int size)
+{
+       return __perf_output_begin(handle, event, size, false);
+}
+
+int perf_output_begin_overwrite(struct perf_output_handle *handle,
+                     struct perf_event *event, unsigned int size)
+{
+       return __perf_output_begin(handle, event, size, true);
+}
+
 unsigned int perf_output_copy(struct perf_output_handle *handle,
                      const void *buf, unsigned int len)
 {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to