Move ring buffer memory accounting down into the rb_alloc() path, so that rb_alloc()'s callers no longer have to do the accounting themselves. This also has the side effect of slightly cleaning up perf_mmap().
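After this change, accounting failures are reported out of rb_alloc() itself via ERR_PTR() rather than a bare NULL, so a caller reduces to roughly the following pattern (an illustrative caller-side sketch, not code added by this patch; the variable names are taken from perf_mmap()):

	rb = rb_alloc(vma->vm_mm, nr_pages,
		      event->attr.watermark ? event->attr.wakeup_watermark : 0,
		      event->cpu, flags);
	if (IS_ERR_OR_NULL(rb)) {
		/* -EPERM if mlock limits were exceeded, -ENOMEM otherwise */
		ret = PTR_ERR(rb);
		rb = NULL;
		goto unlock;
	}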
Signed-off-by: Alexander Shishkin <alexander.shish...@linux.intel.com>
---
 kernel/events/core.c        |  67 +++-----------------
 kernel/events/internal.h    |   5 +-
 kernel/events/ring_buffer.c | 145 ++++++++++++++++++++++++++++++++++++++------
 3 files changed, 136 insertions(+), 81 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9389e27cb0..24099ed9e5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5122,6 +5122,8 @@ void ring_buffer_put(struct ring_buffer *rb)
 	if (!atomic_dec_and_test(&rb->refcount))
 		return;
 
+	ring_buffer_unaccount(rb, false);
+
 	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
@@ -5156,9 +5158,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 	struct ring_buffer *rb = ring_buffer_get(event);
-	struct user_struct *mmap_user = rb->mmap_user;
-	int mmap_locked = rb->mmap_locked;
-	unsigned long size = perf_data_size(rb);
 
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event, vma->vm_mm);
@@ -5178,11 +5177,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 		 */
 		perf_pmu_output_stop(event);
 
-		/* now it's safe to free the pages */
-		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
-		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
-
-		/* this has to be the last one */
+		/* now it's safe to free the pages; ought to be the last one */
 		rb_free_aux(rb);
 		WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
 
@@ -5243,19 +5238,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	}
 	rcu_read_unlock();
 
-	/*
-	 * It could be there's still a few 0-ref events on the list; they'll
-	 * get cleaned up by free_event() -- they'll also still have their
-	 * ref on the rb and will free it whenever they are done with it.
-	 *
-	 * Aside from that, this buffer is 'fully' detached and unmapped,
-	 * undo the VM accounting.
-	 */
-
-	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-	vma->vm_mm->pinned_vm -= mmap_locked;
-	free_uid(mmap_user);
-
 out_put:
 	ring_buffer_put(rb); /* could be last */
 }
@@ -5270,13 +5252,9 @@ static const struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_event *event = file->private_data;
-	unsigned long user_locked, user_lock_limit;
-	struct user_struct *user = current_user();
-	unsigned long locked, lock_limit;
 	struct ring_buffer *rb = NULL;
 	unsigned long vma_size;
 	unsigned long nr_pages;
-	long user_extra = 0, extra = 0;
 	int ret = 0, flags = 0;
 
 	/*
@@ -5347,7 +5325,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		}
 
 		atomic_set(&rb->aux_mmap_count, 1);
-		user_extra = nr_pages;
 
 		goto accounting;
 	}
@@ -5384,49 +5361,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	user_extra = nr_pages + 1;
-
 accounting:
-	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
-
-	/*
-	 * Increase the limit linearly with more CPUs:
-	 */
-	user_lock_limit *= num_online_cpus();
-
-	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
-
-	if (user_locked > user_lock_limit)
-		extra = user_locked - user_lock_limit;
-
-	lock_limit = rlimit(RLIMIT_MEMLOCK);
-	lock_limit >>= PAGE_SHIFT;
-	locked = vma->vm_mm->pinned_vm + extra;
-
-	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
-	    !capable(CAP_IPC_LOCK)) {
-		ret = -EPERM;
-		goto unlock;
-	}
-
 	WARN_ON(!rb && event->rb);
 
 	if (vma->vm_flags & VM_WRITE)
 		flags |= RING_BUFFER_WRITABLE;
 
 	if (!rb) {
-		rb = rb_alloc(nr_pages,
+		rb = rb_alloc(vma->vm_mm, nr_pages,
 			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
 			      event->cpu, flags);
 
-		if (!rb) {
-			ret = -ENOMEM;
+		if (IS_ERR_OR_NULL(rb)) {
+			ret = PTR_ERR(rb);
+			rb = NULL;
 			goto unlock;
 		}
 
 		atomic_set(&rb->mmap_count, 1);
-		rb->mmap_user = get_current_user();
-		rb->mmap_locked = extra;
 
 		ring_buffer_attach(event, rb);
 
@@ -5435,15 +5387,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	} else {
 		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
 				   event->attr.aux_watermark, flags);
-		if (!ret)
-			rb->aux_mmap_locked = extra;
 	}
 
 unlock:
 	if (!ret) {
-		atomic_long_add(user_extra, &user->locked_vm);
-		vma->vm_mm->pinned_vm += extra;
-
 		atomic_inc(&event->mmap_count);
 	} else if (rb) {
 		atomic_dec(&rb->mmap_count);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 843e970473..3e603c45eb 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -36,6 +36,7 @@ struct ring_buffer {
 	atomic_t			mmap_count;
 	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
+	struct mm_struct		*mmap_mapping;
 
 	/* AUX area */
 	long				aux_head;
@@ -56,6 +57,7 @@ struct ring_buffer {
 };
 
 extern void rb_free(struct ring_buffer *rb);
+extern void ring_buffer_unaccount(struct ring_buffer *rb, bool aux);
 
 static inline void rb_free_rcu(struct rcu_head *rcu_head)
 {
@@ -74,7 +76,8 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
 }
 
 extern struct ring_buffer *
-rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+rb_alloc(struct mm_struct *mm, int nr_pages, long watermark, int cpu,
+	 int flags);
 extern void perf_event_wakeup(struct perf_event *event);
 extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 			pgoff_t pgoff, int nr_pages, long watermark, int flags);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12..d36f169cae 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -505,6 +505,88 @@ void *perf_get_aux(struct perf_output_handle *handle)
 	return handle->rb->aux_priv;
 }
 
+/*
+ * Check if the current user can afford @nr_pages, considering the
+ * perf_event_mlock sysctl and their mlock limit. If the former is exceeded,
+ * pin the remainder on their mm; if the latter is not sufficient either,
+ * error out. Otherwise, keep track of the pages used in the ring_buffer so
+ * that the accounting can be undone when the pages are freed.
+ */
+static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
+			       unsigned long nr_pages, bool aux)
+{
+	unsigned long total, limit, pinned;
+
+	if (!mm)
+		mm = rb->mmap_mapping;
+
+	rb->mmap_user = current_user();
+
+	limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+	/*
+	 * Increase the limit linearly with more CPUs:
+	 */
+	limit *= num_online_cpus();
+
+	total = atomic_long_read(&rb->mmap_user->locked_vm) + nr_pages;
+
+	pinned = 0;
+	if (total > limit) {
+		/*
+		 * Everything that's over the sysctl_perf_event_mlock
+		 * limit needs to be accounted to the consumer's mm.
+		 */
+		if (!mm)
+			return -EPERM;
+
+		pinned = total - limit;
+
+		limit = rlimit(RLIMIT_MEMLOCK);
+		limit >>= PAGE_SHIFT;
+		total = mm->pinned_vm + pinned;
+
+		if ((total > limit) && perf_paranoid_tracepoint_raw() &&
+		    !capable(CAP_IPC_LOCK)) {
+			return -EPERM;
+		}
+
+		if (aux)
+			rb->aux_mmap_locked = pinned;
+		else
+			rb->mmap_locked = pinned;
+
+		mm->pinned_vm += pinned;
+	}
+
+	if (!rb->mmap_mapping)
+		rb->mmap_mapping = mm;
+
+	/* account for user page */
+	if (!aux)
+		nr_pages++;
+
+	rb->mmap_user = get_current_user();
+	atomic_long_add(nr_pages, &rb->mmap_user->locked_vm);
+
+	return 0;
+}
+
+/*
+ * Undo the mlock pages accounting done in ring_buffer_account().
+ */
+void ring_buffer_unaccount(struct ring_buffer *rb, bool aux)
+{
+	unsigned long nr_pages = aux ? rb->aux_nr_pages : rb->nr_pages + 1;
+	unsigned long pinned = aux ? rb->aux_mmap_locked : rb->mmap_locked;
+
+	atomic_long_sub(nr_pages, &rb->mmap_user->locked_vm);
+	if (rb->mmap_mapping)
+		rb->mmap_mapping->pinned_vm -= pinned;
+
+	free_uid(rb->mmap_user);
+}
+
 #define PERF_AUX_GFP  (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
 static struct page *rb_alloc_aux_page(int node, int order)
@@ -574,11 +656,16 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 {
 	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
 	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
-	int ret = -ENOMEM, max_order = 0;
+	int ret, max_order = 0;
 
 	if (!has_aux(event))
 		return -EOPNOTSUPP;
 
+	ret = ring_buffer_account(rb, NULL, nr_pages, true);
+	if (ret)
+		return ret;
+
+	ret = -ENOMEM;
 	if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
 		/*
 		 * We need to start with the max_order that fits in nr_pages,
@@ -593,7 +680,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 	if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
 	    !overwrite) {
 		if (!max_order)
-			return -EINVAL;
+			goto out;
 
 		max_order--;
 	}
@@ -654,18 +741,23 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 		rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
 
 out:
-	if (!ret)
+	if (!ret) {
 		rb->aux_pgoff = pgoff;
-	else
+	} else {
+		ring_buffer_unaccount(rb, true);
 		__rb_free_aux(rb);
+	}
 
 	return ret;
 }
 
 void rb_free_aux(struct ring_buffer *rb)
 {
-	if (atomic_dec_and_test(&rb->aux_refcount))
+	if (atomic_dec_and_test(&rb->aux_refcount)) {
+		ring_buffer_unaccount(rb, true);
+
 		__rb_free_aux(rb);
+	}
 }
 
 #ifndef CONFIG_PERF_USE_VMALLOC
@@ -699,22 +791,25 @@ static void *perf_mmap_alloc_page(int cpu)
 	return page_address(page);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+			     int cpu, int flags)
 {
+	unsigned long size = offsetof(struct ring_buffer, data_pages[nr_pages]);
 	struct ring_buffer *rb;
-	unsigned long size;
-	int i;
-
-	size = sizeof(struct ring_buffer);
-	size += nr_pages * sizeof(void *);
+	int i, ret = -ENOMEM;
 
 	rb = kzalloc(size, GFP_KERNEL);
 	if (!rb)
 		goto fail;
 
+	ret = ring_buffer_account(rb, mm, nr_pages, false);
+	if (ret)
+		goto fail_free_rb;
+
+	ret = -ENOMEM;
 	rb->user_page = perf_mmap_alloc_page(cpu);
 	if (!rb->user_page)
-		goto fail_user_page;
+		goto fail_unaccount;
 
 	for (i = 0; i < nr_pages; i++) {
 		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
@@ -734,11 +829,14 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 
 	free_page((unsigned long)rb->user_page);
 
-fail_user_page:
+fail_unaccount:
+	ring_buffer_unaccount(rb, false);
+
+fail_free_rb:
 	kfree(rb);
 
 fail:
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 static void perf_mmap_free_page(unsigned long addr)
@@ -805,19 +903,23 @@ void rb_free(struct ring_buffer *rb)
 	schedule_work(&rb->work);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+			     int cpu, int flags)
 {
+	unsigned long size = offsetof(struct ring_buffer, data_pages[1]);
 	struct ring_buffer *rb;
-	unsigned long size;
 	void *all_buf;
-
-	size = sizeof(struct ring_buffer);
-	size += sizeof(void *);
+	int ret = -ENOMEM;
 
 	rb = kzalloc(size, GFP_KERNEL);
 	if (!rb)
 		goto fail;
 
+	ret = ring_buffer_account(rb, mm, nr_pages, false);
+	if (ret)
+		goto fail_free;
+
+	ret = -ENOMEM;
 	INIT_WORK(&rb->work, rb_free_work);
 
 	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
@@ -836,10 +938,13 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 	return rb;
 
 fail_all_buf:
+	ring_buffer_unaccount(rb, false);
+
+fail_free:
 	kfree(rb);
 
 fail:
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 #endif
-- 
2.14.1