Move ring buffer memory accounting down into the rb_alloc() path, so that rb_alloc()'s callers no longer have to do the accounting themselves. This also has the side effect of slightly cleaning up perf_mmap().
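After this change, accounting failures are reported out of rb_alloc() itself via ERR_PTR() rather than a bare NULL, so a caller reduces to roughly the following pattern (an illustrative caller-side sketch, not code added by this patch; the variable names are taken from perf_mmap()):

	rb = rb_alloc(vma->vm_mm, nr_pages,
		      event->attr.watermark ? event->attr.wakeup_watermark : 0,
		      event->cpu, flags);
	if (IS_ERR_OR_NULL(rb)) {
		/* -EPERM if mlock limits were exceeded, -ENOMEM otherwise */
		ret = PTR_ERR(rb);
		rb = NULL;
		goto unlock;
	}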
Signed-off-by: Alexander Shishkin <alexander.shish...@linux.intel.com>
---
 kernel/events/core.c        |  67 +++-----------------
 kernel/events/internal.h    |   5 +-
 kernel/events/ring_buffer.c | 145 ++++++++++++++++++++++++++++++++++++++------
 3 files changed, 136 insertions(+), 81 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9389e27cb0..24099ed9e5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5122,6 +5122,8 @@ void ring_buffer_put(struct ring_buffer *rb)
 	if (!atomic_dec_and_test(&rb->refcount))
 		return;
 
+	ring_buffer_unaccount(rb, false);
+
 	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
@@ -5156,9 +5158,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 	struct ring_buffer *rb = ring_buffer_get(event);
-	struct user_struct *mmap_user = rb->mmap_user;
-	int mmap_locked = rb->mmap_locked;
-	unsigned long size = perf_data_size(rb);
 
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event, vma->vm_mm);
@@ -5178,11 +5177,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 		 */
 		perf_pmu_output_stop(event);
 
-		/* now it's safe to free the pages */
-		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
-		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
-
-		/* this has to be the last one */
+		/* now it's safe to free the pages; ought to be the last one */
 		rb_free_aux(rb);
 		WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
 
@@ -5243,19 +5238,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	}
 	rcu_read_unlock();
 
-	/*
-	 * It could be there's still a few 0-ref events on the list; they'll
-	 * get cleaned up by free_event() -- they'll also still have their
-	 * ref on the rb and will free it whenever they are done with it.
-	 *
-	 * Aside from that, this buffer is 'fully' detached and unmapped,
-	 * undo the VM accounting.
-	 */
-
-	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-	vma->vm_mm->pinned_vm -= mmap_locked;
-	free_uid(mmap_user);
-
 out_put:
 	ring_buffer_put(rb); /* could be last */
 }
@@ -5270,13 +5252,9 @@ static const struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_event *event = file->private_data;
-	unsigned long user_locked, user_lock_limit;
-	struct user_struct *user = current_user();
-	unsigned long locked, lock_limit;
 	struct ring_buffer *rb = NULL;
 	unsigned long vma_size;
 	unsigned long nr_pages;
-	long user_extra = 0, extra = 0;
 	int ret = 0, flags = 0;
 
 	/*
@@ -5347,7 +5325,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		}
 
 		atomic_set(&rb->aux_mmap_count, 1);
-		user_extra = nr_pages;
 
 		goto accounting;
 	}
@@ -5384,49 +5361,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	user_extra = nr_pages + 1;
-
 accounting:
-	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
-
-	/*
-	 * Increase the limit linearly with more CPUs:
-	 */
-	user_lock_limit *= num_online_cpus();
-
-	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
-
-	if (user_locked > user_lock_limit)
-		extra = user_locked - user_lock_limit;
-
-	lock_limit = rlimit(RLIMIT_MEMLOCK);
-	lock_limit >>= PAGE_SHIFT;
-	locked = vma->vm_mm->pinned_vm + extra;
-
-	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
-	    !capable(CAP_IPC_LOCK)) {
-		ret = -EPERM;
-		goto unlock;
-	}
-
 	WARN_ON(!rb && event->rb);
 
 	if (vma->vm_flags & VM_WRITE)
 		flags |= RING_BUFFER_WRITABLE;
 
 	if (!rb) {
-		rb = rb_alloc(nr_pages,
+		rb = rb_alloc(vma->vm_mm, nr_pages,
 			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
 			      event->cpu, flags);
 
-		if (!rb) {
-			ret = -ENOMEM;
+		if (IS_ERR_OR_NULL(rb)) {
+			ret = PTR_ERR(rb);
+			rb = NULL;
 			goto unlock;
 		}
 
 		atomic_set(&rb->mmap_count, 1);
-		rb->mmap_user = get_current_user();
-		rb->mmap_locked = extra;
 
 		ring_buffer_attach(event, rb);
 
@@ -5435,15 +5387,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	} else {
 		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
 				   event->attr.aux_watermark, flags);
-		if (!ret)
-			rb->aux_mmap_locked = extra;
 	}
 
 unlock:
 	if (!ret) {
-		atomic_long_add(user_extra, &user->locked_vm);
-		vma->vm_mm->pinned_vm += extra;
-
 		atomic_inc(&event->mmap_count);
 	} else if (rb) {
 		atomic_dec(&rb->mmap_count);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 843e970473..3e603c45eb 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -36,6 +36,7 @@ struct ring_buffer {
 	atomic_t			mmap_count;
 	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
+	struct mm_struct		*mmap_mapping;
 
 	/* AUX area */
 	long				aux_head;
@@ -56,6 +57,7 @@ struct ring_buffer {
 };
 
 extern void rb_free(struct ring_buffer *rb);
+extern void ring_buffer_unaccount(struct ring_buffer *rb, bool aux);
 
 static inline void rb_free_rcu(struct rcu_head *rcu_head)
 {
@@ -74,7 +76,8 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
 }
 
 extern struct ring_buffer *
-rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+rb_alloc(struct mm_struct *mm, int nr_pages, long watermark, int cpu,
+	 int flags);
 extern void perf_event_wakeup(struct perf_event *event);
 extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 			pgoff_t pgoff, int nr_pages, long watermark, int flags);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12..d36f169cae 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -505,6 +505,88 @@ void *perf_get_aux(struct perf_output_handle *handle)
 	return handle->rb->aux_priv;
 }
 
+/*
+ * Check if the current user can afford @nr_pages, considering the
+ * perf_event_mlock sysctl and their mlock limit. If the former is exceeded,
+ * pin the remainder on their mm; if the latter is not sufficient either,
+ * error out. Otherwise, keep track of the pages used in the ring_buffer so
+ * that the accounting can be undone when the pages are freed.
+ */
+static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
+			       unsigned long nr_pages, bool aux)
+{
+	unsigned long total, limit, pinned;
+
+	if (!mm)
+		mm = rb->mmap_mapping;
+
+	rb->mmap_user = current_user();
+
+	limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+	/*
+	 * Increase the limit linearly with more CPUs:
+	 */
+	limit *= num_online_cpus();
+
+	total = atomic_long_read(&rb->mmap_user->locked_vm) + nr_pages;
+
+	pinned = 0;
+	if (total > limit) {
+		/*
+		 * Everything that's over the sysctl_perf_event_mlock
+		 * limit needs to be accounted to the consumer's mm.
+		 */
+		if (!mm)
+			return -EPERM;
+
+		pinned = total - limit;
+
+		limit = rlimit(RLIMIT_MEMLOCK);
+		limit >>= PAGE_SHIFT;
+		total = mm->pinned_vm + pinned;
+
+		if ((total > limit) && perf_paranoid_tracepoint_raw() &&
+		    !capable(CAP_IPC_LOCK)) {
+			return -EPERM;
+		}
+
+		if (aux)
+			rb->aux_mmap_locked = pinned;
+		else
+			rb->mmap_locked = pinned;
+
+		mm->pinned_vm += pinned;
+	}
+
+	if (!rb->mmap_mapping)
+		rb->mmap_mapping = mm;
+
+	/* account for user page */
+	if (!aux)
+		nr_pages++;
+
+	rb->mmap_user = get_current_user();
+	atomic_long_add(nr_pages, &rb->mmap_user->locked_vm);
+
+	return 0;
+}
+
+/*
+ * Undo the mlock pages accounting done in ring_buffer_account().
+ */
+void ring_buffer_unaccount(struct ring_buffer *rb, bool aux)
+{
+	unsigned long nr_pages = aux ? rb->aux_nr_pages : rb->nr_pages + 1;
+	unsigned long pinned = aux ? rb->aux_mmap_locked : rb->mmap_locked;
+
+	atomic_long_sub(nr_pages, &rb->mmap_user->locked_vm);
+	if (rb->mmap_mapping)
+		rb->mmap_mapping->pinned_vm -= pinned;
+
+	free_uid(rb->mmap_user);
+}
+
 #define PERF_AUX_GFP  (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
 static struct page *rb_alloc_aux_page(int node, int order)
@@ -574,11 +656,16 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 {
 	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
 	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
-	int ret = -ENOMEM, max_order = 0;
+	int ret, max_order = 0;
 
 	if (!has_aux(event))
 		return -EOPNOTSUPP;
 
+	ret = ring_buffer_account(rb, NULL, nr_pages, true);
+	if (ret)
+		return ret;
+
+	ret = -ENOMEM;
 	if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
 		/*
 		 * We need to start with the max_order that fits in nr_pages,
@@ -593,7 +680,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 	if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
 	    !overwrite) {
 		if (!max_order)
-			return -EINVAL;
+			goto out;
 
 		max_order--;
 	}
@@ -654,18 +741,23 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 		rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
 
 out:
-	if (!ret)
+	if (!ret) {
 		rb->aux_pgoff = pgoff;
-	else
+	} else {
+		ring_buffer_unaccount(rb, true);
 		__rb_free_aux(rb);
+	}
 
 	return ret;
 }
 
 void rb_free_aux(struct ring_buffer *rb)
 {
-	if (atomic_dec_and_test(&rb->aux_refcount))
+	if (atomic_dec_and_test(&rb->aux_refcount)) {
+		ring_buffer_unaccount(rb, true);
+
 		__rb_free_aux(rb);
+	}
 }
 
 #ifndef CONFIG_PERF_USE_VMALLOC
@@ -699,22 +791,25 @@ static void *perf_mmap_alloc_page(int cpu)
 	return page_address(page);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+			     int cpu, int flags)
 {
+	unsigned long size = offsetof(struct ring_buffer, data_pages[nr_pages]);
 	struct ring_buffer *rb;
-	unsigned long size;
-	int i;
-
-	size = sizeof(struct ring_buffer);
-	size += nr_pages * sizeof(void *);
+	int i, ret = -ENOMEM;
 
 	rb = kzalloc(size, GFP_KERNEL);
 	if (!rb)
 		goto fail;
 
+	ret = ring_buffer_account(rb, mm, nr_pages, false);
+	if (ret)
+		goto fail_free_rb;
+
+	ret = -ENOMEM;
 	rb->user_page = perf_mmap_alloc_page(cpu);
 	if (!rb->user_page)
-		goto fail_user_page;
+		goto fail_unaccount;
 
 	for (i = 0; i < nr_pages; i++) {
 		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
@@ -734,11 +829,14 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 
 	free_page((unsigned long)rb->user_page);
 
-fail_user_page:
+fail_unaccount:
+	ring_buffer_unaccount(rb, false);
+
+fail_free_rb:
 	kfree(rb);
 
 fail:
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 static void perf_mmap_free_page(unsigned long addr)
@@ -805,19 +903,23 @@ void rb_free(struct ring_buffer *rb)
 	schedule_work(&rb->work);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+			     int cpu, int flags)
 {
+	unsigned long size = offsetof(struct ring_buffer, data_pages[1]);
 	struct ring_buffer *rb;
-	unsigned long size;
 	void *all_buf;
-
-	size = sizeof(struct ring_buffer);
-	size += sizeof(void *);
+	int ret = -ENOMEM;
 
 	rb = kzalloc(size, GFP_KERNEL);
 	if (!rb)
 		goto fail;
 
+	ret = ring_buffer_account(rb, mm, nr_pages, false);
+	if (ret)
+		goto fail_free;
+
+	ret = -ENOMEM;
 	INIT_WORK(&rb->work, rb_free_work);
 
 	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
@@ -836,10 +938,13 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 	return rb;
 
 fail_all_buf:
+	ring_buffer_unaccount(rb, false);
+
+fail_free:
 	kfree(rb);
 
 fail:
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 #endif
-- 
2.14.1