On Fri, Jan 20, 2017 at 12:30:38PM -0800, David Carrillo-Cisneros wrote:
> On Fri, Jan 20, 2017 at 1:20 AM, Peter Zijlstra <pet...@infradead.org> wrote:
> > On Wed, Jan 18, 2017 at 11:24:54AM -0800, David Carrillo-Cisneros wrote:
> >> cpuctx->unique_pmu was originally introduced as a way to identify cpuctxs
> >> with shared pmus in order to avoid visiting the same cpuctx more than once
> >> in a for_each_pmu loop.
> >>
> >> cpuctx->unique_pmu == cpuctx->pmu in non-software task contexts since they
> >> have only one pmu per cpuctx. Since perf_pmu_sched_task is only called in
> >> hw contexts, this patch replaces cpuctx->unique_pmu by cpuctx->pmu in it.
> >>
> >> The change above, together with the previous patch in this series, removed
> >> the remaining uses of cpuctx->unique_pmu, so we remove it altogether.
> >>
> >> Signed-off-by: David Carrillo-Cisneros <davi...@google.com>
> >> Acked-by: Mark Rutland <mark.rutl...@arm.com>
> >

> >
> > This very much relies on us never calling perf_pmu_unregister() on the
> > software PMUs afaict. A condition not mentioned in the Changelog.
> >
> What's a good way to solve this? Update the Changelog or add code to
> update ctx->pmu?

I think just update the Changelog and maybe put a comment near
perf_pmu_register() and/or the sw pmu abuse that relies on this.
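
Something like the below, say (a rough sketch only; exact wording and
placement up for debate):

	/*
	 * NB: the software PMUs -- perf_swevent, perf_cpu_clock,
	 * perf_task_clock and perf_tracepoint -- must never be
	 * perf_pmu_unregister()'ed; the sw context sharing set up at
	 * perf_pmu_register() time (ctx->pmu et al.) relies on them
	 * staying around forever.
	 */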

> This issue would go away cleanly if we were to remove the context
> sharing across pmu's. Would you support work in that direction?

It's something that I've considered; the trivial solution is folding it
all into the one swevent pmu by adding a switch in all the
add/del/start/stop/read methods. It's a wee bit ugly but straightforward.
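
Concretely, each of the add/del/start/stop/read methods becomes a switch
on the event type; e.g. for ->add(), something like the sketch below
(swevent_pmu_add and __perf_swevent_add are made-up names here, the
latter being the current hlist based perf_swevent_add renamed):

static int swevent_pmu_add(struct perf_event *event, int flags)
{
	switch (event->attr.type) {
	case PERF_TYPE_TRACEPOINT:
		return perf_trace_add(event, flags);

	case PERF_TYPE_SOFTWARE:
		switch (event->attr.config) {
		case PERF_COUNT_SW_CPU_CLOCK:
			return cpu_clock_event_add(event, flags);
		case PERF_COUNT_SW_TASK_CLOCK:
			return task_clock_event_add(event, flags);
		default:
			/* the hlist based software events */
			return __perf_swevent_add(event, flags);
		}

	default:
		/* ->event_init() should have filtered these out */
		return -EINVAL;
	}
}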

I've not really found anything less ugly though, and I have to admit the
current situation is rather vile.


I also just found the below patch that I've had bitrotting since 2015.

---
Subject: perf: Move all software PMUs into their own file
From: Peter Zijlstra <pet...@infradead.org>
Date: Fri Apr 17 19:52:17 CEST 2015


Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 kernel/events/Makefile   |    2 
 kernel/events/core.c     | 1280 +++++------------------------------------------
 kernel/events/internal.h |   13 
 kernel/events/software.c | 1021 +++++++++++++++++++++++++++++++++++++
 4 files changed, 1184 insertions(+), 1132 deletions(-)

--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,7 +2,7 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
 endif
 
-obj-y := core.o ring_buffer.o callchain.o
+obj-y := core.o software.o ring_buffer.o callchain.o
 
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_UPROBES) += uprobes.o
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,14 +36,11 @@
 #include <linux/kernel_stat.h>
 #include <linux/cgroup.h>
 #include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
-#include <linux/bpf.h>
-#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -1828,8 +1825,6 @@ static void perf_set_shadow_time(struct
                event->shadow_ctx_time = tstamp - ctx->timestamp;
 }
 
-#define MAX_INTERRUPTS (~0ULL)
-
 static void perf_log_throttle(struct perf_event *event, int enable);
 static void perf_log_itrace_start(struct perf_event *event);
 
@@ -3411,9 +3406,6 @@ find_get_context(struct pmu *pmu, struct
        return ERR_PTR(err);
 }
 
-static void perf_event_free_filter(struct perf_event *event);
-static void perf_event_free_bpf_prog(struct perf_event *event);
-
 static void free_event_rcu(struct rcu_head *head)
 {
        struct perf_event *event;
@@ -4020,8 +4012,6 @@ static inline int perf_fget_light(int fd
 
 static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
-static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
@@ -6036,9 +6026,9 @@ static void perf_log_itrace_start(struct
  * Generic event overflow handling, sampling.
  */
 
-static int __perf_event_overflow(struct perf_event *event,
-                                  int throttle, struct perf_sample_data *data,
-                                  struct pt_regs *regs)
+int __perf_event_overflow(struct perf_event *event,
+                         int throttle, struct perf_sample_data *data,
+                         struct pt_regs *regs)
 {
        int events = atomic_read(&event->event_limit);
        struct hw_perf_event *hwc = &event->hw;
@@ -6111,1155 +6101,223 @@ int perf_event_overflow(struct perf_even
        return __perf_event_overflow(event, 1, data, regs);
 }
 
-/*
- * Generic software event infrastructure
- */
-
-struct swevent_htable {
-       struct swevent_hlist            *swevent_hlist;
-       struct mutex                    hlist_mutex;
-       int                             hlist_refcount;
-
-       /* Recursion avoidance in each contexts */
-       int                             recursion[PERF_NR_CONTEXTS];
-
-       /* Keeps track of cpu being initialized/exited */
-       bool                            online;
-};
-
-static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
-
-/*
- * We directly increment event->count and keep a second value in
- * event->hw.period_left to count intervals. This period event
- * is kept in the range [-sample_period, 0] so that we can use the
- * sign as trigger.
- */
-
-u64 perf_swevent_set_period(struct perf_event *event)
+static void perf_pmu_nop_void(struct pmu *pmu)
 {
-       struct hw_perf_event *hwc = &event->hw;
-       u64 period = hwc->last_period;
-       u64 nr, offset;
-       s64 old, val;
-
-       hwc->last_period = hwc->sample_period;
-
-again:
-       old = val = local64_read(&hwc->period_left);
-       if (val < 0)
-               return 0;
-
-       nr = div64_u64(period + val, period);
-       offset = nr * period;
-       val -= offset;
-       if (local64_cmpxchg(&hwc->period_left, old, val) != old)
-               goto again;
-
-       return nr;
 }
 
-static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
-                                   struct perf_sample_data *data,
-                                   struct pt_regs *regs)
+static int perf_pmu_nop_int(struct pmu *pmu)
 {
-       struct hw_perf_event *hwc = &event->hw;
-       int throttle = 0;
-
-       if (!overflow)
-               overflow = perf_swevent_set_period(event);
-
-       if (hwc->interrupts == MAX_INTERRUPTS)
-               return;
-
-       for (; overflow; overflow--) {
-               if (__perf_event_overflow(event, throttle,
-                                           data, regs)) {
-                       /*
-                        * We inhibit the overflow from happening when
-                        * hwc->interrupts == MAX_INTERRUPTS.
-                        */
-                       break;
-               }
-               throttle = 1;
-       }
+       return 0;
 }
 
-static void perf_swevent_event(struct perf_event *event, u64 nr,
-                              struct perf_sample_data *data,
-                              struct pt_regs *regs)
+static void perf_pmu_start_txn(struct pmu *pmu)
 {
-       struct hw_perf_event *hwc = &event->hw;
-
-       local64_add(nr, &event->count);
-
-       if (!regs)
-               return;
-
-       if (!is_sampling_event(event))
-               return;
-
-       if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
-               data->period = nr;
-               return perf_swevent_overflow(event, 1, data, regs);
-       } else
-               data->period = event->hw.last_period;
-
-       if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
-               return perf_swevent_overflow(event, 1, data, regs);
-
-       if (local64_add_negative(nr, &hwc->period_left))
-               return;
-
-       perf_swevent_overflow(event, 0, data, regs);
+       perf_pmu_disable(pmu);
 }
 
-static int perf_exclude_event(struct perf_event *event,
-                             struct pt_regs *regs)
+static int perf_pmu_commit_txn(struct pmu *pmu)
 {
-       if (event->hw.state & PERF_HES_STOPPED)
-               return 1;
-
-       if (regs) {
-               if (event->attr.exclude_user && user_mode(regs))
-                       return 1;
-
-               if (event->attr.exclude_kernel && !user_mode(regs))
-                       return 1;
-       }
-
+       perf_pmu_enable(pmu);
        return 0;
 }
 
-static int perf_swevent_match(struct perf_event *event,
-                               enum perf_type_id type,
-                               u32 event_id,
-                               struct perf_sample_data *data,
-                               struct pt_regs *regs)
-{
-       if (event->attr.type != type)
-               return 0;
-
-       if (event->attr.config != event_id)
-               return 0;
-
-       if (perf_exclude_event(event, regs))
-               return 0;
-
-       return 1;
-}
-
-static inline u64 swevent_hash(u64 type, u32 event_id)
+static void perf_pmu_cancel_txn(struct pmu *pmu)
 {
-       u64 val = event_id | (type << 32);
-
-       return hash_64(val, SWEVENT_HLIST_BITS);
+       perf_pmu_enable(pmu);
 }
 
-static inline struct hlist_head *
-__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
+static int perf_event_idx_default(struct perf_event *event)
 {
-       u64 hash = swevent_hash(type, event_id);
-
-       return &hlist->heads[hash];
+       return 0;
 }
 
-/* For the read side: events when they trigger */
-static inline struct hlist_head *
-find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 {
-       struct swevent_hlist *hlist;
+       struct pmu *pmu;
 
-       hlist = rcu_dereference(swhash->swevent_hlist);
-       if (!hlist)
+       if (ctxn < 0)
                return NULL;
 
-       return __find_swevent_head(hlist, type, event_id);
+       list_for_each_entry(pmu, &pmus, entry) {
+               if (pmu->task_ctx_nr == ctxn)
+                       return pmu->pmu_cpu_context;
+       }
+
+       return NULL;
 }
 
-/* For the event head insertion and removal in the hlist */
-static inline struct hlist_head *
-find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
 {
-       struct swevent_hlist *hlist;
-       u32 event_id = event->attr.config;
-       u64 type = event->attr.type;
-
-       /*
-        * Event scheduling is always serialized against hlist allocation
-        * and release. Which makes the protected version suitable here.
-        * The context lock guarantees that.
-        */
-       hlist = rcu_dereference_protected(swhash->swevent_hlist,
-                                         lockdep_is_held(&event->ctx->lock));
-       if (!hlist)
-               return NULL;
+       int cpu;
 
-       return __find_swevent_head(hlist, type, event_id);
-}
+       for_each_possible_cpu(cpu) {
+               struct perf_cpu_context *cpuctx;
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-                                   u64 nr,
-                                   struct perf_sample_data *data,
-                                   struct pt_regs *regs)
-{
-       struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-       struct perf_event *event;
-       struct hlist_head *head;
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 
-       rcu_read_lock();
-       head = find_swevent_head_rcu(swhash, type, event_id);
-       if (!head)
-               goto end;
-
-       hlist_for_each_entry_rcu(event, head, hlist_entry) {
-               if (perf_swevent_match(event, type, event_id, data, regs))
-                       perf_swevent_event(event, nr, data, regs);
+               if (cpuctx->unique_pmu == old_pmu)
+                       cpuctx->unique_pmu = pmu;
        }
-end:
-       rcu_read_unlock();
 }
 
-DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
-
-int perf_swevent_get_recursion_context(void)
+static void free_pmu_context(struct pmu *pmu)
 {
-       struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-
-       return get_recursion_context(swhash->recursion);
-}
-EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+       struct pmu *i;
 
-inline void perf_swevent_put_recursion_context(int rctx)
-{
-       struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+       mutex_lock(&pmus_lock);
+       /*
+        * Like a real lame refcount.
+        */
+       list_for_each_entry(i, &pmus, entry) {
+               if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+                       update_pmu_context(i, pmu);
+                       goto out;
+               }
+       }
 
-       put_recursion_context(swhash->recursion, rctx);
+       free_percpu(pmu->pmu_cpu_context);
+out:
+       mutex_unlock(&pmus_lock);
 }
+static struct idr pmu_idr;
 
-void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
 {
-       struct perf_sample_data data;
-
-       if (WARN_ON_ONCE(!regs))
-               return;
+       struct pmu *pmu = dev_get_drvdata(dev);
 
-       perf_sample_data_init(&data, addr, 0);
-       do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+       return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
 }
+static DEVICE_ATTR_RO(type);
 
-void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+                               struct device_attribute *attr,
+                               char *page)
 {
-       int rctx;
-
-       preempt_disable_notrace();
-       rctx = perf_swevent_get_recursion_context();
-       if (unlikely(rctx < 0))
-               goto fail;
-
-       ___perf_sw_event(event_id, nr, regs, addr);
+       struct pmu *pmu = dev_get_drvdata(dev);
 
-       perf_swevent_put_recursion_context(rctx);
-fail:
-       preempt_enable_notrace();
+       return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
 }
 
-static void perf_swevent_read(struct perf_event *event)
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t count)
 {
-}
+       struct pmu *pmu = dev_get_drvdata(dev);
+       int timer, cpu, ret;
 
-static int perf_swevent_add(struct perf_event *event, int flags)
-{
-       struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-       struct hw_perf_event *hwc = &event->hw;
-       struct hlist_head *head;
+       ret = kstrtoint(buf, 0, &timer);
+       if (ret)
+               return ret;
 
-       if (is_sampling_event(event)) {
-               hwc->last_period = hwc->sample_period;
-               perf_swevent_set_period(event);
-       }
+       if (timer < 1)
+               return -EINVAL;
 
-       hwc->state = !(flags & PERF_EF_START);
+       /* same value, nothing to do */
+       if (timer == pmu->hrtimer_interval_ms)
+               return count;
 
-       head = find_swevent_head(swhash, event);
-       if (!head) {
-               /*
-                * We can race with cpu hotplug code. Do not
-                * WARN if the cpu just got unplugged.
-                */
-               WARN_ON_ONCE(swhash->online);
-               return -EINVAL;
-       }
+       pmu->hrtimer_interval_ms = timer;
 
-       hlist_add_head_rcu(&event->hlist_entry, head);
-       perf_event_update_userpage(event);
+       /* update all cpuctx for this PMU */
+       for_each_possible_cpu(cpu) {
+               struct perf_cpu_context *cpuctx;
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 
-       return 0;
-}
+               if (hrtimer_active(&cpuctx->hrtimer))
+                       hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+       }
 
-static void perf_swevent_del(struct perf_event *event, int flags)
-{
-       hlist_del_rcu(&event->hlist_entry);
+       return count;
 }
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
 
-static void perf_swevent_start(struct perf_event *event, int flags)
-{
-       event->hw.state = 0;
-}
+static struct attribute *pmu_dev_attrs[] = {
+       &dev_attr_type.attr,
+       &dev_attr_perf_event_mux_interval_ms.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(pmu_dev);
 
-static void perf_swevent_stop(struct perf_event *event, int flags)
-{
-       event->hw.state = PERF_HES_STOPPED;
-}
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+       .name           = "event_source",
+       .dev_groups     = pmu_dev_groups,
+};
 
-/* Deref the hlist from the update side */
-static inline struct swevent_hlist *
-swevent_hlist_deref(struct swevent_htable *swhash)
+static void pmu_dev_release(struct device *dev)
 {
-       return rcu_dereference_protected(swhash->swevent_hlist,
-                                        lockdep_is_held(&swhash->hlist_mutex));
+       kfree(dev);
 }
 
-static void swevent_hlist_release(struct swevent_htable *swhash)
+static int pmu_dev_alloc(struct pmu *pmu)
 {
-       struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
-
-       if (!hlist)
-               return;
+       int ret = -ENOMEM;
 
-       RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
-       kfree_rcu(hlist, rcu_head);
-}
+       pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+       if (!pmu->dev)
+               goto out;
 
-static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
-{
-       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+       pmu->dev->groups = pmu->attr_groups;
+       device_initialize(pmu->dev);
+       ret = dev_set_name(pmu->dev, "%s", pmu->name);
+       if (ret)
+               goto free_dev;
 
-       mutex_lock(&swhash->hlist_mutex);
+       dev_set_drvdata(pmu->dev, pmu);
+       pmu->dev->bus = &pmu_bus;
+       pmu->dev->release = pmu_dev_release;
+       ret = device_add(pmu->dev);
+       if (ret)
+               goto free_dev;
 
-       if (!--swhash->hlist_refcount)
-               swevent_hlist_release(swhash);
+out:
+       return ret;
 
-       mutex_unlock(&swhash->hlist_mutex);
+free_dev:
+       put_device(pmu->dev);
+       goto out;
 }
 
-static void swevent_hlist_put(struct perf_event *event)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               swevent_hlist_put_cpu(event, cpu);
-}
+static struct lock_class_key cpuctx_mutex;
+static struct lock_class_key cpuctx_lock;
 
-static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 {
-       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-       int err = 0;
+       int cpu, ret;
 
-       mutex_lock(&swhash->hlist_mutex);
+       mutex_lock(&pmus_lock);
+       ret = -ENOMEM;
+       pmu->pmu_disable_count = alloc_percpu(int);
+       if (!pmu->pmu_disable_count)
+               goto unlock;
 
-       if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
-               struct swevent_hlist *hlist;
+       pmu->type = -1;
+       if (!name)
+               goto skip_type;
+       pmu->name = name;
 
-               hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
-               if (!hlist) {
-                       err = -ENOMEM;
-                       goto exit;
+       if (type < 0) {
+               type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+               if (type < 0) {
+                       ret = type;
+                       goto free_pdc;
                }
-               rcu_assign_pointer(swhash->swevent_hlist, hlist);
        }
-       swhash->hlist_refcount++;
-exit:
-       mutex_unlock(&swhash->hlist_mutex);
+       pmu->type = type;
 
-       return err;
-}
-
-static int swevent_hlist_get(struct perf_event *event)
-{
-       int err;
-       int cpu, failed_cpu;
-
-       get_online_cpus();
-       for_each_possible_cpu(cpu) {
-               err = swevent_hlist_get_cpu(event, cpu);
-               if (err) {
-                       failed_cpu = cpu;
-                       goto fail;
-               }
-       }
-       put_online_cpus();
-
-       return 0;
-fail:
-       for_each_possible_cpu(cpu) {
-               if (cpu == failed_cpu)
-                       break;
-               swevent_hlist_put_cpu(event, cpu);
-       }
-
-       put_online_cpus();
-       return err;
-}
-
-struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
-
-static void sw_perf_event_destroy(struct perf_event *event)
-{
-       u64 event_id = event->attr.config;
-
-       WARN_ON(event->parent);
-
-       static_key_slow_dec(&perf_swevent_enabled[event_id]);
-       swevent_hlist_put(event);
-}
-
-static int perf_swevent_init(struct perf_event *event)
-{
-       u64 event_id = event->attr.config;
-
-       if (event->attr.type != PERF_TYPE_SOFTWARE)
-               return -ENOENT;
-
-       /*
-        * no branch sampling for software events
-        */
-       if (has_branch_stack(event))
-               return -EOPNOTSUPP;
-
-       switch (event_id) {
-       case PERF_COUNT_SW_CPU_CLOCK:
-       case PERF_COUNT_SW_TASK_CLOCK:
-               return -ENOENT;
-
-       default:
-               break;
-       }
-
-       if (event_id >= PERF_COUNT_SW_MAX)
-               return -ENOENT;
-
-       if (!event->parent) {
-               int err;
-
-               err = swevent_hlist_get(event);
-               if (err)
-                       return err;
-
-               static_key_slow_inc(&perf_swevent_enabled[event_id]);
-               event->destroy = sw_perf_event_destroy;
-       }
-
-       return 0;
-}
-
-static struct pmu perf_swevent = {
-       .task_ctx_nr    = perf_sw_context,
-
-       .capabilities   = PERF_PMU_CAP_NO_NMI,
-
-       .event_init     = perf_swevent_init,
-       .add            = perf_swevent_add,
-       .del            = perf_swevent_del,
-       .start          = perf_swevent_start,
-       .stop           = perf_swevent_stop,
-       .read           = perf_swevent_read,
-};
-
-#ifdef CONFIG_EVENT_TRACING
-
-static int perf_tp_filter_match(struct perf_event *event,
-                               struct perf_sample_data *data)
-{
-       void *record = data->raw->data;
-
-       if (likely(!event->filter) || filter_match_preds(event->filter, record))
-               return 1;
-       return 0;
-}
-
-static int perf_tp_event_match(struct perf_event *event,
-                               struct perf_sample_data *data,
-                               struct pt_regs *regs)
-{
-       if (event->hw.state & PERF_HES_STOPPED)
-               return 0;
-       /*
-        * All tracepoints are from kernel-space.
-        */
-       if (event->attr.exclude_kernel)
-               return 0;
-
-       if (!perf_tp_filter_match(event, data))
-               return 0;
-
-       return 1;
-}
-
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
-                  struct pt_regs *regs, struct hlist_head *head, int rctx,
-                  struct task_struct *task)
-{
-       struct perf_sample_data data;
-       struct perf_event *event;
-
-       struct perf_raw_record raw = {
-               .size = entry_size,
-               .data = record,
-       };
-
-       perf_sample_data_init(&data, addr, 0);
-       data.raw = &raw;
-
-       hlist_for_each_entry_rcu(event, head, hlist_entry) {
-               if (perf_tp_event_match(event, &data, regs))
-                       perf_swevent_event(event, count, &data, regs);
-       }
-
-       /*
-        * If we got specified a target task, also iterate its context and
-        * deliver this event there too.
-        */
-       if (task && task != current) {
-               struct perf_event_context *ctx;
-               struct trace_entry *entry = record;
-
-               rcu_read_lock();
-               ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
-               if (!ctx)
-                       goto unlock;
-
-               list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-                       if (event->attr.type != PERF_TYPE_TRACEPOINT)
-                               continue;
-                       if (event->attr.config != entry->type)
-                               continue;
-                       if (perf_tp_event_match(event, &data, regs))
-                               perf_swevent_event(event, count, &data, regs);
-               }
-unlock:
-               rcu_read_unlock();
-       }
-
-       perf_swevent_put_recursion_context(rctx);
-}
-EXPORT_SYMBOL_GPL(perf_tp_event);
-
-static void tp_perf_event_destroy(struct perf_event *event)
-{
-       perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
-       int err;
-
-       if (event->attr.type != PERF_TYPE_TRACEPOINT)
-               return -ENOENT;
-
-       /*
-        * no branch sampling for tracepoint events
-        */
-       if (has_branch_stack(event))
-               return -EOPNOTSUPP;
-
-       err = perf_trace_init(event);
-       if (err)
-               return err;
-
-       event->destroy = tp_perf_event_destroy;
-
-       return 0;
-}
-
-static struct pmu perf_tracepoint = {
-       .task_ctx_nr    = perf_sw_context,
-
-       .event_init     = perf_tp_event_init,
-       .add            = perf_trace_add,
-       .del            = perf_trace_del,
-       .start          = perf_swevent_start,
-       .stop           = perf_swevent_stop,
-       .read           = perf_swevent_read,
-};
-
-static inline void perf_tp_register(void)
-{
-       perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
-}
-
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
-       char *filter_str;
-       int ret;
-
-       if (event->attr.type != PERF_TYPE_TRACEPOINT)
-               return -EINVAL;
-
-       filter_str = strndup_user(arg, PAGE_SIZE);
-       if (IS_ERR(filter_str))
-               return PTR_ERR(filter_str);
-
-       ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
-
-       kfree(filter_str);
-       return ret;
-}
-
-static void perf_event_free_filter(struct perf_event *event)
-{
-       ftrace_profile_free_filter(event);
-}
-
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
-{
-       struct bpf_prog *prog;
-
-       if (event->attr.type != PERF_TYPE_TRACEPOINT)
-               return -EINVAL;
-
-       if (event->tp_event->prog)
-               return -EEXIST;
-
-       if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
-               /* bpf programs can only be attached to kprobes */
-               return -EINVAL;
-
-       prog = bpf_prog_get(prog_fd);
-       if (IS_ERR(prog))
-               return PTR_ERR(prog);
-
-       if (prog->type != BPF_PROG_TYPE_KPROBE) {
-               /* valid fd, but invalid bpf program type */
-               bpf_prog_put(prog);
-               return -EINVAL;
-       }
-
-       event->tp_event->prog = prog;
-
-       return 0;
-}
-
-static void perf_event_free_bpf_prog(struct perf_event *event)
-{
-       struct bpf_prog *prog;
-
-       if (!event->tp_event)
-               return;
-
-       prog = event->tp_event->prog;
-       if (prog) {
-               event->tp_event->prog = NULL;
-               bpf_prog_put(prog);
-       }
-}
-
-#else
-
-static inline void perf_tp_register(void)
-{
-}
-
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
-       return -ENOENT;
-}
-
-static void perf_event_free_filter(struct perf_event *event)
-{
-}
-
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
-{
-       return -ENOENT;
-}
-
-static void perf_event_free_bpf_prog(struct perf_event *event)
-{
-}
-#endif /* CONFIG_EVENT_TRACING */
-
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-void perf_bp_event(struct perf_event *bp, void *data)
-{
-       struct perf_sample_data sample;
-       struct pt_regs *regs = data;
-
-       perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
-
-       if (!bp->hw.state && !perf_exclude_event(bp, regs))
-               perf_swevent_event(bp, 1, &sample, regs);
-}
-#endif
-
-/*
- * hrtimer based swevent callback
- */
-
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
-{
-       enum hrtimer_restart ret = HRTIMER_RESTART;
-       struct perf_sample_data data;
-       struct pt_regs *regs;
-       struct perf_event *event;
-       u64 period;
-
-       event = container_of(hrtimer, struct perf_event, hw.hrtimer);
-
-       if (event->state != PERF_EVENT_STATE_ACTIVE)
-               return HRTIMER_NORESTART;
-
-       event->pmu->read(event);
-
-       perf_sample_data_init(&data, 0, event->hw.last_period);
-       regs = get_irq_regs();
-
-       if (regs && !perf_exclude_event(event, regs)) {
-               if (!(event->attr.exclude_idle && is_idle_task(current)))
-                       if (__perf_event_overflow(event, 1, &data, regs))
-                               ret = HRTIMER_NORESTART;
-       }
-
-       period = max_t(u64, 10000, event->hw.sample_period);
-       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
-       return ret;
-}
-
-static void perf_swevent_start_hrtimer(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       s64 period;
-
-       if (!is_sampling_event(event))
-               return;
-
-       period = local64_read(&hwc->period_left);
-       if (period) {
-               if (period < 0)
-                       period = 10000;
-
-               local64_set(&hwc->period_left, 0);
-       } else {
-               period = max_t(u64, 10000, hwc->sample_period);
-       }
-       __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL_PINNED, 0);
-}
-
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (is_sampling_event(event)) {
-               ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
-               local64_set(&hwc->period_left, ktime_to_ns(remaining));
-
-               hrtimer_cancel(&hwc->hrtimer);
-       }
-}
-
-static void perf_swevent_init_hrtimer(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (!is_sampling_event(event))
-               return;
-
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swevent_hrtimer;
-
-       /*
-        * Since hrtimers have a fixed rate, we can do a static freq->period
-        * mapping and avoid the whole period adjust feedback stuff.
-        */
-       if (event->attr.freq) {
-               long freq = event->attr.sample_freq;
-
-               event->attr.sample_period = NSEC_PER_SEC / freq;
-               hwc->sample_period = event->attr.sample_period;
-               local64_set(&hwc->period_left, hwc->sample_period);
-               hwc->last_period = hwc->sample_period;
-               event->attr.freq = 0;
-       }
-}
-
-/*
- * Software event: cpu wall time clock
- */
-
-static void cpu_clock_event_update(struct perf_event *event)
-{
-       s64 prev;
-       u64 now;
-
-       now = local_clock();
-       prev = local64_xchg(&event->hw.prev_count, now);
-       local64_add(now - prev, &event->count);
-}
-
-static void cpu_clock_event_start(struct perf_event *event, int flags)
-{
-       local64_set(&event->hw.prev_count, local_clock());
-       perf_swevent_start_hrtimer(event);
-}
-
-static void cpu_clock_event_stop(struct perf_event *event, int flags)
-{
-       perf_swevent_cancel_hrtimer(event);
-       cpu_clock_event_update(event);
-}
-
-static int cpu_clock_event_add(struct perf_event *event, int flags)
-{
-       if (flags & PERF_EF_START)
-               cpu_clock_event_start(event, flags);
-       perf_event_update_userpage(event);
-
-       return 0;
-}
-
-static void cpu_clock_event_del(struct perf_event *event, int flags)
-{
-       cpu_clock_event_stop(event, flags);
-}
-
-static void cpu_clock_event_read(struct perf_event *event)
-{
-       cpu_clock_event_update(event);
-}
-
-static int cpu_clock_event_init(struct perf_event *event)
-{
-       if (event->attr.type != PERF_TYPE_SOFTWARE)
-               return -ENOENT;
-
-       if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
-               return -ENOENT;
-
-       /*
-        * no branch sampling for software events
-        */
-       if (has_branch_stack(event))
-               return -EOPNOTSUPP;
-
-       perf_swevent_init_hrtimer(event);
-
-       return 0;
-}
-
-static struct pmu perf_cpu_clock = {
-       .task_ctx_nr    = perf_sw_context,
-
-       .capabilities   = PERF_PMU_CAP_NO_NMI,
-
-       .event_init     = cpu_clock_event_init,
-       .add            = cpu_clock_event_add,
-       .del            = cpu_clock_event_del,
-       .start          = cpu_clock_event_start,
-       .stop           = cpu_clock_event_stop,
-       .read           = cpu_clock_event_read,
-};
-
-/*
- * Software event: task time clock
- */
-
-static void task_clock_event_update(struct perf_event *event, u64 now)
-{
-       u64 prev;
-       s64 delta;
-
-       prev = local64_xchg(&event->hw.prev_count, now);
-       delta = now - prev;
-       local64_add(delta, &event->count);
-}
-
-static void task_clock_event_start(struct perf_event *event, int flags)
-{
-       local64_set(&event->hw.prev_count, event->ctx->time);
-       perf_swevent_start_hrtimer(event);
-}
-
-static void task_clock_event_stop(struct perf_event *event, int flags)
-{
-       perf_swevent_cancel_hrtimer(event);
-       task_clock_event_update(event, event->ctx->time);
-}
-
-static int task_clock_event_add(struct perf_event *event, int flags)
-{
-       if (flags & PERF_EF_START)
-               task_clock_event_start(event, flags);
-       perf_event_update_userpage(event);
-
-       return 0;
-}
-
-static void task_clock_event_del(struct perf_event *event, int flags)
-{
-       task_clock_event_stop(event, PERF_EF_UPDATE);
-}
-
-static void task_clock_event_read(struct perf_event *event)
-{
-       u64 now = perf_clock();
-       u64 delta = now - event->ctx->timestamp;
-       u64 time = event->ctx->time + delta;
-
-       task_clock_event_update(event, time);
-}
-
-static int task_clock_event_init(struct perf_event *event)
-{
-       if (event->attr.type != PERF_TYPE_SOFTWARE)
-               return -ENOENT;
-
-       if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
-               return -ENOENT;
-
-       /*
-        * no branch sampling for software events
-        */
-       if (has_branch_stack(event))
-               return -EOPNOTSUPP;
-
-       perf_swevent_init_hrtimer(event);
-
-       return 0;
-}
-
-static struct pmu perf_task_clock = {
-       .task_ctx_nr    = perf_sw_context,
-
-       .capabilities   = PERF_PMU_CAP_NO_NMI,
-
-       .event_init     = task_clock_event_init,
-       .add            = task_clock_event_add,
-       .del            = task_clock_event_del,
-       .start          = task_clock_event_start,
-       .stop           = task_clock_event_stop,
-       .read           = task_clock_event_read,
-};
-
-static void perf_pmu_nop_void(struct pmu *pmu)
-{
-}
-
-static int perf_pmu_nop_int(struct pmu *pmu)
-{
-       return 0;
-}
-
-static void perf_pmu_start_txn(struct pmu *pmu)
-{
-       perf_pmu_disable(pmu);
-}
-
-static int perf_pmu_commit_txn(struct pmu *pmu)
-{
-       perf_pmu_enable(pmu);
-       return 0;
-}
-
-static void perf_pmu_cancel_txn(struct pmu *pmu)
-{
-       perf_pmu_enable(pmu);
-}
-
-static int perf_event_idx_default(struct perf_event *event)
-{
-       return 0;
-}
-
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
-       struct pmu *pmu;
-
-       if (ctxn < 0)
-               return NULL;
-
-       list_for_each_entry(pmu, &pmus, entry) {
-               if (pmu->task_ctx_nr == ctxn)
-                       return pmu->pmu_cpu_context;
-       }
-
-       return NULL;
-}
-
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
-               if (cpuctx->unique_pmu == old_pmu)
-                       cpuctx->unique_pmu = pmu;
-       }
-}
-
-static void free_pmu_context(struct pmu *pmu)
-{
-       struct pmu *i;
-
-       mutex_lock(&pmus_lock);
-       /*
-        * Like a real lame refcount.
-        */
-       list_for_each_entry(i, &pmus, entry) {
-               if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
-                       update_pmu_context(i, pmu);
-                       goto out;
-               }
-       }
-
-       free_percpu(pmu->pmu_cpu_context);
-out:
-       mutex_unlock(&pmus_lock);
-}
-static struct idr pmu_idr;
-
-static ssize_t
-type_show(struct device *dev, struct device_attribute *attr, char *page)
-{
-       struct pmu *pmu = dev_get_drvdata(dev);
-
-       return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
-}
-static DEVICE_ATTR_RO(type);
-
-static ssize_t
-perf_event_mux_interval_ms_show(struct device *dev,
-                               struct device_attribute *attr,
-                               char *page)
-{
-       struct pmu *pmu = dev_get_drvdata(dev);
-
-       return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
-}
-
-static ssize_t
-perf_event_mux_interval_ms_store(struct device *dev,
-                                struct device_attribute *attr,
-                                const char *buf, size_t count)
-{
-       struct pmu *pmu = dev_get_drvdata(dev);
-       int timer, cpu, ret;
-
-       ret = kstrtoint(buf, 0, &timer);
-       if (ret)
-               return ret;
-
-       if (timer < 1)
-               return -EINVAL;
-
-       /* same value, noting to do */
-       if (timer == pmu->hrtimer_interval_ms)
-               return count;
-
-       pmu->hrtimer_interval_ms = timer;
-
-       /* update all cpuctx for this PMU */
-       for_each_possible_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
-
-               if (hrtimer_active(&cpuctx->hrtimer))
-                       hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
-       }
-
-       return count;
-}
-static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
-
-static struct attribute *pmu_dev_attrs[] = {
-       &dev_attr_type.attr,
-       &dev_attr_perf_event_mux_interval_ms.attr,
-       NULL,
-};
-ATTRIBUTE_GROUPS(pmu_dev);
-
-static int pmu_bus_running;
-static struct bus_type pmu_bus = {
-       .name           = "event_source",
-       .dev_groups     = pmu_dev_groups,
-};
-
-static void pmu_dev_release(struct device *dev)
-{
-       kfree(dev);
-}
-
-static int pmu_dev_alloc(struct pmu *pmu)
-{
-       int ret = -ENOMEM;
-
-       pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
-       if (!pmu->dev)
-               goto out;
-
-       pmu->dev->groups = pmu->attr_groups;
-       device_initialize(pmu->dev);
-       ret = dev_set_name(pmu->dev, "%s", pmu->name);
-       if (ret)
-               goto free_dev;
-
-       dev_set_drvdata(pmu->dev, pmu);
-       pmu->dev->bus = &pmu_bus;
-       pmu->dev->release = pmu_dev_release;
-       ret = device_add(pmu->dev);
-       if (ret)
-               goto free_dev;
-
-out:
-       return ret;
-
-free_dev:
-       put_device(pmu->dev);
-       goto out;
-}
-
-static struct lock_class_key cpuctx_mutex;
-static struct lock_class_key cpuctx_lock;
-
-int perf_pmu_register(struct pmu *pmu, const char *name, int type)
-{
-       int cpu, ret;
-
-       mutex_lock(&pmus_lock);
-       ret = -ENOMEM;
-       pmu->pmu_disable_count = alloc_percpu(int);
-       if (!pmu->pmu_disable_count)
-               goto unlock;
-
-       pmu->type = -1;
-       if (!name)
-               goto skip_type;
-       pmu->name = name;
-
-       if (type < 0) {
-               type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
-               if (type < 0) {
-                       ret = type;
-                       goto free_pdc;
-               }
-       }
-       pmu->type = type;
-
-       if (pmu_bus_running) {
-               ret = pmu_dev_alloc(pmu);
-               if (ret)
-                       goto free_idr;
-       }
+       if (pmu_bus_running) {
+               ret = pmu_dev_alloc(pmu);
+               if (ret)
+                       goto free_idr;
+       }
 
 skip_type:
        pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
@@ -8808,30 +7866,10 @@ int perf_event_init_task(struct task_str
 
 static void __init perf_event_init_all_cpus(void)
 {
-       struct swevent_htable *swhash;
        int cpu;
 
-       for_each_possible_cpu(cpu) {
-               swhash = &per_cpu(swevent_htable, cpu);
-               mutex_init(&swhash->hlist_mutex);
+       for_each_possible_cpu(cpu)
                INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
-       }
-}
-
-static void perf_event_init_cpu(int cpu)
-{
-       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
-       mutex_lock(&swhash->hlist_mutex);
-       swhash->online = true;
-       if (swhash->hlist_refcount > 0) {
-               struct swevent_hlist *hlist;
-
-               hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
-               WARN_ON(!hlist);
-               rcu_assign_pointer(swhash->swevent_hlist, hlist);
-       }
-       mutex_unlock(&swhash->hlist_mutex);
 }
 
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
@@ -8862,20 +7900,8 @@ static void perf_event_exit_cpu_context(
        }
        srcu_read_unlock(&pmus_srcu, idx);
 }
-
-static void perf_event_exit_cpu(int cpu)
-{
-       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
-       perf_event_exit_cpu_context(cpu);
-
-       mutex_lock(&swhash->hlist_mutex);
-       swhash->online = false;
-       swevent_hlist_release(swhash);
-       mutex_unlock(&swhash->hlist_mutex);
-}
 #else
-static inline void perf_event_exit_cpu(int cpu) { }
+static inline void perf_event_exit_cpu_context(int cpu) { }
 #endif
 
 static int
@@ -8884,7 +7910,7 @@ perf_reboot(struct notifier_block *notif
        int cpu;
 
        for_each_online_cpu(cpu)
-               perf_event_exit_cpu(cpu);
+               perf_event_exit_cpu_context(cpu);
 
        return NOTIFY_OK;
 }
@@ -8905,14 +7931,9 @@ perf_cpu_notify(struct notifier_block *s
 
        switch (action & ~CPU_TASKS_FROZEN) {
 
-       case CPU_UP_PREPARE:
-       case CPU_DOWN_FAILED:
-               perf_event_init_cpu(cpu);
-               break;
-
        case CPU_UP_CANCELED:
        case CPU_DOWN_PREPARE:
-               perf_event_exit_cpu(cpu);
+               perf_event_exit_cpu_context(cpu);
                break;
        default:
                break;
@@ -8929,10 +7950,7 @@ void __init perf_event_init(void)
 
        perf_event_init_all_cpus();
        init_srcu_struct(&pmus_srcu);
-       perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
-       perf_pmu_register(&perf_cpu_clock, NULL, -1);
-       perf_pmu_register(&perf_task_clock, NULL, -1);
-       perf_tp_register();
+       perf_swevent_register();
        perf_cpu_notifier(perf_cpu_notify);
        register_reboot_notifier(&perf_reboot_notifier);
 
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -228,4 +228,17 @@ static inline bool arch_perf_have_user_s
 #define perf_user_stack_pointer(regs) 0
 #endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
 
+#define MAX_INTERRUPTS (~0ULL)
+
+extern int __perf_event_overflow(struct perf_event *event,
+                         int throttle, struct perf_sample_data *data,
+                         struct pt_regs *regs);
+
+extern void perf_event_free_filter(struct perf_event *event);
+extern void perf_event_free_bpf_prog(struct perf_event *event);
+extern int perf_event_set_filter(struct perf_event *event, void __user *arg);
+extern int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+
+extern void perf_swevent_register(void);
+
 #endif /* _KERNEL_EVENTS_INTERNAL_H */
--- /dev/null
+++ b/kernel/events/software.c
@@ -0,0 +1,1021 @@
+
+#include <linux/perf_event.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/ftrace_event.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+#include "internal.h"
+
+/*
+ * Generic software event infrastructure
+ */
+
+struct swevent_htable {
+       struct swevent_hlist            *swevent_hlist;
+       struct mutex                    hlist_mutex;
+       int                             hlist_refcount;
+
+       /* Recursion avoidance in each contexts */
+       int                             recursion[PERF_NR_CONTEXTS];
+
+       /* Keeps track of cpu being initialized/exited */
+       bool                            online;
+};
+
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
+
+/*
+ * We directly increment event->count and keep a second value in
+ * event->hw.period_left to count intervals. This period event
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+u64 perf_swevent_set_period(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 period = hwc->last_period;
+       u64 nr, offset;
+       s64 old, val;
+
+       hwc->last_period = hwc->sample_period;
+
+again:
+       old = val = local64_read(&hwc->period_left);
+       if (val < 0)
+               return 0;
+
+       nr = div64_u64(period + val, period);
+       offset = nr * period;
+       val -= offset;
+       if (local64_cmpxchg(&hwc->period_left, old, val) != old)
+               goto again;
+
+       return nr;
+}
+
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
+                                   struct perf_sample_data *data,
+                                   struct pt_regs *regs)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int throttle = 0;
+
+       if (!overflow)
+               overflow = perf_swevent_set_period(event);
+
+       if (hwc->interrupts == MAX_INTERRUPTS)
+               return;
+
+       for (; overflow; overflow--) {
+               if (__perf_event_overflow(event, throttle,
+                                           data, regs)) {
+                       /*
+                        * We inhibit the overflow from happening when
+                        * hwc->interrupts == MAX_INTERRUPTS.
+                        */
+                       break;
+               }
+               throttle = 1;
+       }
+}
+
+static void perf_swevent_event(struct perf_event *event, u64 nr,
+                              struct perf_sample_data *data,
+                              struct pt_regs *regs)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       local64_add(nr, &event->count);
+
+       if (!regs)
+               return;
+
+       if (!is_sampling_event(event))
+               return;
+
+       if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+               data->period = nr;
+               return perf_swevent_overflow(event, 1, data, regs);
+       } else
+               data->period = event->hw.last_period;
+
+       if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+               return perf_swevent_overflow(event, 1, data, regs);
+
+       if (local64_add_negative(nr, &hwc->period_left))
+               return;
+
+       perf_swevent_overflow(event, 0, data, regs);
+}
+
+static int perf_exclude_event(struct perf_event *event,
+                             struct pt_regs *regs)
+{
+       if (event->hw.state & PERF_HES_STOPPED)
+               return 1;
+
+       if (regs) {
+               if (event->attr.exclude_user && user_mode(regs))
+                       return 1;
+
+               if (event->attr.exclude_kernel && !user_mode(regs))
+                       return 1;
+       }
+
+       return 0;
+}
+
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+       struct perf_sample_data sample;
+       struct pt_regs *regs = data;
+
+       perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
+
+       if (!bp->hw.state && !perf_exclude_event(bp, regs))
+               perf_swevent_event(bp, 1, &sample, regs);
+}
+#endif
+
+static int perf_swevent_match(struct perf_event *event,
+                               enum perf_type_id type,
+                               u32 event_id,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs)
+{
+       if (event->attr.type != type)
+               return 0;
+
+       if (event->attr.config != event_id)
+               return 0;
+
+       if (perf_exclude_event(event, regs))
+               return 0;
+
+       return 1;
+}
+
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+       u64 val = event_id | (type << 32);
+
+       return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
+{
+       u64 hash = swevent_hash(type, event_id);
+
+       return &hlist->heads[hash];
+}
+
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
+{
+       struct swevent_hlist *hlist;
+
+       hlist = rcu_dereference(swhash->swevent_hlist);
+       if (!hlist)
+               return NULL;
+
+       return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
+{
+       struct swevent_hlist *hlist;
+       u32 event_id = event->attr.config;
+       u64 type = event->attr.type;
+
+       /*
+        * Event scheduling is always serialized against hlist allocation
+        * and release. Which makes the protected version suitable here.
+        * The context lock guarantees that.
+        */
+       hlist = rcu_dereference_protected(swhash->swevent_hlist,
+                                         lockdep_is_held(&event->ctx->lock));
+       if (!hlist)
+               return NULL;
+
+       return __find_swevent_head(hlist, type, event_id);
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+                                   u64 nr,
+                                   struct perf_sample_data *data,
+                                   struct pt_regs *regs)
+{
+       struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+       struct perf_event *event;
+       struct hlist_head *head;
+
+       rcu_read_lock();
+       head = find_swevent_head_rcu(swhash, type, event_id);
+       if (!head)
+               goto end;
+
+       hlist_for_each_entry_rcu(event, head, hlist_entry) {
+               if (perf_swevent_match(event, type, event_id, data, regs))
+                       perf_swevent_event(event, nr, data, regs);
+       }
+end:
+       rcu_read_unlock();
+}
+
+DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
+
+int perf_swevent_get_recursion_context(void)
+{
+       struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+
+       return get_recursion_context(swhash->recursion);
+}
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+
+inline void perf_swevent_put_recursion_context(int rctx)
+{
+       struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+
+       put_recursion_context(swhash->recursion, rctx);
+}
+
+void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+       struct perf_sample_data data;
+
+       if (WARN_ON_ONCE(!regs))
+               return;
+
+       perf_sample_data_init(&data, addr, 0);
+       do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+       int rctx;
+
+       preempt_disable_notrace();
+       rctx = perf_swevent_get_recursion_context();
+       if (unlikely(rctx < 0))
+               goto fail;
+
+       ___perf_sw_event(event_id, nr, regs, addr);
+
+       perf_swevent_put_recursion_context(rctx);
+fail:
+       preempt_enable_notrace();
+}
+
+static void perf_swevent_read(struct perf_event *event)
+{
+}
+
+static int perf_swevent_add(struct perf_event *event, int flags)
+{
+       struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+       struct hw_perf_event *hwc = &event->hw;
+       struct hlist_head *head;
+
+       if (is_sampling_event(event)) {
+               hwc->last_period = hwc->sample_period;
+               perf_swevent_set_period(event);
+       }
+
+       hwc->state = !(flags & PERF_EF_START);
+
+       head = find_swevent_head(swhash, event);
+       if (!head) {
+               /*
+                * We can race with cpu hotplug code. Do not
+                * WARN if the cpu just got unplugged.
+                */
+               WARN_ON_ONCE(swhash->online);
+               return -EINVAL;
+       }
+
+       hlist_add_head_rcu(&event->hlist_entry, head);
+       perf_event_update_userpage(event);
+
+       return 0;
+}
+
+static void perf_swevent_del(struct perf_event *event, int flags)
+{
+       hlist_del_rcu(&event->hlist_entry);
+}
+
+static void perf_swevent_start(struct perf_event *event, int flags)
+{
+       event->hw.state = 0;
+}
+
+static void perf_swevent_stop(struct perf_event *event, int flags)
+{
+       event->hw.state = PERF_HES_STOPPED;
+}
+
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct swevent_htable *swhash)
+{
+       return rcu_dereference_protected(swhash->swevent_hlist,
+                                        lockdep_is_held(&swhash->hlist_mutex));
+}
+
+static void swevent_hlist_release(struct swevent_htable *swhash)
+{
+       struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
+
+       if (!hlist)
+               return;
+
+       RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
+       kfree_rcu(hlist, rcu_head);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+       mutex_lock(&swhash->hlist_mutex);
+
+       if (!--swhash->hlist_refcount)
+               swevent_hlist_release(swhash);
+
+       mutex_unlock(&swhash->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+       int err = 0;
+
+       mutex_lock(&swhash->hlist_mutex);
+
+       if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+               struct swevent_hlist *hlist;
+
+               hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+               if (!hlist) {
+                       err = -ENOMEM;
+                       goto exit;
+               }
+               rcu_assign_pointer(swhash->swevent_hlist, hlist);
+       }
+       swhash->hlist_refcount++;
+exit:
+       mutex_unlock(&swhash->hlist_mutex);
+
+       return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+       int err;
+       int cpu, failed_cpu;
+
+       get_online_cpus();
+       for_each_possible_cpu(cpu) {
+               err = swevent_hlist_get_cpu(event, cpu);
+               if (err) {
+                       failed_cpu = cpu;
+                       goto fail;
+               }
+       }
+       put_online_cpus();
+
+       return 0;
+fail:
+       for_each_possible_cpu(cpu) {
+               if (cpu == failed_cpu)
+                       break;
+               swevent_hlist_put_cpu(event, cpu);
+       }
+
+       put_online_cpus();
+       return err;
+}
+
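+/*
+ * One static key per software event id; the __perf_sw_event() call in
+ * the event hooks stays patched out until at least one such event exists.
+ */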
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+       u64 event_id = event->attr.config;
+
+       WARN_ON(event->parent);
+
+       static_key_slow_dec(&perf_swevent_enabled[event_id]);
+       swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+       u64 event_id = event->attr.config;
+
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       /*
+        * no branch sampling for software events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
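+       /*
+        * The clock events are served by the hrtimer based perf_cpu_clock
+        * and perf_task_clock pmus below; return -ENOENT so the pmu list
+        * walk in perf_init_event() moves on and finds them.
+        */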
+       switch (event_id) {
+       case PERF_COUNT_SW_CPU_CLOCK:
+       case PERF_COUNT_SW_TASK_CLOCK:
+               return -ENOENT;
+
+       default:
+               break;
+       }
+
+       if (event_id >= PERF_COUNT_SW_MAX)
+               return -ENOENT;
+
+       if (!event->parent) {
+               int err;
+
+               err = swevent_hlist_get(event);
+               if (err)
+                       return err;
+
+               static_key_slow_inc(&perf_swevent_enabled[event_id]);
+               event->destroy = sw_perf_event_destroy;
+       }
+
+       return 0;
+}
+
+static struct pmu perf_swevent = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .capabilities   = PERF_PMU_CAP_NO_NMI,
+
+       .event_init     = perf_swevent_init,
+       .add            = perf_swevent_add,
+       .del            = perf_swevent_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
+       .read           = perf_swevent_read,
+};
+
+#ifdef CONFIG_EVENT_TRACING
+
+static int perf_tp_filter_match(struct perf_event *event,
+                               struct perf_sample_data *data)
+{
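+       /* data->raw is always set by perf_tp_event() before we are called. */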
+       void *record = data->raw->data;
+
+       if (likely(!event->filter) || filter_match_preds(event->filter, record))
+               return 1;
+       return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs)
+{
+       if (event->hw.state & PERF_HES_STOPPED)
+               return 0;
+       /*
+        * All tracepoints are from kernel-space.
+        */
+       if (event->attr.exclude_kernel)
+               return 0;
+
+       if (!perf_tp_filter_match(event, data))
+               return 0;
+
+       return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+                  struct pt_regs *regs, struct hlist_head *head, int rctx,
+                  struct task_struct *task)
+{
+       struct perf_sample_data data;
+       struct perf_event *event;
+
+       struct perf_raw_record raw = {
+               .size = entry_size,
+               .data = record,
+       };
+
+       perf_sample_data_init(&data, addr, 0);
+       data.raw = &raw;
+
+       hlist_for_each_entry_rcu(event, head, hlist_entry) {
+               if (perf_tp_event_match(event, &data, regs))
+                       perf_swevent_event(event, count, &data, regs);
+       }
+
+       /*
+        * If we got specified a target task, also iterate its context and
+        * deliver this event there too.
+        */
+       if (task && task != current) {
+               struct perf_event_context *ctx;
+               struct trace_entry *entry = record;
+
+               rcu_read_lock();
+               ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+               if (!ctx)
+                       goto unlock;
+
+               list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+                       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+                               continue;
+                       if (event->attr.config != entry->type)
+                               continue;
+                       if (perf_tp_event_match(event, &data, regs))
+                               perf_swevent_event(event, count, &data, regs);
+               }
+unlock:
+               rcu_read_unlock();
+       }
+
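+       /* Pairs with the perf_swevent_get_recursion_context() in our caller. */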
+       perf_swevent_put_recursion_context(rctx);
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+       perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+       int err;
+
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -ENOENT;
+
+       /*
+        * no branch sampling for tracepoint events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
+       err = perf_trace_init(event);
+       if (err)
+               return err;
+
+       event->destroy = tp_perf_event_destroy;
+
+       return 0;
+}
+
+static struct pmu perf_tracepoint = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .event_init     = perf_tp_event_init,
+       .add            = perf_trace_add,
+       .del            = perf_trace_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
+       .read           = perf_swevent_read,
+};
+
+static inline void perf_tp_register(void)
+{
+       perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+}
+
+int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+       char *filter_str;
+       int ret;
+
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -EINVAL;
+
+       filter_str = strndup_user(arg, PAGE_SIZE);
+       if (IS_ERR(filter_str))
+               return PTR_ERR(filter_str);
+
+       ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+       kfree(filter_str);
+       return ret;
+}
+
+void perf_event_free_filter(struct perf_event *event)
+{
+       ftrace_profile_free_filter(event);
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+       struct bpf_prog *prog;
+
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -EINVAL;
+
+       if (event->tp_event->prog)
+               return -EEXIST;
+
+       if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+               /* bpf programs can only be attached to kprobes */
+               return -EINVAL;
+
+       prog = bpf_prog_get(prog_fd);
+       if (IS_ERR(prog))
+               return PTR_ERR(prog);
+
+       if (prog->type != BPF_PROG_TYPE_KPROBE) {
+               /* valid fd, but invalid bpf program type */
+               bpf_prog_put(prog);
+               return -EINVAL;
+       }
+
+       event->tp_event->prog = prog;
+
+       return 0;
+}
+
+void perf_event_free_bpf_prog(struct perf_event *event)
+{
+       struct bpf_prog *prog;
+
+       if (!event->tp_event)
+               return;
+
+       prog = event->tp_event->prog;
+       if (prog) {
+               event->tp_event->prog = NULL;
+               bpf_prog_put(prog);
+       }
+}
+
+#else
+
+static inline void perf_tp_register(void)
+{
+}
+
+int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+       return -ENOENT;
+}
+
+void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+       return -ENOENT;
+}
+
+void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
+#endif /* CONFIG_EVENT_TRACING */
+
+/*
+ * hrtimer based swevent callback
+ */
+
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+{
+       enum hrtimer_restart ret = HRTIMER_RESTART;
+       struct perf_sample_data data;
+       struct pt_regs *regs;
+       struct perf_event *event;
+       u64 period;
+
+       event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
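+       /* Don't rearm the timer for an event that is no longer active. */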
+       if (event->state != PERF_EVENT_STATE_ACTIVE)
+               return HRTIMER_NORESTART;
+
+       event->pmu->read(event);
+
+       perf_sample_data_init(&data, 0, event->hw.last_period);
+       regs = get_irq_regs();
+
+       if (regs && !perf_exclude_event(event, regs)) {
+               if (!(event->attr.exclude_idle && is_idle_task(current)))
+                       if (__perf_event_overflow(event, 1, &data, regs))
+                               ret = HRTIMER_NORESTART;
+       }
+
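+       /* Clamp the period to 10us so a tiny sample_period can't storm the cpu. */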
+       period = max_t(u64, 10000, event->hw.sample_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+       return ret;
+}
+
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       s64 period;
+
+       if (!is_sampling_event(event))
+               return;
+
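+       /*
+        * Resume with whatever was left of the period when the event was
+        * last stopped; a negative leftover means we overshot, restart at
+        * the 10us minimum.
+        */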
+       period = local64_read(&hwc->period_left);
+       if (period) {
+               if (period < 0)
+                       period = 10000;
+
+               local64_set(&hwc->period_left, 0);
+       } else {
+               period = max_t(u64, 10000, hwc->sample_period);
+       }
+       __hrtimer_start_range_ns(&hwc->hrtimer,
+                               ns_to_ktime(period), 0,
+                               HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (is_sampling_event(event)) {
+               ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+               local64_set(&hwc->period_left, ktime_to_ns(remaining));
+
+               hrtimer_cancel(&hwc->hrtimer);
+       }
+}
+
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (!is_sampling_event(event))
+               return;
+
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swevent_hrtimer;
+
+       /*
+        * Since hrtimers have a fixed rate, we can do a static freq->period
+        * mapping and avoid the whole period adjust feedback stuff.
+        */
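+       /* E.g. sample_freq = 4000 yields a fixed period of 250000ns. */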
+       if (event->attr.freq) {
+               long freq = event->attr.sample_freq;
+
+               event->attr.sample_period = NSEC_PER_SEC / freq;
+               hwc->sample_period = event->attr.sample_period;
+               local64_set(&hwc->period_left, hwc->sample_period);
+               hwc->last_period = hwc->sample_period;
+               event->attr.freq = 0;
+       }
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
+{
+       s64 prev;
+       u64 now;
+
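+       /* Fold the local_clock() time elapsed since the last update into the count. */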
+       now = local_clock();
+       prev = local64_xchg(&event->hw.prev_count, now);
+       local64_add(now - prev, &event->count);
+}
+
+static void cpu_clock_event_start(struct perf_event *event, int flags)
+{
+       local64_set(&event->hw.prev_count, local_clock());
+       perf_swevent_start_hrtimer(event);
+}
+
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+       perf_swevent_cancel_hrtimer(event);
+       cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               cpu_clock_event_start(event, flags);
+       perf_event_update_userpage(event);
+
+       return 0;
+}
+
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+       cpu_clock_event_stop(event, flags);
+}
+
+static void cpu_clock_event_read(struct perf_event *event)
+{
+       cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_init(struct perf_event *event)
+{
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+               return -ENOENT;
+
+       /*
+        * no branch sampling for software events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
+       perf_swevent_init_hrtimer(event);
+
+       return 0;
+}
+
+static struct pmu perf_cpu_clock = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .capabilities   = PERF_PMU_CAP_NO_NMI,
+
+       .event_init     = cpu_clock_event_init,
+       .add            = cpu_clock_event_add,
+       .del            = cpu_clock_event_del,
+       .start          = cpu_clock_event_start,
+       .stop           = cpu_clock_event_stop,
+       .read           = cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+       u64 prev;
+       s64 delta;
+
+       prev = local64_xchg(&event->hw.prev_count, now);
+       delta = now - prev;
+       local64_add(delta, &event->count);
+}
+
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+       local64_set(&event->hw.prev_count, event->ctx->time);
+       perf_swevent_start_hrtimer(event);
+}
+
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+       perf_swevent_cancel_hrtimer(event);
+       task_clock_event_update(event, event->ctx->time);
+}
+
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               task_clock_event_start(event, flags);
+       perf_event_update_userpage(event);
+
+       return 0;
+}
+
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+       task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+       u64 now = local_clock(); /* XXX */
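+       /* ctx->timestamp is perf_clock() based, i.e. local_clock() too. */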
+       u64 delta = now - event->ctx->timestamp;
+       u64 time = event->ctx->time + delta;
+
+       task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+               return -ENOENT;
+
+       /*
+        * no branch sampling for software events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
+       perf_swevent_init_hrtimer(event);
+
+       return 0;
+}
+
+static struct pmu perf_task_clock = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .capabilities   = PERF_PMU_CAP_NO_NMI,
+
+       .event_init     = task_clock_event_init,
+       .add            = task_clock_event_add,
+       .del            = task_clock_event_del,
+       .start          = task_clock_event_start,
+       .stop           = task_clock_event_stop,
+       .read           = task_clock_event_read,
+};
+
+static void __init perf_swevent_init_all_cpus(void)
+{
+       struct swevent_htable *swhash;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               swhash = &per_cpu(swevent_htable, cpu);
+               mutex_init(&swhash->hlist_mutex);
+       }
+}
+
+static void perf_swevent_init_cpu(int cpu)
+{
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+       mutex_lock(&swhash->hlist_mutex);
+       swhash->online = true;
+       if (swhash->hlist_refcount > 0) {
+               struct swevent_hlist *hlist;
+
+               hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
+               WARN_ON(!hlist);
+               rcu_assign_pointer(swhash->swevent_hlist, hlist);
+       }
+       mutex_unlock(&swhash->hlist_mutex);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void perf_swevent_exit_cpu(int cpu)
+{
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+       mutex_lock(&swhash->hlist_mutex);
+       swhash->online = false;
+       swevent_hlist_release(swhash);
+       mutex_unlock(&swhash->hlist_mutex);
+}
+#else
+static inline void perf_swevent_exit_cpu(int cpu) { }
+#endif
+
+static int
+perf_swevent_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+
+       case CPU_UP_PREPARE:
+       case CPU_DOWN_FAILED:
+               perf_swevent_init_cpu(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DOWN_PREPARE:
+               perf_swevent_exit_cpu(cpu);
+               break;
+
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+__init void perf_swevent_register(void)
+{
+       perf_swevent_init_all_cpus();
+
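+       /*
+        * None of these pmus is ever perf_pmu_unregister()ed; the core (and
+        * the task context shared via perf_sw_context) relies on them being
+        * around for the lifetime of the kernel. The clock pmus register
+        * without a name or type and are only reached through the event_init()
+        * fallback iteration in perf_init_event().
+        */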
+       perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+       perf_pmu_register(&perf_cpu_clock, NULL, -1);
+       perf_pmu_register(&perf_task_clock, NULL, -1);
+       perf_tp_register();
+
+       perf_cpu_notifier(perf_swevent_notify);
+}
+
