On Wed, Jan 20, 2016 at 10:22:44AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 20, 2016 at 12:48:24PM +0800, Huang Rui wrote:
> > Hi Peter,
> > 
> > Thanks so much for your comments.
> > 
> > On Tue, Jan 19, 2016 at 01:12:50PM +0100, Peter Zijlstra wrote:
> > > On Thu, Jan 14, 2016 at 10:50:08AM +0800, Huang Rui wrote:
> > > > +struct power_pmu {
> > > > +        spinlock_t                lock;
> > > 
> > > This should be a raw_spinlock_t, as it'll be nested under other
> > > raw_spinlock_t's.
> > > 
> > 
> > Do you mean the following spinlock operations run with hardware
> > interrupts disabled, so I need to use raw_spinlock_t instead, right?
> 
>                         mainline        -rt
> 
> raw_spinlock_t          spin-waits      spin-waits
> spinlock_t              spin-waits      blocks (rt-mutex)
> struct mutex            blocks          blocks (rt-mutex)
> 
> 
> since these functions are themselves called with raw_spinlock_t held
> (perf_event_context::lock for example, but also rq::lock), any lock
> nested inside them must also be raw_spinlock_t.
> 
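To restate that constraint in code as I understand it (only a rough sketch
of the call path, not actual perf core code):

        /* perf core invokes the PMU callbacks with IRQs off and ctx->lock held: */
        raw_spin_lock(&ctx->lock);              /* perf_event_context::lock is a raw_spinlock_t */
        event->pmu->add(event, PERF_EF_START);  /* ends up in pmu_event_add() below */
        raw_spin_unlock(&ctx->lock);

        /*
         * On -rt a plain spinlock_t becomes a sleeping rt-mutex, so taking
         * pmu->lock inside pmu_event_add() would mean sleeping while a
         * raw_spinlock_t is held; hence pmu->lock itself must be raw.
         */
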
I see, thank you. :-) I just took a quick look at how spinlocks behave in
-rt mode. The realtime kernel provides two kinds of spinlock: the original
spinlock_t is replaced by one that is able to sleep, effectively a mutex,
while the other one (the raw_spinlock_t you mentioned here) keeps the
non-sleeping behavior, i.e. it is the real spinlock. And my lock here will
also be nested under perf_event_context::lock, right?

> I have a lockdep patch somewhere that checks these ordering things; I
> should rebase and post that again.
> 

Can you CC me when you post that patch next time?

> > Use raw_spin_lock_irqsave/raw_spin_unlock_irqrestore?
> 
> pmu::{start,stop,add,del} will be called with IRQs already disabled.
> 
> > > > +static int power_cpu_init(int cpu)
> > > > +{
> > > > +        int i, cu, ret = 0;
> > > > +        cpumask_var_t mask, dummy_mask;
> > > > +
> > > > +        cu = cpu / cores_per_cu;
> > > > +
> > > > +        if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> > > > +                return -ENOMEM;
> > > > +
> > > > +        if (!zalloc_cpumask_var(&dummy_mask, GFP_KERNEL)) {
> > > > +                ret = -ENOMEM;
> > > > +                goto out;
> > > > +        }
> > > > +
> > > > +        for (i = 0; i < cores_per_cu; i++)
> > > > +                cpumask_set_cpu(i, mask);
> > > > +
> > > > +        cpumask_shift_left(mask, mask, cu * cores_per_cu);
> > > > +
> > > > +        if (!cpumask_and(dummy_mask, mask, &cpu_mask))
> > > > +                cpumask_set_cpu(cpu, &cpu_mask);
> > > > +
> > > > +        free_cpumask_var(dummy_mask);
> > > > +out:
> > > > +        free_cpumask_var(mask);
> > > > +
> > > > +        return ret;
> > > > +}
> 
> > > > +static int power_cpu_notifier(struct notifier_block *self,
> > > > +                              unsigned long action, void *hcpu)
> > > > +{
> > > > +        unsigned int cpu = (long)hcpu;
> > > > +
> > > > +        switch (action & ~CPU_TASKS_FROZEN) {
> > > > +        case CPU_UP_PREPARE:
> > > > +                if (power_cpu_prepare(cpu))
> > > > +                        return NOTIFY_BAD;
> > > > +                break;
> > > > +        case CPU_STARTING:
> > > > +                if (power_cpu_init(cpu))
> > > > +                        return NOTIFY_BAD;
> > > 
> > > this is called with IRQs disabled, which makes those GFP_KERNEL allocs
> > > above a pretty bad idea.
> > > 
> > Right, so should I use GFP_ATOMIC to allocate cpumask here?
> 
> One should not use GFP_ATOMIC if at all possible, also no, -rt cannot do
> _any_ allocations from this site.
> 

OK, that's because an allocation might sleep while IRQs are disabled, which
would be incorrect.

> > > Also, note that -rt cannot actually do _any_ allocations/frees from
> > > STARTING.
> > > 
> > > Please move the allocs/frees to PREPARE/ONLINE.
> > > 
> > 
> > How about adding two cpumask_var_t members to the power_pmu structure?
> > Then allocate the two cpumask_var_t (pmu->mask, pmu->dummy_mask) there,
> > and they can also be used in power_cpu_init.
> 
> That would work.

I drafted an updated diff based on the original patch, please take a look.

8<--------------------------------------------------------------------------
diff --git a/arch/x86/kernel/cpu/perf_event_amd_power.c b/arch/x86/kernel/cpu/perf_event_amd_power.c
index 69ef234..e71d993 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_power.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_power.c
@@ -46,10 +46,17 @@ static unsigned int cu_num;
 static u64 max_cu_acc_power;
 
 struct power_pmu {
-        spinlock_t                lock;
+        raw_spinlock_t                lock;
         struct list_head        active_list;
         struct pmu                *pmu; /* pointer to power_pmu_class */
         local64_t                cpu_sw_pwr_ptsc;
+        /*
+         * These two cpumasks are used to avoid allocations during the
+         * CPU_STARTING phase, because power_cpu_init() is called with
+         * IRQs disabled.
+         */
+        cpumask_var_t                mask;
+        cpumask_var_t                tmp_mask;
 };
 
 static struct pmu pmu_class;
@@ -126,9 +133,9 @@ static void pmu_event_start(struct perf_event *event, int mode)
         struct power_pmu *pmu = __this_cpu_read(amd_power_pmu);
         unsigned long flags;
 
-        spin_lock_irqsave(&pmu->lock, flags);
+        raw_spin_lock_irqsave(&pmu->lock, flags);
         __pmu_event_start(pmu, event);
-        spin_unlock_irqrestore(&pmu->lock, flags);
+        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 }
 
 static void pmu_event_stop(struct perf_event *event, int mode)
@@ -137,7 +144,7 @@ static void pmu_event_stop(struct perf_event *event, int mode)
         struct hw_perf_event *hwc = &event->hw;
         unsigned long flags;
 
-        spin_lock_irqsave(&pmu->lock, flags);
+        raw_spin_lock_irqsave(&pmu->lock, flags);
 
         /* mark event as deactivated and stopped */
         if (!(hwc->state & PERF_HES_STOPPED)) {
@@ -155,7 +162,7 @@ static void pmu_event_stop(struct perf_event *event, int mode)
                 hwc->state |= PERF_HES_UPTODATE;
         }
 
-        spin_unlock_irqrestore(&pmu->lock, flags);
+        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 }
 
 static int pmu_event_add(struct perf_event *event, int mode)
@@ -164,14 +171,14 @@ static int pmu_event_add(struct perf_event *event, int mode)
         struct hw_perf_event *hwc = &event->hw;
         unsigned long flags;
 
-        spin_lock_irqsave(&pmu->lock, flags);
+        raw_spin_lock_irqsave(&pmu->lock, flags);
 
         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 
         if (mode & PERF_EF_START)
                 __pmu_event_start(pmu, event);
 
-        spin_unlock_irqrestore(&pmu->lock, flags);
+        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 
         return 0;
 }
 
@@ -297,89 +304,71 @@ static int power_cpu_exit(int cpu)
         struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
         int i, cu, ret = 0;
         int target = nr_cpumask_bits;
-        cpumask_var_t mask, tmp_mask;
 
         cu = cpu / cores_per_cu;
 
-        if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
-                return -ENOMEM;
-
-        if (!zalloc_cpumask_var(&tmp_mask, GFP_KERNEL)) {
-                ret = -ENOMEM;
-                goto out;
-        }
+        cpumask_clear(pmu->mask);
+        cpumask_clear(pmu->tmp_mask);
 
         for (i = 0; i < cores_per_cu; i++)
-                cpumask_set_cpu(i, mask);
+                cpumask_set_cpu(i, pmu->mask);
 
-        cpumask_shift_left(mask, mask, cu * cores_per_cu);
+        cpumask_shift_left(pmu->mask, pmu->mask, cu * cores_per_cu);
 
         cpumask_clear_cpu(cpu, &cpu_mask);
-        cpumask_clear_cpu(cpu, mask);
+        cpumask_clear_cpu(cpu, pmu->mask);
 
-        if (!cpumask_and(tmp_mask, mask, cpu_online_mask))
-                goto out1;
+        if (!cpumask_and(pmu->tmp_mask, pmu->mask, cpu_online_mask))
+                goto out;
 
         /*
          * find a new CPU on same compute unit, if was set in cpumask
          * and still some CPUs on compute unit, then move to the new
          * CPU
          */
-        target = cpumask_any(tmp_mask);
+        target = cpumask_any(pmu->tmp_mask);
         if (target < nr_cpumask_bits && target != cpu)
                 cpumask_set_cpu(target, &cpu_mask);
 
         WARN_ON(cpumask_empty(&cpu_mask));
 
-out1:
+out:
         /*
          * migrate events and context to new CPU
          */
         if (target < nr_cpumask_bits)
                 perf_pmu_migrate_context(pmu->pmu, cpu, target);
 
-        free_cpumask_var(tmp_mask);
-out:
-        free_cpumask_var(mask);
-
         return ret;
 }
 
 static int power_cpu_init(int cpu)
 {
-        int i, cu, ret = 0;
-        cpumask_var_t mask, dummy_mask;
-
-        cu = cpu / cores_per_cu;
+        struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
+        int i, cu;
 
-        if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
-                return -ENOMEM;
+        if (!pmu)
+                return 0;
 
-        if (!zalloc_cpumask_var(&dummy_mask, GFP_KERNEL)) {
-                ret = -ENOMEM;
-                goto out;
-        }
+        cu = cpu / cores_per_cu;
 
         for (i = 0; i < cores_per_cu; i++)
-                cpumask_set_cpu(i, mask);
+                cpumask_set_cpu(i, pmu->mask);
 
-        cpumask_shift_left(mask, mask, cu * cores_per_cu);
+        cpumask_shift_left(pmu->mask, pmu->mask, cu * cores_per_cu);
 
-        if (!cpumask_and(dummy_mask, mask, &cpu_mask))
+        if (!cpumask_and(pmu->tmp_mask, pmu->mask, &cpu_mask))
                 cpumask_set_cpu(cpu, &cpu_mask);
 
-        free_cpumask_var(dummy_mask);
-out:
-        free_cpumask_var(mask);
-
-        return ret;
+        return 0;
 }
 
 static int power_cpu_prepare(int cpu)
 {
         struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
         int phys_id = topology_physical_package_id(cpu);
+        int ret = 0;
 
         if (pmu)
                 return 0;
@@ -391,7 +380,17 @@ static int power_cpu_prepare(int cpu)
         if (!pmu)
                 return -ENOMEM;
 
-        spin_lock_init(&pmu->lock);
+        if (!zalloc_cpumask_var(&pmu->mask, GFP_KERNEL)) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        if (!zalloc_cpumask_var(&pmu->tmp_mask, GFP_KERNEL)) {
+                ret = -ENOMEM;
+                goto out1;
+        }
+
+        raw_spin_lock_init(&pmu->lock);
 
         INIT_LIST_HEAD(&pmu->active_list);
 
@@ -400,12 +399,21 @@ static int power_cpu_prepare(int cpu)
 
         per_cpu(amd_power_pmu, cpu) = pmu;
 
         return 0;
+
+out1:
+        free_cpumask_var(pmu->mask);
+out:
+        kfree(pmu);
+
+        return ret;
 }
 
 static void power_cpu_kfree(int cpu)
 {
         struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
 
+        free_cpumask_var(pmu->mask);
+        free_cpumask_var(pmu->tmp_mask);
         kfree(pmu);
 }
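
With this split, the two callbacks line up with the hotplug notifier from
the original patch roughly as below. This is only an abridged sketch to show
the intent (the other hotplug cases are omitted, and the context notes in
the comments are my understanding, not something the patch states):

static int power_cpu_notifier(struct notifier_block *self,
                              unsigned long action, void *hcpu)
{
        unsigned int cpu = (long)hcpu;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
                /* process context, may sleep: GFP_KERNEL allocations are fine here */
                if (power_cpu_prepare(cpu))
                        return NOTIFY_BAD;
                break;
        case CPU_STARTING:
                /* IRQs disabled on the incoming CPU: no allocations, only the preallocated masks */
                if (power_cpu_init(cpu))
                        return NOTIFY_BAD;
                break;
        /* remaining hotplug cases omitted from this sketch */
        }

        return NOTIFY_OK;
}

So the allocations happen in CPU_UP_PREPARE, where sleeping is allowed, and
CPU_STARTING only manipulates the preallocated pmu->mask and pmu->tmp_mask.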