On Wed, Jan 20, 2016 at 10:22:44AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 20, 2016 at 12:48:24PM +0800, Huang Rui wrote:
> > Hi Peter,
> > 
> > Thanks so much for your comments.
> > 
> > On Tue, Jan 19, 2016 at 01:12:50PM +0100, Peter Zijlstra wrote:
> > > On Thu, Jan 14, 2016 at 10:50:08AM +0800, Huang Rui wrote:
> > > > +struct power_pmu {
> > > > +        spinlock_t                lock;
> > > 
> > > This should be a raw_spinlock_t, as it'll be nested under other
> > > raw_spinlock_t's.
> > > 
> > 
> > Do you mean the following spinlock operations run with hardware
> > interrupts disabled, so I need to use raw_spinlock_t instead, right?
> 
>                         mainline        -rt
> 
> raw_spinlock_t          spin-waits      spin-waits
> spinlock_t              spin-waits      blocks (rt-mutex)
> struct mutex            blocks          blocks (rt-mutex)
> 
> 
> since these functions are themselves called with raw_spinlock_t held
> (perf_event_context::lock for example, but also rq::lock), any lock
> nested inside them must also be raw_spinlock_t.
> 
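To restate that constraint in code as I understand it (only a rough sketch
of the call path, not actual perf core code):

        /* perf core invokes the PMU callbacks with IRQs off and ctx->lock held: */
        raw_spin_lock(&ctx->lock);              /* perf_event_context::lock is a raw_spinlock_t */
        event->pmu->add(event, PERF_EF_START);  /* ends up in pmu_event_add() below */
        raw_spin_unlock(&ctx->lock);

        /*
         * On -rt a plain spinlock_t becomes a sleeping rt-mutex, so taking
         * pmu->lock inside pmu_event_add() would mean sleeping while a
         * raw_spinlock_t is held; hence pmu->lock itself must be raw.
         */
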
I see, thank you. :-) I just took a quick look at how spinlocks behave in
-rt mode. The realtime kernel provides two kinds of spinlock: the original
spinlock_t is replaced by one that is able to sleep, effectively a mutex,
while the other one (the raw_spinlock_t you mentioned here) keeps the
non-sleeping behavior, i.e. it is the real spinlock. And my lock here will
also be nested under perf_event_context::lock, right?

> I have a lockdep patch somewhere that checks these ordering things; I
> should rebase and post that again.
> 

Can you CC me when you post that patch next time?

> > Use raw_spin_lock_irqsave/raw_spin_unlock_irqrestore?
> 
> pmu::{start,stop,add,del} will be called with IRQs already disabled.
> 
> > > > +static int power_cpu_init(int cpu)
> > > > +{
> > > > +        int i, cu, ret = 0;
> > > > +        cpumask_var_t mask, dummy_mask;
> > > > +
> > > > +        cu = cpu / cores_per_cu;
> > > > +
> > > > +        if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> > > > +                return -ENOMEM;
> > > > +
> > > > +        if (!zalloc_cpumask_var(&dummy_mask, GFP_KERNEL)) {
> > > > +                ret = -ENOMEM;
> > > > +                goto out;
> > > > +        }
> > > > +
> > > > +        for (i = 0; i < cores_per_cu; i++)
> > > > +                cpumask_set_cpu(i, mask);
> > > > +
> > > > +        cpumask_shift_left(mask, mask, cu * cores_per_cu);
> > > > +
> > > > +        if (!cpumask_and(dummy_mask, mask, &cpu_mask))
> > > > +                cpumask_set_cpu(cpu, &cpu_mask);
> > > > +
> > > > +        free_cpumask_var(dummy_mask);
> > > > +out:
> > > > +        free_cpumask_var(mask);
> > > > +
> > > > +        return ret;
> > > > +}
> 
> > > > +static int power_cpu_notifier(struct notifier_block *self,
> > > > +                              unsigned long action, void *hcpu)
> > > > +{
> > > > +        unsigned int cpu = (long)hcpu;
> > > > +
> > > > +        switch (action & ~CPU_TASKS_FROZEN) {
> > > > +        case CPU_UP_PREPARE:
> > > > +                if (power_cpu_prepare(cpu))
> > > > +                        return NOTIFY_BAD;
> > > > +                break;
> > > > +        case CPU_STARTING:
> > > > +                if (power_cpu_init(cpu))
> > > > +                        return NOTIFY_BAD;
> > > 
> > > this is called with IRQs disabled, which makes those GFP_KERNEL allocs
> > > above a pretty bad idea.
> > > 
> > Right, so should I use GFP_ATOMIC to allocate cpumask here?
> 
> One should not use GFP_ATOMIC if at all possible, also no, -rt cannot do
> _any_ allocations from this site.
> 

OK, that's because an allocation might sleep while IRQs are disabled, which
would be incorrect.

> > > Also, note that -rt cannot actually do _any_ allocations/frees from
> > > STARTING.
> > > 
> > > Please move the allocs/frees to PREPARE/ONLINE.
> > > 
> > 
> > How about adding two cpumask_var_t members to the power_pmu structure?
> > Then allocate the two cpumask_var_t (pmu->mask, pmu->dummy_mask) there,
> > and they can also be used in power_cpu_init.
> 
> That would work.

I drafted an updated diff based on the original patch, please take a look.

8<--------------------------------------------------------------------------
diff --git a/arch/x86/kernel/cpu/perf_event_amd_power.c b/arch/x86/kernel/cpu/perf_event_amd_power.c
index 69ef234..e71d993 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_power.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_power.c
@@ -46,10 +46,17 @@ static unsigned int cu_num;
 static u64 max_cu_acc_power;
 
 struct power_pmu {
-        spinlock_t                lock;
+        raw_spinlock_t                lock;
         struct list_head        active_list;
         struct pmu                *pmu; /* pointer to power_pmu_class */
         local64_t                cpu_sw_pwr_ptsc;
+        /*
+         * These two cpumasks are used to avoid allocations during the
+         * CPU_STARTING phase, because power_cpu_init() is called with
+         * IRQs disabled.
+         */
+        cpumask_var_t                mask;
+        cpumask_var_t                tmp_mask;
 };
 
 static struct pmu pmu_class;
@@ -126,9 +133,9 @@ static void pmu_event_start(struct perf_event *event, int mode)
         struct power_pmu *pmu = __this_cpu_read(amd_power_pmu);
         unsigned long flags;
 
-        spin_lock_irqsave(&pmu->lock, flags);
+        raw_spin_lock_irqsave(&pmu->lock, flags);
         __pmu_event_start(pmu, event);
-        spin_unlock_irqrestore(&pmu->lock, flags);
+        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 }
 
 static void pmu_event_stop(struct perf_event *event, int mode)
@@ -137,7 +144,7 @@ static void pmu_event_stop(struct perf_event *event, int mode)
         struct hw_perf_event *hwc = &event->hw;
         unsigned long flags;
 
-        spin_lock_irqsave(&pmu->lock, flags);
+        raw_spin_lock_irqsave(&pmu->lock, flags);
 
         /* mark event as deactivated and stopped */
         if (!(hwc->state & PERF_HES_STOPPED)) {
@@ -155,7 +162,7 @@ static void pmu_event_stop(struct perf_event *event, int mode)
                 hwc->state |= PERF_HES_UPTODATE;
         }
 
-        spin_unlock_irqrestore(&pmu->lock, flags);
+        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 }
 
 static int pmu_event_add(struct perf_event *event, int mode)
@@ -164,14 +171,14 @@ static int pmu_event_add(struct perf_event *event, int mode)
         struct hw_perf_event *hwc = &event->hw;
         unsigned long flags;
 
-        spin_lock_irqsave(&pmu->lock, flags);
+        raw_spin_lock_irqsave(&pmu->lock, flags);
 
         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 
         if (mode & PERF_EF_START)
                 __pmu_event_start(pmu, event);
 
-        spin_unlock_irqrestore(&pmu->lock, flags);
+        raw_spin_unlock_irqrestore(&pmu->lock, flags);
 
         return 0;
 }
 
@@ -297,89 +304,71 @@ static int power_cpu_exit(int cpu)
         struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
         int i, cu, ret = 0;
         int target = nr_cpumask_bits;
-        cpumask_var_t mask, tmp_mask;
 
         cu = cpu / cores_per_cu;
 
-        if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
-                return -ENOMEM;
-
-        if (!zalloc_cpumask_var(&tmp_mask, GFP_KERNEL)) {
-                ret = -ENOMEM;
-                goto out;
-        }
+        cpumask_clear(pmu->mask);
+        cpumask_clear(pmu->tmp_mask);
 
         for (i = 0; i < cores_per_cu; i++)
-                cpumask_set_cpu(i, mask);
+                cpumask_set_cpu(i, pmu->mask);
 
-        cpumask_shift_left(mask, mask, cu * cores_per_cu);
+        cpumask_shift_left(pmu->mask, pmu->mask, cu * cores_per_cu);
 
         cpumask_clear_cpu(cpu, &cpu_mask);
-        cpumask_clear_cpu(cpu, mask);
+        cpumask_clear_cpu(cpu, pmu->mask);
 
-        if (!cpumask_and(tmp_mask, mask, cpu_online_mask))
-                goto out1;
+        if (!cpumask_and(pmu->tmp_mask, pmu->mask, cpu_online_mask))
+                goto out;
 
         /*
          * find a new CPU on same compute unit, if was set in cpumask
          * and still some CPUs on compute unit, then move to the new
          * CPU
          */
-        target = cpumask_any(tmp_mask);
+        target = cpumask_any(pmu->tmp_mask);
         if (target < nr_cpumask_bits && target != cpu)
                 cpumask_set_cpu(target, &cpu_mask);
 
         WARN_ON(cpumask_empty(&cpu_mask));
 
-out1:
+out:
         /*
          * migrate events and context to new CPU
          */
         if (target < nr_cpumask_bits)
                 perf_pmu_migrate_context(pmu->pmu, cpu, target);
 
-        free_cpumask_var(tmp_mask);
-out:
-        free_cpumask_var(mask);
-
         return ret;
 }
 
 static int power_cpu_init(int cpu)
 {
-        int i, cu, ret = 0;
-        cpumask_var_t mask, dummy_mask;
-
-        cu = cpu / cores_per_cu;
+        struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
+        int i, cu;
 
-        if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
-                return -ENOMEM;
+        if (!pmu)
+                return 0;
 
-        if (!zalloc_cpumask_var(&dummy_mask, GFP_KERNEL)) {
-                ret = -ENOMEM;
-                goto out;
-        }
+        cu = cpu / cores_per_cu;
 
         for (i = 0; i < cores_per_cu; i++)
-                cpumask_set_cpu(i, mask);
+                cpumask_set_cpu(i, pmu->mask);
 
-        cpumask_shift_left(mask, mask, cu * cores_per_cu);
+        cpumask_shift_left(pmu->mask, pmu->mask, cu * cores_per_cu);
 
-        if (!cpumask_and(dummy_mask, mask, &cpu_mask))
+        if (!cpumask_and(pmu->tmp_mask, pmu->mask, &cpu_mask))
                 cpumask_set_cpu(cpu, &cpu_mask);
 
-        free_cpumask_var(dummy_mask);
-out:
-        free_cpumask_var(mask);
-
-        return ret;
+        return 0;
 }
 
 static int power_cpu_prepare(int cpu)
 {
         struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
         int phys_id = topology_physical_package_id(cpu);
+        int ret = 0;
 
         if (pmu)
                 return 0;
@@ -391,7 +380,17 @@ static int power_cpu_prepare(int cpu)
         if (!pmu)
                 return -ENOMEM;
 
-        spin_lock_init(&pmu->lock);
+        if (!zalloc_cpumask_var(&pmu->mask, GFP_KERNEL)) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        if (!zalloc_cpumask_var(&pmu->tmp_mask, GFP_KERNEL)) {
+                ret = -ENOMEM;
+                goto out1;
+        }
+
+        raw_spin_lock_init(&pmu->lock);
 
         INIT_LIST_HEAD(&pmu->active_list);
 
@@ -400,12 +399,21 @@ static int power_cpu_prepare(int cpu)
 
         per_cpu(amd_power_pmu, cpu) = pmu;
 
         return 0;
+
+out1:
+        free_cpumask_var(pmu->mask);
+out:
+        kfree(pmu);
+
+        return ret;
 }
 
 static void power_cpu_kfree(int cpu)
 {
         struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
 
+        free_cpumask_var(pmu->mask);
+        free_cpumask_var(pmu->tmp_mask);
         kfree(pmu);
 }
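
With this split, the two callbacks line up with the hotplug notifier from
the original patch roughly as below. This is only an abridged sketch to show
the intent (the other hotplug cases are omitted, and the context notes in
the comments are my understanding, not something the patch states):

static int power_cpu_notifier(struct notifier_block *self,
                              unsigned long action, void *hcpu)
{
        unsigned int cpu = (long)hcpu;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
                /* process context, may sleep: GFP_KERNEL allocations are fine here */
                if (power_cpu_prepare(cpu))
                        return NOTIFY_BAD;
                break;
        case CPU_STARTING:
                /* IRQs disabled on the incoming CPU: no allocations, only the preallocated masks */
                if (power_cpu_init(cpu))
                        return NOTIFY_BAD;
                break;
        /* remaining hotplug cases omitted from this sketch */
        }

        return NOTIFY_OK;
}

So the allocations happen in CPU_UP_PREPARE, where sleeping is allowed, and
CPU_STARTING only manipulates the preallocated pmu->mask and pmu->tmp_mask.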