Thomas, Acknowledged. I'll update the patch and generate separate patches wherever cqm changes are required. One thing: checkpatch.pl gave errors in the current cqm code, so I had to break lines as it complained about some lines exceeding 80 characters.
Regards. -Kanaka > -----Original Message----- > From: Thomas Gleixner [mailto:t...@linutronix.de] > Sent: Tuesday, July 28, 2015 12:59 AM > To: Kanaka Juvva > Cc: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will; Andi > Kleen; > LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; Autee, Priya V; x...@kernel.org; > mi...@redhat.com; H. Peter Anvin; Shivappa, Vikas > Subject: Re: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring > (MBM) PMU > > On Tue, 21 Jul 2015, Kanaka Juvva wrote: > > diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c > > b/arch/x86/kernel/cpu/perf_event_intel_cqm.c > > index 1880761..dc1ce58 100644 > > --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c > > +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c > > @@ -12,10 +12,20 @@ > > #define MSR_IA32_PQR_ASSOC 0x0c8f > > #define MSR_IA32_QM_CTR 0x0c8e > > #define MSR_IA32_QM_EVTSEL 0x0c8d > > +#define MAX_MBM_CNTR 0xffffff > > Can we please have a consistent name space MBM_... here? > > > +#define MBM_SOCKET_MAX 8 > > +#define MBM_TIME_DELTA_MAX 1000 > > +#define MBM_TIME_DELTA_MIN 1000 > > +#define MBM_SCALING_FACTOR 1000 > > +#define MBM_SCALING_HALF (MBM_SCALING_FACTOR/2) > > +#define MBM_FIFO_SIZE_MIN 10 > > +#define MBM_FIFO_SIZE_MAX 300 > > > > -static u32 cqm_max_rmid = -1; > > -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ > > > > +static u32 cqm_max_rmid = -1; > > +static unsigned int cqm_l3_scale;/* supposedly cacheline size */ > > Pointless white space change > > > +static unsigned mbm_window_size = MBM_FIFO_SIZE_MIN; static bool > > +cqm_llc_occ, cqm_mbm; > > /** > > * struct intel_pqr_state - State cache for the PQR MSR > > * @rmid: The cached Resource Monitoring ID > > @@ -42,6 +52,34 @@ struct intel_pqr_state { > > * interrupts disabled, which is sufficient for the protection. 
> > */ > > static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); > > +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu); static > > +DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free); > > + > > +/* > > + * mbm pmu is used for storing mbm_local and mbm_total > > + * events > > + */ > > +struct mbm_pmu { > > + spinlock_t lock; > > What is this lock protecting and how does it nest into perf locking? > > > + int n_active; /* number of active events */ > > If you want to document your struct members, please use kerneldoc and not > these hard to read tail comments. > > > + struct list_head active_list; > > + struct pmu *pmu; /* pointer to mbm perf_event */ > > + ktime_t timer_interval; /* in ktime_t unit */ > > + struct hrtimer hrtimer; > > +}; > > + > > +struct sample { > > + u64 bytes; /* mbm counter value read*/ > > + u64 cum_avg; /* current running average of bandwidth */ > > + ktime_t prev_time; > > + u64 index; /* current sample no */ > > + u32 mbmfifo[MBM_FIFO_SIZE_MAX]; /* window of last n bw values */ > > + u32 fifoin; /* mbmfifo in counter for sliding window */ > > + u32 fifoout; /* mbmfifo out counter for sliding window */ > > So above you use a proper layout which is actualy readable (aside of the tail > comments). Can you please stick with a consistent coding style? > > > +}; > > + > > +struct sample *mbm_total; /* curent stats for mbm_total events */ > > +struct sample *mbm_local; /* current stats for mbm_local events */ > > static ? And please get rid of these tail comments. > > > /* > > * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. > > @@ -66,6 +104,9 @@ static cpumask_t cqm_cpumask; > > #define RMID_VAL_UNAVAIL (1ULL << 62) > > > > #define QOS_L3_OCCUP_EVENT_ID (1 << 0) > > +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1) > > Please use tabs not spaces. > > > +#define QOS_MBM_LOCAL_EVENT_ID1 0x3 > > What's ID1 for heavens sake? 
Looks like > > (QOS_L3_OCCUP_EVENT_ID | QOS_MBM_TOTAL_EVENT_ID) > > So this wants a descriptive ID name and a comment. > > > +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2) > > > > #define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID > > > > @@ -90,7 +131,8 @@ static u32 intel_cqm_rotation_rmid; > > */ > > static inline bool __rmid_valid(u32 rmid) { > > - if (!rmid || rmid == INVALID_RMID) > > + WARN_ON_ONCE(rmid > cqm_max_rmid); > > + if (!rmid || (rmid == INVALID_RMID) || (rmid > cqm_max_rmid)) > > return false; > > How is that related to this patch? Looks like a fix or prerequisite change to > the > existing code. > > > return true; > > @@ -125,6 +167,7 @@ struct cqm_rmid_entry { > > enum rmid_recycle_state state; > > struct list_head list; > > unsigned long queue_time; > > + bool config; > > }; > > > > /* > > @@ -176,6 +219,17 @@ static inline struct cqm_rmid_entry > *__rmid_entry(u32 rmid) > > return entry; > > } > > > > +static void mbm_reset_stats(u32 rmid) { > > + u32 vrmid = topology_physical_package_id(smp_processor_id()) * > > + cqm_max_rmid + rmid; > > Can you please explain that magic calculation? It's far from obvious why this > would be correct. > > > + > > + if ((!cqm_mbm) || (!mbm_local) || (!mbm_total)) > > + return; > > + memset(&mbm_local[vrmid], 0, sizeof(struct sample)); > > + memset(&mbm_total[vrmid], 0, sizeof(struct sample)); } > > + > > /* > > * Returns < 0 on fail. > > * > > @@ -190,8 +244,10 @@ static u32 __get_rmid(void) > > if (list_empty(&cqm_rmid_free_lru)) > > return INVALID_RMID; > > > > - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, > list); > > + entry = list_first_entry(&cqm_rmid_free_lru, > > + struct cqm_rmid_entry, list); > > And the point of this change is? 
> > > list_del(&entry->list); > > + mbm_reset_stats(entry->rmid); > > > > return entry->rmid; > > } > > @@ -207,6 +263,7 @@ static void __put_rmid(u32 rmid) > > > > entry->queue_time = jiffies; > > entry->state = RMID_YOUNG; > > + mbm_reset_stats(rmid); > > > > list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } @@ -232,6 > > +289,8 @@ static int intel_cqm_setup_rmid_cache(void) > > > > INIT_LIST_HEAD(&entry->list); > > entry->rmid = r; > > + entry->config = false; > > + > > cqm_rmid_ptrs[r] = entry; > > > > list_add_tail(&entry->list, &cqm_rmid_free_lru); @@ -254,6 > +313,8 > > @@ fail: > > kfree(cqm_rmid_ptrs[r]); > > > > kfree(cqm_rmid_ptrs); > > + kfree(mbm_local); > > + kfree(mbm_total); > > return -ENOMEM; > > } > > > > @@ -403,9 +464,11 @@ static void __intel_cqm_event_count(void *info); > > static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) { > > struct perf_event *event; > > + > > Superfluous newline. > > > struct list_head *head = &group->hw.cqm_group_entry; > > u32 old_rmid = group->hw.cqm_rmid; > > > > + > > Another one. Sigh! 
> > > lockdep_assert_held(&cache_mutex); > > > > /* > > @@ -494,6 +557,166 @@ static bool intel_cqm_sched_in_event(u32 rmid) > > return false; > > } > > > > +static u32 mbm_fifo_sum_lastn_out(int rmid, bool is_localbw) > > Random space after u32 > > > +{ > > + u32 val = 0, i, j, index; > > + > > + if (is_localbw) { > > + if (++mbm_local[rmid].fifoout >= mbm_window_size) > > + mbm_local[rmid].fifoout = 0; > > + index = mbm_local[rmid].fifoout; > > + for (i = 0; i < mbm_window_size - 1; i++) { > > + if (index + i >= mbm_window_size) > > + j = index + i - mbm_window_size; > > + else > > + j = index + i; > > + val += mbm_local[rmid].mbmfifo[j]; > > + } > > + return val; > > + } > > + > > + if (++mbm_total[rmid].fifoout >= mbm_window_size) > > + mbm_total[rmid].fifoout = 0; > > + index = mbm_total[rmid].fifoout; > > + for (i = 0; i < mbm_window_size - 1; i++) { > > + if (index + i >= mbm_window_size) > > + j = index + i - mbm_window_size; > > + else > > + j = index + i; > > + val += mbm_total[rmid].mbmfifo[j]; > > + } > > The concept of helper functions to avoid code duplication is > definitely applicable here. > > > + return val; > > +} > > + > > +static int mbm_fifo_in(int rmid, u32 val, bool is_localbw) > > Random space after int. > > > +{ > > + if (is_localbw) { > > + mbm_local[rmid].mbmfifo[mbm_local[rmid].fifoin] = val; > > + if (++mbm_local[rmid].fifoin >= mbm_window_size) > > + mbm_local[rmid].fifoin = 0; > > + } else { > > + mbm_total[rmid].mbmfifo[mbm_total[rmid].fifoin] = val; > > + if (++mbm_total[rmid].fifoin >= mbm_window_size) > > + mbm_total[rmid].fifoin = 0; > > + } > > static void mbm_fifo_in(struct sample *, u32 val) > > would get rid of code duplication. > > > + return 0; > > What's the purpose of having a return value here? > > > +} > > + > > +/* > > + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and > reads > > + * its MSR counter. if (MSR current value < MSR previous value) it is an > > + * overflow and overflow is handled. 
If MSR is read within last 100ms, > > + * then the value is ignored; this will suppress small deltas. We don't > > + * process MBM samples that are within 100ms. Bandwidth is calculated as: > > + * bandwidth = difference of last two msr counter values/time difference. > > + * cum_avg = Running Average bandwidth of last 'n' samples that are > processed > > + * Sliding window is used to save the last 'n' samples. Hence, > > + * n = sliding_window_size > > + * cum_avg is scaled down by a factor MBM_SCALING_FACTOR and rounded > to nearest > > + * integer. User interface reads the BW in MB/sec. > > + * Rounding algorithm : (X + 0.5): > > + * where X is scaled BW value. To avoid floating point arithmetic : > > + * BW- unscaled value > > + * (BW + MBM_SCALING_HALF)/MBM_SCALING_FACTOR is computed which > gives the > > + * scaled bandwidth. > > This is completely unreadable. > > > + */ > > +static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local) > > +{ > > + u64 val, tmp, diff_time, cma, bytes, index; > > + bool overflow = false; > > + ktime_t cur_time; > > + u32 tmp32 = rmid; > > + u32 vrmid = topology_physical_package_id(smp_processor_id()) * > > + cqm_max_rmid + rmid; > > + > > + rmid = vrmid; > > This is completely backwards. > > tmp32 = rmid; > rmid = vrmid; > do_stuff(rmid); > rmid = tmp32; > do_other_stuff(rmid); > > Why can't you use vrmid for do_stuff() and leave rmid alone? Just > because it would make the code simpler to read? > > > + cur_time = ktime_get_real(); > > Why would this operate on wall time and not on clock monotonic time? 
> > > + if (read_mbm_local) { > > + cma = mbm_local[rmid].cum_avg; > > + diff_time = ktime_ms_delta(cur_time, > > + mbm_local[rmid].prev_time); > > + if (diff_time < 100) > > + return cma; > > + mbm_local[rmid].prev_time = ktime_set(0, > > + (unsigned long)ktime_to_ns(cur_time)); > > + bytes = mbm_local[rmid].bytes; > > + index = mbm_local[rmid].index; > > + rmid = tmp32; > > + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID1, > rmid); > > Why is this using ID1? > > > + } else { > > + cma = mbm_total[rmid].cum_avg; > > + diff_time = ktime_ms_delta(cur_time, > > + mbm_total[rmid].prev_time); > > + if (diff_time < 100) > > + return cma; > > + mbm_total[rmid].prev_time = ktime_set(0, > > + (unsigned long)ktime_to_ns(cur_time)); > > + bytes = mbm_total[rmid].bytes; > > + index = mbm_total[rmid].index; > > + rmid = tmp32; > > + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID, > rmid); > > + } > > Random space after tab. > > > + rdmsrl(MSR_IA32_QM_CTR, val); > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) > > + return val; > > + > > + tmp = val; > > You really have a faible for random storage. tmp gets assigned to > mbm_*[rmid].bytes further down. This makes no sense at all. > > > + if (val < bytes) /* overflow handle it */ { > > These tail comments are crap. > > > + val = MAX_MBM_CNTR - bytes + val; > > + overflow = true; > > + } else > > + val = val - bytes; > > That else clause is missing braces. > > > + if (diff_time < MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN) > > + val = (val * MBM_TIME_DELTA_MAX) / diff_time; > > That resolves to > > if (diff_time < 0) > > Which is always false because diff_time is u64. > > What's the logic behind this? > > > + > > + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma)) > > + /* First sample, we can't use the time delta */ > > + diff_time = MBM_TIME_DELTA_MAX; > > And this? 
> > > + > > + rmid = vrmid; > > More obfuscation > > > + if ((diff_time <= MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN) > || > > + overflow) { > > Again, that resolves to > > if (overflow) > > And even if MBM_TIME_DELTA_MAX would be not equal > MBM_TIME_DELTA_MIN > then this wants some explanation about > > MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN > > versus > > MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN > > and the whole logic behind this time delta magic, if there is any. > > > + int bw, ret; > > + > > + if (index & (index < mbm_window_size)) > > + val = cma * MBM_SCALING_FACTOR + val / index - > > + cma / index; > > + > > + val = (val + MBM_SCALING_HALF) / MBM_SCALING_FACTOR; > > + if (index >= mbm_window_size) { > > These conditionals along with the magic math are undocumented. > > > + /* > > + * Compute the sum of recent n-1 samples and slide the > > + * window by 1 > > + */ > > + ret = mbm_fifo_sum_lastn_out(rmid, read_mbm_local); > > + /* recalculate the running average with current bw */ > > + ret = (ret + val) / mbm_window_size; > > + if (ret < 0) > > The average of positive numbers becomes negative ? > > > + ret = 0; > > + bw = val; > > + val = ret; > > Your back and forth assignements of random variables is not making the > code any more readable. > > > + } else > > + bw = val; > > + /* save the recent bw in fifo */ > > + mbm_fifo_in(rmid, bw, read_mbm_local); > > + > > + if (read_mbm_local) { > > Oh well. You have this read_mbm_local conditional 3 times in this > function. Why don't you make the function oblivious of this and hand > in a pointer to mbm_local or mbm_total? Would be too simple and not > enough obfuscated, right? 
> > > + mbm_local[rmid].index++; > > + mbm_local[rmid].cum_avg = val; > > + mbm_local[rmid].bytes = tmp; > > + mbm_local[rmid].prev_time = ktime_set(0, > > + (unsigned long)ktime_to_ns(cur_time)); > > + } else { > > + mbm_total[rmid].index++; > > + mbm_total[rmid].cum_avg = val; > > + mbm_total[rmid].bytes = tmp; > > + mbm_total[rmid].prev_time = ktime_set(0, > > + (unsigned long)ktime_to_ns(cur_time)); > > + } > > + return val; > > + } > > + return cma; > > +} > > + > > /* > > * Initially use this constant for both the limbo queue time and the > > * rotation timer interval, pmu::hrtimer_interval_ms. > > @@ -568,7 +791,8 @@ static bool intel_cqm_rmid_stabilize(unsigned int > *available) > > /* > > * Test whether an RMID is free for each package. > > */ > > - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); > > + if (entry->config) > > This entry->config change wants to be a separate patch. It's > completely non obvious how this changes the behaviour of the existing code. > > > + on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, > true); > > > > list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { > > /* > > @@ -846,8 +1070,9 @@ static void intel_cqm_setup_event(struct perf_event > *event, > > struct perf_event **group) > > { > > struct perf_event *iter; > > - bool conflict = false; > > + > > u32 rmid; > > + bool conflict = false; > > Fits the overall picture of this patch: Sloppy > > > list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { > > rmid = iter->hw.cqm_rmid; > > @@ -875,6 +1100,40 @@ static void intel_cqm_setup_event(struct > perf_event *event, > > event->hw.cqm_rmid = rmid; > > } > > > > +static void intel_cqm_event_update(struct perf_event *event) > > +{ > > + unsigned int rmid; > > + u64 val = 0; > > + > > + /* > > + * Task events are handled by intel_cqm_event_count(). 
> > + */ > > + if (event->cpu == -1) > > + return; > > + > > + rmid = event->hw.cqm_rmid; > > + if (!__rmid_valid(rmid)) > > + return; > > + > > + switch (event->attr.config) { > > + case QOS_MBM_TOTAL_EVENT_ID: > > + val = __rmid_read_mbm(rmid, false); > > + break; > > + case QOS_MBM_LOCAL_EVENT_ID: > > + val = __rmid_read_mbm(rmid, true); > > + break; > > + default: > > + break; > > return? > > > + } > > + /* > > + * Ignore this reading on error states and do not update the value. > > + */ > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) > > + return; > > + > > + local64_set(&event->count, val); > > +} > > + > > static void intel_cqm_event_read(struct perf_event *event) > > { > > unsigned long flags; > > @@ -887,6 +1146,12 @@ static void intel_cqm_event_read(struct perf_event > *event) > > if (event->cpu == -1) > > return; > > > > + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) || > > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) > > + intel_cqm_event_update(event); > > + > > + if (!(event->attr.config & QOS_L3_OCCUP_EVENT_ID)) > > + return; > > raw_spin_lock_irqsave(&cache_lock, flags); > > rmid = event->hw.cqm_rmid; > > > > @@ -906,6 +1171,28 @@ out: > > raw_spin_unlock_irqrestore(&cache_lock, flags); > > } > > > > +static void __intel_cqm_event_total_bw_count(void *info) > > +{ > > + struct rmid_read *rr = info; > > + u64 val; > > + > > + val = __rmid_read_mbm(rr->rmid, false); > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) > > + return; > > + atomic64_add(val, &rr->value); > > +} > > + > > +static void __intel_cqm_event_local_bw_count(void *info) > > +{ > > + struct rmid_read *rr = info; > > + u64 val; > > + > > + val = __rmid_read_mbm(rr->rmid, true); > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) > > + return; > > + atomic64_add(val, &rr->value); > > You're really a fan of copy and paste. 
> > > +} > > + > > static void __intel_cqm_event_count(void *info) > > { > > struct rmid_read *rr = info; > > @@ -967,7 +1254,21 @@ static u64 intel_cqm_event_count(struct perf_event > *event) > > if (!__rmid_valid(rr.rmid)) > > goto out; > > > > - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); > > + switch (event->attr.config) { > > + case QOS_L3_OCCUP_EVENT_ID: > > + on_each_cpu_mask(&cqm_cpumask, > __intel_cqm_event_count, &rr, 1); > > + break; > > + case QOS_MBM_TOTAL_EVENT_ID: > > + on_each_cpu_mask(&cqm_cpumask, > __intel_cqm_event_total_bw_count, > > + &rr, 1); > > + break; > > + case QOS_MBM_LOCAL_EVENT_ID: > > + on_each_cpu_mask(&cqm_cpumask, > __intel_cqm_event_local_bw_count, > > + &rr, 1); > > + break; > > + default: > > + break; > > So there is nothing to do and you happily proceed? > > > + } > > > > raw_spin_lock_irqsave(&cache_lock, flags); > > if (event->hw.cqm_rmid == rr.rmid) > > @@ -977,6 +1278,39 @@ out: > > return __perf_event_count(event); > > } > > > static void intel_cqm_event_start(struct perf_event *event, int mode) > > { > > struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); > > @@ -995,19 +1329,48 @@ static void intel_cqm_event_start(struct > perf_event *event, int mode) > > } > > > > state->rmid = rmid; > > + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) { > > + struct cqm_rmid_entry *entry; > > + > > + entry = __rmid_entry(rmid); > > + entry->config = true; > > + } > > wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); > > + > > + if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) || > > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) && > (cqm_mbm)) { > > + int cpu = get_cpu(); > > pmu->start() is called with interrupts disabled. So why do you need > get_cpu() here? > > > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu); > > + > > + if (pmu) { > > + if (pmu->n_active == 0) > > + mbm_hrtimer_init(pmu); > > That pmu timer wants to be initialized when the pmu is initialized. 
> > > + pmu->n_active++; > > + list_add_tail(&event->active_entry, > > + &pmu->active_list); > > + if (pmu->n_active == 1) > > + mbm_start_hrtimer(pmu); > > + } > > + put_cpu(); > > + } > > } > > > > static void intel_cqm_event_stop(struct perf_event *event, int mode) > > { > > struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); > > + struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu); > > > > if (event->hw.cqm_state & PERF_HES_STOPPED) > > return; > > > > event->hw.cqm_state |= PERF_HES_STOPPED; > > > > - intel_cqm_event_read(event); > > + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) > > + intel_cqm_event_read(event); > > + > > + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) || > > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) > > + intel_cqm_event_update(event); > > > > if (!--state->rmid_usecnt) { > > state->rmid = 0; > > @@ -1015,8 +1378,18 @@ static void intel_cqm_event_stop(struct > perf_event *event, int mode) > > } else { > > WARN_ON_ONCE(!state->rmid); > > } > > + > > + if (pmu) { > > + WARN_ON_ONCE(pmu->n_active <= 0); > > + pmu->n_active--; > > + if (pmu->n_active == 0) > > + mbm_stop_hrtimer(pmu); > > + list_del(&event->active_entry); > > + } > > + > > } > > > > + > > Sigh > > > static int intel_cqm_event_add(struct perf_event *event, int mode) > > { > > > +static ssize_t > > +sliding_window_size_store(struct device *dev, > > + struct device_attribute *attr, > > + const char *buf, size_t count) > > +{ > > + unsigned int bytes; > > + int ret; > > + > > + ret = kstrtouint(buf, 0, &bytes); > > + if (ret) > > + return ret; > > + > > + mutex_lock(&cache_mutex); > > + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX) > > + mbm_window_size = bytes; > > + else > > + bytes = MBM_FIFO_SIZE_MIN; > > What kind of twisted logic is that? 
> > > + > > + mutex_unlock(&cache_mutex); > > + > > + return count; > > +} > > + > > static DEVICE_ATTR_RW(max_recycle_threshold); > > +static DEVICE_ATTR_RW(sliding_window_size); > > > > static struct attribute *intel_cqm_attrs[] = { > > &dev_attr_max_recycle_threshold.attr, > > + &dev_attr_sliding_window_size.attr, > > NULL, > > }; > > > > @@ -1241,16 +1699,17 @@ static inline void cqm_pick_event_reader(int cpu) > > > > for_each_cpu(i, &cqm_cpumask) { > > if (phys_id == topology_physical_package_id(i)) > > - return; /* already got reader for this socket */ > > + return; /* already got reader for this socket */ > > More random crap. > > > } > > > > cpumask_set_cpu(cpu, &cqm_cpumask); > > } > > > > -static void intel_cqm_cpu_prepare(unsigned int cpu) > > +static int intel_cqm_cpu_prepare(unsigned int cpu) > > { > > struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); > > struct cpuinfo_x86 *c = &cpu_data(cpu); > > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu); > > > > state->rmid = 0; > > state->closid = 0; > > @@ -1258,12 +1717,27 @@ static void intel_cqm_cpu_prepare(unsigned int > cpu) > > > > WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); > > WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); > > + > > + if ((!pmu) && (cqm_mbm)) { > > + pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL, > NUMA_NO_NODE); > > + if (!pmu) > > + return -ENOMEM; > > + spin_lock_init(&pmu->lock); > > + INIT_LIST_HEAD(&pmu->active_list); > > + pmu->pmu = &intel_cqm_pmu; > > + pmu->n_active = 0; > > + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX); > > + per_cpu(mbm_pmu, cpu) = pmu; > > + per_cpu(mbm_pmu_to_free, cpu) = NULL; > > So that mbm_pmu_to_free per cpu variable is NULL already and never > becomes anything else than NULL. What's the purpose of setting it NULL > again? And what's the purpose of it in general? 
> > > + } > > + return 0; > > } > > > > static void intel_cqm_cpu_exit(unsigned int cpu) > > { > > int phys_id = topology_physical_package_id(cpu); > > int i; > > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu); > > > > /* > > * Is @cpu a designated cqm reader? > > @@ -1280,6 +1754,13 @@ static void intel_cqm_cpu_exit(unsigned int cpu) > > break; > > } > > } > > + > > + /* cancel overflow polling timer for CPU */ > > + if (pmu) > > + mbm_stop_hrtimer(pmu); > > + kfree(mbm_local); > > + kfree(mbm_total); > > So this frees the module global storage if one cpu is unplugged and > leaves the pointer initialized so the rest of the code can happily > dereference it. > > > + > > } > > > > static int intel_cqm_cpu_notifier(struct notifier_block *nb, > > @@ -1289,7 +1770,7 @@ static int intel_cqm_cpu_notifier(struct > notifier_block *nb, > > > > switch (action & ~CPU_TASKS_FROZEN) { > > case CPU_UP_PREPARE: > > - intel_cqm_cpu_prepare(cpu); > > + return intel_cqm_cpu_prepare(cpu); > > break; > > case CPU_DOWN_PREPARE: > > intel_cqm_cpu_exit(cpu); > > @@ -1305,17 +1786,74 @@ static int intel_cqm_cpu_notifier(struct > notifier_block *nb, > > static const struct x86_cpu_id intel_cqm_match[] = { > > { .vendor = X86_VENDOR_INTEL, .feature = > X86_FEATURE_CQM_OCCUP_LLC }, > > {} > > + }, intel_mbm_match[] = { > > What's wrong with having a separate > > static const struct x86_cpu_id intel_mbm_match[] = { > > > + { .vendor = X86_VENDOR_INTEL, .feature = > X86_FEATURE_CQM_MBM_LOCAL }, > > + {} > > }; > > That again would make the change obvious, but that's not on your agenda. > > > > > static int __init intel_cqm_init(void) > > { > > char *str, scale[20]; > > - int i, cpu, ret; > > + int i = 0, cpu, ret; > > Initializing i is required because? 
> > > - if (!x86_match_cpu(intel_cqm_match)) > > + if (!x86_match_cpu(intel_cqm_match) && > > + (!x86_match_cpu(intel_mbm_match))) > > return -ENODEV; > > > > cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; > > + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid; > > + > > + if (x86_match_cpu(intel_cqm_match)) { > > + cqm_llc_occ = true; > > + intel_cqm_events_group.attrs = intel_cmt_events_attr; > > + } else > > + cqm_llc_occ = false; > > I see you do not trust the compiler to put cqm_llc_occ into the BSS > section and neither the kernel to zero it out proper. > > > + > > + if (x86_match_cpu(intel_mbm_match)) { > > + u32 mbm_scale_rounded = 0; > > + > > + cqm_mbm = true; > > + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; > > + /* > > + * MBM counter values are in bytes. To conver this to MB/sec, > > + * we scale the MBM scale factor by 1000. Another 1000 factor > > + * scaling down is done > > + * after reading the counter val i.e. in the function > > + * __rmid_read_mbm() > > So the event attribute string is in KB/sec and at some other place you > convert it to MB/sec? There seems to be some hidden logic for this, > but that's definitely not decodeable for me. > > > + */ > > + mbm_scale_rounded = (cqm_l3_scale + 500) / 1000; > > div_round_up > > > + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid; > > + snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded); > > + str = kstrdup(scale, GFP_KERNEL); > > + if (!str) { > > + ret = -ENOMEM; > > + goto out; > > + } > > + > > + if (cqm_llc_occ) > > + intel_cqm_events_group.attrs = > > + intel_cmt_mbm_events_attr; > > And this line break is required because? > > > + else > > + intel_cqm_events_group.attrs = > intel_mbm_events_attr; > > + > > + event_attr_intel_cqm_llc_local_bw_scale.event_str > > + = event_attr_intel_cqm_llc_total_bw_scale.event_str = str; > > You really love to write unreadable crap. This is neither perl nor the > obfuscated c-code contest. 
What's wrong with two separate lines which > assign 'str' to each of the event attributes? > > > + mbm_local = kzalloc_node(sizeof(struct sample) * > > + (cqm_max_rmid + 1) * MBM_SOCKET_MAX, > > Calculating the size in a separate line with a proper variable would > make it readable. > > > + GFP_KERNEL, NUMA_NO_NODE); > > + if (!mbm_local) { > > Leaks str > > > + ret = -ENOMEM; > > + goto out; > > + } > > + mbm_total = kzalloc_node(sizeof(struct sample) * > > + (cqm_max_rmid + 1) * MBM_SOCKET_MAX, > > + GFP_KERNEL, NUMA_NO_NODE); > > + if (!mbm_total) { > > + ret = -ENOMEM; > > Leaks str and mbm_local > > > + goto out; > > + } > > What about sticking that mess into a separate function? > > > + } else > > + cqm_mbm = false; > > Sigh. > > > /* > > * It's possible that not all resources support the same number > > @@ -1328,44 +1866,48 @@ static int __init intel_cqm_init(void) > > */ > > cpu_notifier_register_begin(); > > > > - for_each_online_cpu(cpu) { > > - struct cpuinfo_x86 *c = &cpu_data(cpu); > > + if (cqm_llc_occ) { > > + for_each_online_cpu(cpu) { > > + struct cpuinfo_x86 *c = &cpu_data(cpu); > > > > - if (c->x86_cache_max_rmid < cqm_max_rmid) > > - cqm_max_rmid = c->x86_cache_max_rmid; > > + if (c->x86_cache_max_rmid < cqm_max_rmid) > > + cqm_max_rmid = c->x86_cache_max_rmid; > > > > - if (c->x86_cache_occ_scale != cqm_l3_scale) { > > - pr_err("Multiple LLC scale values, disabling\n"); > > - ret = -EINVAL; > > - goto out; > > + if (c->x86_cache_occ_scale != cqm_l3_scale) { > > + pr_err("Multiple LLC scale values, > > disabling\n"); > > + ret = -EINVAL; > > More memory leaks > > > + goto out; > > + } > > } > > - } > > > > - /* > > - * A reasonable upper limit on the max threshold is the number > > - * of lines tagged per RMID if all RMIDs have the same number of > > - * lines tagged in the LLC. > > - * > > - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. 
> > - */ > > - __intel_cqm_max_threshold = > > + /* > > + * A reasonable upper limit on the max threshold is the number > > + * of lines tagged per RMID if all RMIDs have the same number > of > > + * lines tagged in the LLC. > > + * > > + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. > > + */ > > + __intel_cqm_max_threshold = > > boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); > > > > - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); > > - str = kstrdup(scale, GFP_KERNEL); > > - if (!str) { > > - ret = -ENOMEM; > > - goto out; > > - } > > + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); > > + str = kstrdup(scale, GFP_KERNEL); > > + if (!str) { > > + ret = -ENOMEM; > > Ditto. > > > + goto out; > > + } > > > > - event_attr_intel_cqm_llc_scale.event_str = str; > > + event_attr_intel_cqm_llc_scale.event_str = str; > > + } > > > > ret = intel_cqm_setup_rmid_cache(); > > if (ret) > > goto out; > > > > for_each_online_cpu(i) { > > - intel_cqm_cpu_prepare(i); > > + ret = intel_cqm_cpu_prepare(i); > > + if (ret) > > + goto out; > > And that leaves even more half initialized stuff around. > > > cqm_pick_event_reader(i); > > } > > I fear your assessment in > > > http://lkml.kernel.org/r/06033C969873E840BDE9FC9B584F66B58BB89D@ORS > MSX116.amr.corp.intel.com > > is slightly wrong. > > That's a completely convoluted mess. Aside of that its broken all over > the place. > > This wants to be split up into preparatory patches which remodel the > existing code and move out cqm stuff into helper functions, so the mbm > stuff can be plugged in with its own helpers nicely. > > Thanks, > > tglx -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/