From: Tony Luck <tony.l...@intel.com>

Includes all the core infrastructure to measure the total_bytes and bandwidth.
We have per-socket counters for both total system-wide L3 external bytes and
local socket memory-controller bytes. The current b/w is calculated over a
minimum elapsed time (time since the counter was last read) of 100ms. The OS
writes MSR_IA32_QM_EVTSEL to select the event, reads the counter from
MSR_IA32_QM_CTR, and uses the IA32_PQR_ASSOC MSR to associate the RMID with
the task. The tasks have a common RMID for cqm (cache quality of service
monitoring) and MBM. Hence most of the scheduling code is reused from cqm.

Signed-off-by: Vikas Shivappa <vikas.shiva...@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 159 ++++++++++++++++++++++++++++-
 1 file changed, 155 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index e45f5aa..b1c9663 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -13,6 +13,11 @@
 #define MSR_IA32_QM_CTR		0x0c8e
 #define MSR_IA32_QM_EVTSEL	0x0c8d
 
+/*
+ * The MBM counter is 24 bits wide. MBM_CNTR_MAX defines the max counter
+ * value.
+ */
+#define MBM_CNTR_MAX		0xffffff
 static u32 cqm_max_rmid = -1;
 static unsigned int cqm_l3_scale; /* supposedly cacheline size */
 static bool cqm_enabled, mbm_enabled;
@@ -69,6 +74,16 @@ static struct sample *mbm_total;
  */
 static struct sample *mbm_local;
 
+#define pkg_id	topology_physical_package_id(smp_processor_id())
+/*
+ * rmid_2_index returns the index for the rmid in the mbm_local/mbm_total
+ * arrays. mbm_total[] and mbm_local[] are linearly indexed by
+ * socket# * (max number of rmids per socket) + rmid, for example:
+ * RMID1 of Socket0: vrmid = 1
+ * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1
+ * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1
+ */
+#define rmid_2_index(rmid)	((pkg_id * (cqm_max_rmid + 1)) + rmid)
 /*
  * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
  * Also protects event->hw.cqm_rmid
@@ -92,8 +107,19 @@ static cpumask_t cqm_cpumask;
 #define RMID_VAL_UNAVAIL	(1ULL << 62)
 
 #define QOS_L3_OCCUP_EVENT_ID	(1 << 0)
+/*
+ * MBM Event IDs as defined in SDM section 17.15.5
+ * Event IDs are used to program the EVTSEL MSR before reading MBM counters
+ */
+enum mbm_evt_type {
+	QOS_MBM_TOTAL_EVENT_ID = 0x02,
+	QOS_MBM_LOCAL_EVENT_ID,
+	QOS_MBM_TOTAL_BW_EVENT_ID,
+	QOS_MBM_LOCAL_BW_EVENT_ID,
+};
 
-#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+#define QOS_MBM_BW_EVENT_MASK 0x04
+#define QOS_MBM_LOCAL_EVENT_MASK 0x01
 
 /*
  * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
@@ -423,9 +449,16 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 struct rmid_read {
 	u32 rmid;
 	atomic64_t value;
+	enum mbm_evt_type evt_type;
 };
 
 static void __intel_cqm_event_count(void *info);
+static void init_mbm_sample(u32 rmid, enum mbm_evt_type evt_type);
+
+static bool is_mbm_event(int e)
+{
+	return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_BW_EVENT_ID);
+}
 
 /*
  * Exchange the RMID of a group of events.
@@ -867,6 +900,98 @@ static void intel_cqm_rmid_rotate(struct work_struct *work)
 	schedule_delayed_work(&intel_cqm_rmid_work, delay);
 }
 
+static struct sample *update_sample(unsigned int rmid,
+				    enum mbm_evt_type evt_type, int first)
+{
+	ktime_t cur_time;
+	struct sample *mbm_current;
+	u32 vrmid = rmid_2_index(rmid);
+	u64 val, bytes, diff_time;
+	u32 eventid;
+
+	if (evt_type & QOS_MBM_LOCAL_EVENT_MASK) {
+		mbm_current = &mbm_local[vrmid];
+		eventid = QOS_MBM_LOCAL_EVENT_ID;
+	} else {
+		mbm_current = &mbm_total[vrmid];
+		eventid = QOS_MBM_TOTAL_EVENT_ID;
+	}
+
+	cur_time = ktime_get();
+	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+	rdmsrl(MSR_IA32_QM_CTR, val);
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return mbm_current;
+	val &= MBM_CNTR_MAX;
+
+	if (first) {
+		mbm_current->interval_start = cur_time;
+		mbm_current->prev_msr = val;
+		mbm_current->total_bytes = 0;
+		mbm_current->interval_bytes = 0;
+		mbm_current->bandwidth = 0;
+		return mbm_current;
+	}
+
+	if (val < mbm_current->prev_msr)
+		bytes = MBM_CNTR_MAX - mbm_current->prev_msr + val + 1;
+	else
+		bytes = val - mbm_current->prev_msr;
+	bytes *= cqm_l3_scale;
+
+	mbm_current->total_bytes += bytes;
+	mbm_current->interval_bytes += bytes;
+	mbm_current->prev_msr = val;
+	diff_time = ktime_ms_delta(cur_time, mbm_current->interval_start);
+
+	/*
+	 * The b/w measured is really the most recent/current b/w.
+	 * We wait till enough time has passed to avoid
+	 * arithmetic rounding problems. Having it at >=100ms,
+	 * such errors would be <=1%.
+	 */
+	if (diff_time > 100) {
+		bytes = mbm_current->interval_bytes * MSEC_PER_SEC;
+		do_div(bytes, diff_time);
+		mbm_current->bandwidth = bytes;
+		mbm_current->interval_bytes = 0;
+		mbm_current->interval_start = cur_time;
+	}
+
+	return mbm_current;
+}
+
+static u64 rmid_read_mbm(unsigned int rmid, enum mbm_evt_type evt_type)
+{
+	struct sample *mbm_current;
+
+	mbm_current = update_sample(rmid, evt_type, 0);
+
+	if (evt_type & QOS_MBM_BW_EVENT_MASK)
+		return mbm_current->bandwidth;
+	else
+		return mbm_current->total_bytes;
+}
+
+static void __intel_mbm_event_init(void *info)
+{
+	struct rmid_read *rr = info;
+
+	update_sample(rr->rmid, rr->evt_type, 1);
+}
+
+static void init_mbm_sample(u32 rmid, enum mbm_evt_type evt_type)
+{
+	struct rmid_read rr = {
+		.value = ATOMIC64_INIT(0),
+	};
+
+	rr.rmid = rmid;
+	rr.evt_type = evt_type;
+	/* on each socket, init sample */
+	on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
+}
+
 /*
  * Find a group and setup RMID.
  *
@@ -887,6 +1012,9 @@ static void intel_cqm_setup_event(struct perf_event *event,
 			/* All tasks in a group share an RMID */
 			event->hw.cqm_rmid = rmid;
 			*group = iter;
+			if (is_mbm_event(event->attr.config) &&
+			    __rmid_valid(rmid))
+				init_mbm_sample(rmid, event->attr.config);
 			return;
 		}
 	}
@@ -903,6 +1031,9 @@ static void intel_cqm_setup_event(struct perf_event *event,
 	else
 		rmid = __get_rmid();
 
+	if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
+		init_mbm_sample(rmid, event->attr.config);
+
 	event->hw.cqm_rmid = rmid;
 }
 
@@ -924,7 +1055,10 @@ static void intel_cqm_event_read(struct perf_event *event)
 	if (!__rmid_valid(rmid))
 		goto out;
 
-	val = __rmid_read(rmid);
+	if (is_mbm_event(event->attr.config))
+		val = rmid_read_mbm(rmid, event->attr.config);
+	else
+		val = __rmid_read(rmid);
 
 	/*
 	 * Ignore this reading on error states and do not update the value.
@@ -955,6 +1089,17 @@ static inline bool cqm_group_leader(struct perf_event *event)
 	return !list_empty(&event->hw.cqm_groups_entry);
 }
 
+static void __intel_mbm_event_count(void *info)
+{
+	struct rmid_read *rr = info;
+	u64 val;
+
+	val = rmid_read_mbm(rr->rmid, rr->evt_type);
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+	atomic64_add(val, &rr->value);
+}
+
 static u64 intel_cqm_event_count(struct perf_event *event)
 {
 	unsigned long flags;
@@ -1008,7 +1153,12 @@ static u64 intel_cqm_event_count(struct perf_event *event)
 	if (!__rmid_valid(rr.rmid))
 		goto out;
 
-	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+	if (is_mbm_event(event->attr.config)) {
+		rr.evt_type = event->attr.config;
+		on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, &rr, 1);
+	} else {
+		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+	}
 
 	raw_spin_lock_irqsave(&cache_lock, flags);
 	if (event->hw.cqm_rmid == rr.rmid)
@@ -1123,7 +1273,8 @@ static int intel_cqm_event_init(struct perf_event *event)
 	if (event->attr.type != intel_cqm_pmu.type)
 		return -ENOENT;
 
-	if (event->attr.config & ~QOS_EVENT_MASK)
+	if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
+	    (event->attr.config > QOS_MBM_LOCAL_BW_EVENT_ID))
 		return -EINVAL;
 
 	/* unsupported modes and filters */
-- 
1.9.1
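
For reference, below is a minimal user-space sketch of how the new events
could be exercised once the patch is applied. It is illustrative only and not
part of the patch: it assumes the PMU stays registered in sysfs as
"intel_cqm" (the name the existing cqm driver uses) and opens the raw config
value 0x02 (QOS_MBM_TOTAL_EVENT_ID from the enum added above); the local and
bandwidth events (0x03-0x05) would be read the same way.

/* mbm_read.c - illustrative sketch, not part of this patch */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(int argc, char **argv)
{
	struct perf_event_attr attr;
	uint64_t count;
	FILE *f;
	int type;
	long fd;
	pid_t pid = (argc > 1) ? (pid_t)atoi(argv[1]) : getpid();

	/* PMU type id assigned to the cqm/mbm event source at boot */
	f = fopen("/sys/bus/event_source/devices/intel_cqm/type", "r");
	if (!f || fscanf(f, "%d", &type) != 1) {
		fprintf(stderr, "intel_cqm PMU not found\n");
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = 0x02;	/* QOS_MBM_TOTAL_EVENT_ID (assumed raw encoding) */

	/* monitor the given task on all CPUs */
	fd = perf_event_open(&attr, pid, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);	/* let the task generate some memory traffic */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("total bytes: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}

For the *_BW configs the value read back would be the most recent bandwidth
in bytes/sec, computed by update_sample() over intervals of at least 100ms as
described above.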