RE: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU

Juvva, Kanaka D Mon, 03 Aug 2015 10:52:28 -0700

Thomas,

Acknowledged. I'll update the patch and generate separate patch(s) wherever cqm 
changes 
are required. One thing was checkpatch.pl gave errors in current cqm code. I 
had to break lines
as it complained about some line(s) exceeding 80 chars.


Regards.
-Kanaka

> -----Original Message-----
> From: Thomas Gleixner [mailto:t...@linutronix.de]
> Sent: Tuesday, July 28, 2015 12:59 AM
> To: Kanaka Juvva
> Cc: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will; Andi 
> Kleen;
> LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; Autee, Priya V; x...@kernel.org;
> mi...@redhat.com; H. Peter Anvin; Shivappa, Vikas
> Subject: Re: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring
> (MBM) PMU
> 
> On Tue, 21 Jul 2015, Kanaka Juvva wrote:
> > diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > index 1880761..dc1ce58 100644
> > --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > @@ -12,10 +12,20 @@
> >  #define MSR_IA32_PQR_ASSOC 0x0c8f
> >  #define MSR_IA32_QM_CTR            0x0c8e
> >  #define MSR_IA32_QM_EVTSEL 0x0c8d
> > +#define MAX_MBM_CNTR            0xffffff
> 
> Can we please have a consistent name space MBM_... here?
> 
> > +#define MBM_SOCKET_MAX          8
> > +#define MBM_TIME_DELTA_MAX      1000
> > +#define MBM_TIME_DELTA_MIN      1000
> > +#define MBM_SCALING_FACTOR      1000
> > +#define MBM_SCALING_HALF        (MBM_SCALING_FACTOR/2)
> > +#define MBM_FIFO_SIZE_MIN       10
> > +#define MBM_FIFO_SIZE_MAX       300
> >
> > -static u32 cqm_max_rmid = -1;
> > -static unsigned int cqm_l3_scale; /* supposedly cacheline size */
> >
> > +static u32 cqm_max_rmid = -1;
> > +static unsigned int cqm_l3_scale;/* supposedly cacheline size */
> 
> Pointless white space change
> 
> > +static unsigned mbm_window_size = MBM_FIFO_SIZE_MIN; static bool
> > +cqm_llc_occ, cqm_mbm;
> >  /**
> >   * struct intel_pqr_state - State cache for the PQR MSR
> >   * @rmid:          The cached Resource Monitoring ID
> > @@ -42,6 +52,34 @@ struct intel_pqr_state {
> >   * interrupts disabled, which is sufficient for the protection.
> >   */
> >  static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
> > +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu); static
> > +DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
> > +
> > +/*
> > + * mbm pmu is used for storing mbm_local and mbm_total
> > + * events
> > + */
> > +struct mbm_pmu {
> > +   spinlock_t       lock;
> 
> What is this lock protecting and how does it nest into perf locking?
> 
> > +   int              n_active; /* number of active events */
> 
> If you want to document your struct members, please use kerneldoc and not
> these hard to read tail comments.
> 
> > +   struct list_head active_list;
> > +   struct pmu       *pmu; /* pointer to mbm perf_event */
> > +   ktime_t          timer_interval; /* in ktime_t unit */
> > +   struct hrtimer   hrtimer;
> > +};
> > +
> > +struct sample {
> > +   u64 bytes; /* mbm counter value read*/
> > +   u64 cum_avg;   /* current running average of bandwidth */
> > +   ktime_t prev_time;
> > +   u64 index; /* current sample no */
> > +   u32 mbmfifo[MBM_FIFO_SIZE_MAX]; /* window of last n bw values */
> > +   u32  fifoin; /* mbmfifo in counter for sliding window */
> > +   u32  fifoout; /* mbmfifo out counter for sliding window */
> 
> So above you use a proper layout which is actualy readable (aside of the tail
> comments). Can you please stick with a consistent coding style?
> 
> > +};
> > +
> > +struct sample *mbm_total; /* curent stats for  mbm_total events */
> > +struct sample *mbm_local; /* current stats for mbm_local events */
> 
> static ? And please get rid of these tail comments.
> 
> >  /*
> >   * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
> > @@ -66,6 +104,9 @@ static cpumask_t cqm_cpumask;
> >  #define RMID_VAL_UNAVAIL   (1ULL << 62)
> >
> >  #define QOS_L3_OCCUP_EVENT_ID      (1 << 0)
> > +#define QOS_MBM_TOTAL_EVENT_ID  (1 << 1)
> 
> Please use tabs not spaces.
> 
> > +#define QOS_MBM_LOCAL_EVENT_ID1 0x3
> 
> What's ID1 for heavens sake? Looks like
> 
>        (QOS_L3_OCCUP_EVENT_ID | QOS_MBM_TOTAL_EVENT_ID)
> 
> So this wants a descriptive ID name and a comment.
> 
> > +#define QOS_MBM_LOCAL_EVENT_ID  (1 << 2)
> >
> >  #define QOS_EVENT_MASK     QOS_L3_OCCUP_EVENT_ID
> >
> > @@ -90,7 +131,8 @@ static u32 intel_cqm_rotation_rmid;
> >   */
> >  static inline bool __rmid_valid(u32 rmid)  {
> > -   if (!rmid || rmid == INVALID_RMID)
> > +   WARN_ON_ONCE(rmid > cqm_max_rmid);
> > +   if (!rmid || (rmid == INVALID_RMID) || (rmid > cqm_max_rmid))
> >             return false;
> 
> How is that related to this patch? Looks like a fix or prerequisite change to 
> the
> existing code.
> 
> >     return true;
> > @@ -125,6 +167,7 @@ struct cqm_rmid_entry {
> >     enum rmid_recycle_state state;
> >     struct list_head list;
> >     unsigned long queue_time;
> > +   bool config;
> >  };
> >
> >  /*
> > @@ -176,6 +219,17 @@ static inline struct cqm_rmid_entry
> *__rmid_entry(u32 rmid)
> >     return entry;
> >  }
> >
> > +static void mbm_reset_stats(u32 rmid) {
> > +   u32  vrmid =  topology_physical_package_id(smp_processor_id()) *
> > +                                              cqm_max_rmid + rmid;
> 
> Can you please explain that magic calculation? It's far from obvious why this
> would be correct.
> 
> > +
> > +   if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
> > +           return;
> > +   memset(&mbm_local[vrmid], 0, sizeof(struct sample));
> > +   memset(&mbm_total[vrmid], 0, sizeof(struct sample)); }
> > +
> >  /*
> >   * Returns < 0 on fail.
> >   *
> > @@ -190,8 +244,10 @@ static u32 __get_rmid(void)
> >     if (list_empty(&cqm_rmid_free_lru))
> >             return INVALID_RMID;
> >
> > -   entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry,
> list);
> > +   entry = list_first_entry(&cqm_rmid_free_lru,
> > +                             struct cqm_rmid_entry, list);
> 
> And the point of this change is?
> 
> >     list_del(&entry->list);
> > +   mbm_reset_stats(entry->rmid);
> >
> >     return entry->rmid;
> >  }
> > @@ -207,6 +263,7 @@ static void __put_rmid(u32 rmid)
> >
> >     entry->queue_time = jiffies;
> >     entry->state = RMID_YOUNG;
> > +   mbm_reset_stats(rmid);
> >
> >     list_add_tail(&entry->list, &cqm_rmid_limbo_lru);  } @@ -232,6
> > +289,8 @@ static int intel_cqm_setup_rmid_cache(void)
> >
> >             INIT_LIST_HEAD(&entry->list);
> >             entry->rmid = r;
> > +           entry->config = false;
> > +
> >             cqm_rmid_ptrs[r] = entry;
> >
> >             list_add_tail(&entry->list, &cqm_rmid_free_lru); @@ -254,6
> +313,8
> > @@ fail:
> >             kfree(cqm_rmid_ptrs[r]);
> >
> >     kfree(cqm_rmid_ptrs);
> > +   kfree(mbm_local);
> > +   kfree(mbm_total);
> >     return -ENOMEM;
> >  }
> >
> > @@ -403,9 +464,11 @@ static void __intel_cqm_event_count(void *info);
> > static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)  {
> >     struct perf_event *event;
> > +
> 
> Superfluous newline.
> 
> >     struct list_head *head = &group->hw.cqm_group_entry;
> >     u32 old_rmid = group->hw.cqm_rmid;
> >
> > +
> 
> Another one. Sigh!
> 
> >     lockdep_assert_held(&cache_mutex);
> >
> >     /*
> > @@ -494,6 +557,166 @@ static bool intel_cqm_sched_in_event(u32 rmid)
> >     return false;
> >  }
> >
> > +static u32  mbm_fifo_sum_lastn_out(int rmid, bool is_localbw)
> 
> Random space after u32
> 
> > +{
> > +   u32 val = 0, i, j, index;
> > +
> > +   if (is_localbw) {
> > +           if (++mbm_local[rmid].fifoout >=  mbm_window_size)
> > +                   mbm_local[rmid].fifoout =  0;
> > +           index =  mbm_local[rmid].fifoout;
> > +           for (i = 0; i < mbm_window_size - 1; i++) {
> > +                   if (index + i >= mbm_window_size)
> > +                           j = index + i - mbm_window_size;
> > +                   else
> > +                           j = index + i;
> > +                   val += mbm_local[rmid].mbmfifo[j];
> > +           }
> > +           return val;
> > +   }
> > +
> > +   if (++mbm_total[rmid].fifoout >=  mbm_window_size)
> > +           mbm_total[rmid].fifoout = 0;
> > +   index =  mbm_total[rmid].fifoout;
> > +   for (i = 0; i < mbm_window_size - 1; i++) {
> > +           if (index + i >= mbm_window_size)
> > +                   j = index + i - mbm_window_size;
> > +           else
> > +                   j = index + i;
> > +           val += mbm_total[rmid].mbmfifo[j];
> > +   }
> 
> The concept of helper functions to avoid code duplication is
> definitely applicable here.
> 
> > +   return val;
> > +}
> > +
> > +static int  mbm_fifo_in(int rmid, u32 val, bool is_localbw)
> 
> Random space after int.
> 
> > +{
> > +   if (is_localbw) {
> > +           mbm_local[rmid].mbmfifo[mbm_local[rmid].fifoin] = val;
> > +           if (++mbm_local[rmid].fifoin >=  mbm_window_size)
> > +                   mbm_local[rmid].fifoin =  0;
> > +   } else {
> > +           mbm_total[rmid].mbmfifo[mbm_total[rmid].fifoin] = val;
> > +           if (++mbm_total[rmid].fifoin >=  mbm_window_size)
> > +                   mbm_total[rmid].fifoin =  0;
> > +   }
> 
> static void mbm_fifo_in(struct sample *, u32 val)
> 
> would get rid of code duplication.
> 
> > +   return 0;
> 
> What's the purpose of having a return value here?
> 
> > +}
> > +
> > +/*
> > + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and
> reads
> > + * its MSR counter. if (MSR current value < MSR previous value) it is an
> > + * overflow and overflow is handled. If MSR is read within last 100ms,
> > + * then the value is ignored; this will suppress small deltas. We don't
> > + * process MBM samples that are within 100ms. Bandwidth is calculated as:
> > + * bandwidth = difference of last two msr counter values/time difference.
> > + * cum_avg = Running Average bandwidth of last 'n' samples that are
> processed
> > + * Sliding window is used to save the last 'n' samples. Hence,
> > + * n = sliding_window_size
> > + * cum_avg is scaled down by a  factor MBM_SCALING_FACTOR and rounded
> to nearest
> > + * integer. User interface reads the BW in MB/sec.
> > + * Rounding algorithm : (X + 0.5):
> > + * where X is scaled BW value. To avoid floating point arithmetic :
> > + * BW- unscaled value
> > + * (BW + MBM_SCALING_HALF)/MBM_SCALING_FACTOR is computed which
> gives the
> > + * scaled bandwidth.
> 
> This is completely unreadable.
> 
> > + */
> > +static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local)
> > +{
> > +   u64 val, tmp, diff_time, cma, bytes, index;
> > +   bool overflow = false;
> > +   ktime_t cur_time;
> > +   u32 tmp32 = rmid;
> > +   u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > +                                            cqm_max_rmid + rmid;
> > +
> > +   rmid = vrmid;
> 
> This is completely backwards.
> 
>       tmp32 = rmid;
>       rmid = vrmid;
>       do_stuff(rmid);
>       rmid = tmp32;
>       do_other_stuff(rmid);
> 
> Why can't you use vrmid for do_stuff() and leave rmid alone? Just
> because it would make the code simpler to read?
> 
> > +   cur_time = ktime_get_real();
> 
> Why would this operate on wall time and not on clock monotonic time?
> 
> > +   if (read_mbm_local) {
> > +           cma = mbm_local[rmid].cum_avg;
> > +           diff_time = ktime_ms_delta(cur_time,
> > +                              mbm_local[rmid].prev_time);
> > +           if (diff_time < 100)
> > +                   return cma;
> > +           mbm_local[rmid].prev_time = ktime_set(0,
> > +                           (unsigned long)ktime_to_ns(cur_time));
> > +           bytes = mbm_local[rmid].bytes;
> > +           index = mbm_local[rmid].index;
> > +           rmid = tmp32;
> > +           wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID1,
> rmid);
> 
> Why is this using ID1?
> 
> > +   } else {
> > +           cma = mbm_total[rmid].cum_avg;
> > +           diff_time = ktime_ms_delta(cur_time,
> > +                                      mbm_total[rmid].prev_time);
> > +           if (diff_time < 100)
> > +                   return cma;
> > +           mbm_total[rmid].prev_time = ktime_set(0,
> > +                                   (unsigned long)ktime_to_ns(cur_time));
> > +           bytes = mbm_total[rmid].bytes;
> > +           index = mbm_total[rmid].index;
> > +           rmid = tmp32;
> > +           wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID,
> rmid);
> > +      }
> 
> Random space after tab.
> 
> > +   rdmsrl(MSR_IA32_QM_CTR, val);
> > +   if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > +           return val;
> > +
> > +   tmp = val;
> 
> You really have a faible for random storage. tmp gets assigned to
> mbm_*[rmid].bytes further down. This makes no sense at all.
> 
> > +   if (val < bytes) /* overflow handle it */ {
> 
> These tail comments are crap.
> 
> > +           val = MAX_MBM_CNTR - bytes + val;
> > +           overflow = true;
> > +   } else
> > +           val = val - bytes;
> 
> That else clause is missing braces.
> 
> > +   if (diff_time < MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN)
> > +           val =  (val * MBM_TIME_DELTA_MAX) / diff_time;
> 
> That resolves to
> 
>       if (diff_time < 0)
> 
> Which is always false because diff_time is u64.
> 
> What's the logic behind this?
> 
> > +
> > +   if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> > +           /* First sample, we can't use the time delta */
> > +           diff_time = MBM_TIME_DELTA_MAX;
> 
> And this?
> 
> > +
> > +   rmid = vrmid;
> 
> More obfuscation
> 
> > +   if ((diff_time <= MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN)
> ||
> > +                   overflow) {
> 
> Again, that resolves to
> 
>       if (overflow)
> 
> And even if MBM_TIME_DELTA_MAX would be not equal
> MBM_TIME_DELTA_MIN
> then this wants some explanation about
> 
>      MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN
> 
> versus
> 
>      MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN
> 
> and the whole logic behind this time delta magic, if there is any.
> 
> > +           int bw, ret;
> > +
> > +           if (index   & (index < mbm_window_size))
> > +                   val = cma * MBM_SCALING_FACTOR  + val / index -
> > +                                   cma / index;
> > +
> > +           val = (val + MBM_SCALING_HALF) / MBM_SCALING_FACTOR;
> > +           if (index >= mbm_window_size) {
> 
> These conditionals along with the magic math are undocumented.
> 
> > +                   /*
> > +                    * Compute the sum of recent n-1 samples and slide the
> > +                    * window by 1
> > +                    */
> > +                   ret = mbm_fifo_sum_lastn_out(rmid, read_mbm_local);
> > +                   /* recalculate the running average with current bw */
> > +                   ret = (ret + val) / mbm_window_size;
> > +                   if (ret < 0)
> 
> The average of positive numbers becomes negative ?
> 
> > +                           ret = 0;
> > +                   bw = val;
> > +                   val = ret;
> 
> Your back and forth assignements of random variables is not making the
> code any more readable.
> 
> > +           } else
> > +                   bw = val;
> > +           /* save the recent bw in fifo */
> > +           mbm_fifo_in(rmid, bw, read_mbm_local);
> > +
> > +           if (read_mbm_local) {
> 
> Oh well. You have this read_mbm_local conditional 3 times in this
> function. Why don't you make the function oblivious of this and hand
> in a pointer to mbm_local or mbm_total? Would be too simple and not
> enough obfuscated, right?
> 
> > +                   mbm_local[rmid].index++;
> > +                   mbm_local[rmid].cum_avg = val;
> > +                   mbm_local[rmid].bytes = tmp;
> > +                   mbm_local[rmid].prev_time = ktime_set(0,
> > +                                   (unsigned long)ktime_to_ns(cur_time));
> > +           } else {
> > +                   mbm_total[rmid].index++;
> > +                   mbm_total[rmid].cum_avg = val;
> > +                   mbm_total[rmid].bytes = tmp;
> > +                   mbm_total[rmid].prev_time = ktime_set(0,
> > +                                   (unsigned long)ktime_to_ns(cur_time));
> > +           }
> > +           return val;
> > +   }
> > +   return  cma;
> > +}
> > +
> >  /*
> >   * Initially use this constant for both the limbo queue time and the
> >   * rotation timer interval, pmu::hrtimer_interval_ms.
> > @@ -568,7 +791,8 @@ static bool intel_cqm_rmid_stabilize(unsigned int
> *available)
> >     /*
> >      * Test whether an RMID is free for each package.
> >      */
> > -   on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
> > +   if (entry->config)
> 
> This entry->config change wants to be a separate patch. It's
> completely non obvious how this changes the behaviour of the existing code.
> 
> > +           on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL,
> true);
> >
> >     list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
> >             /*
> > @@ -846,8 +1070,9 @@ static void intel_cqm_setup_event(struct perf_event
> *event,
> >                               struct perf_event **group)
> >  {
> >     struct perf_event *iter;
> > -   bool conflict = false;
> > +
> >     u32 rmid;
> > +   bool conflict = false;
> 
> Fits the overall picture of this patch: Sloppy
> 
> >     list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
> >             rmid = iter->hw.cqm_rmid;
> > @@ -875,6 +1100,40 @@ static void intel_cqm_setup_event(struct
> perf_event *event,
> >     event->hw.cqm_rmid = rmid;
> >  }
> >
> > +static void intel_cqm_event_update(struct perf_event *event)
> > +{
> > +   unsigned int rmid;
> > +   u64 val = 0;
> > +
> > +   /*
> > +    * Task events are handled by intel_cqm_event_count().
> > +    */
> > +   if (event->cpu == -1)
> > +           return;
> > +
> > +   rmid = event->hw.cqm_rmid;
> > +   if (!__rmid_valid(rmid))
> > +           return;
> > +
> > +   switch (event->attr.config) {
> > +   case QOS_MBM_TOTAL_EVENT_ID:
> > +           val = __rmid_read_mbm(rmid, false);
> > +           break;
> > +   case QOS_MBM_LOCAL_EVENT_ID:
> > +           val = __rmid_read_mbm(rmid, true);
> > +           break;
> > +   default:
> > +           break;
> 
>       return?
> 
> > +   }
> > +   /*
> > +    * Ignore this reading on error states and do not update the value.
> > +    */
> > +   if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > +           return;
> > +
> > +   local64_set(&event->count, val);
> > +}
> > +
> >  static void intel_cqm_event_read(struct perf_event *event)
> >  {
> >     unsigned long flags;
> > @@ -887,6 +1146,12 @@ static void intel_cqm_event_read(struct perf_event
> *event)
> >     if (event->cpu == -1)
> >             return;
> >
> > +   if  ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > +        (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> > +           intel_cqm_event_update(event);
> > +
> > +   if (!(event->attr.config &  QOS_L3_OCCUP_EVENT_ID))
> > +           return;
> >     raw_spin_lock_irqsave(&cache_lock, flags);
> >     rmid = event->hw.cqm_rmid;
> >
> > @@ -906,6 +1171,28 @@ out:
> >     raw_spin_unlock_irqrestore(&cache_lock, flags);
> >  }
> >
> > +static void __intel_cqm_event_total_bw_count(void *info)
> > +{
> > +   struct rmid_read *rr = info;
> > +   u64 val;
> > +
> > +   val = __rmid_read_mbm(rr->rmid, false);
> > +   if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > +           return;
> > +   atomic64_add(val, &rr->value);
> > +}
> > +
> > +static void __intel_cqm_event_local_bw_count(void *info)
> > +{
> > +   struct rmid_read *rr = info;
> > +   u64 val;
> > +
> > +   val = __rmid_read_mbm(rr->rmid, true);
> > +   if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > +           return;
> > +   atomic64_add(val, &rr->value);
> 
> You're really a fan of copy and paste.
> 
> > +}
> > +
> >  static void __intel_cqm_event_count(void *info)
> >  {
> >     struct rmid_read *rr = info;
> > @@ -967,7 +1254,21 @@ static u64 intel_cqm_event_count(struct perf_event
> *event)
> >     if (!__rmid_valid(rr.rmid))
> >             goto out;
> >
> > -   on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
> > +   switch (event->attr.config) {
> > +   case  QOS_L3_OCCUP_EVENT_ID:
> > +           on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_count, &rr, 1);
> > +           break;
> > +   case  QOS_MBM_TOTAL_EVENT_ID:
> > +           on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_total_bw_count,
> > +                            &rr, 1);
> > +           break;
> > +   case  QOS_MBM_LOCAL_EVENT_ID:
> > +           on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_local_bw_count,
> > +                            &rr, 1);
> > +           break;
> > +   default:
> > +           break;
> 
> So there is nothing to do and you happily proceed?
> 
> > +   }
> >
> >     raw_spin_lock_irqsave(&cache_lock, flags);
> >     if (event->hw.cqm_rmid == rr.rmid)
> > @@ -977,6 +1278,39 @@ out:
> >     return __perf_event_count(event);
> >  }
> 
> >  static void intel_cqm_event_start(struct perf_event *event, int mode)
> >  {
> >     struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> > @@ -995,19 +1329,48 @@ static void intel_cqm_event_start(struct
> perf_event *event, int mode)
> >     }
> >
> >     state->rmid = rmid;
> > +   if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
> > +           struct cqm_rmid_entry *entry;
> > +
> > +           entry = __rmid_entry(rmid);
> > +           entry->config = true;
> > +   }
> >     wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
> > +
> > +   if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > +        (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))  &&
> (cqm_mbm)) {
> > +           int cpu = get_cpu();
> 
> pmu->start() is called with interrupts disabled. So why do you need
> get_cpu() here?
> 
> > +           struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> > +
> > +           if (pmu) {
> > +                   if (pmu->n_active == 0)
> > +                           mbm_hrtimer_init(pmu);
> 
> That pmu timer wants to be initialized when the pmu is initialized.
> 
> > +                   pmu->n_active++;
> > +                   list_add_tail(&event->active_entry,
> > +                                 &pmu->active_list);
> > +                   if (pmu->n_active == 1)
> > +                           mbm_start_hrtimer(pmu);
> > +           }
> > +           put_cpu();
> > +   }
> >  }
> >
> >  static void intel_cqm_event_stop(struct perf_event *event, int mode)
> >  {
> >     struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> > +   struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
> >
> >     if (event->hw.cqm_state & PERF_HES_STOPPED)
> >             return;
> >
> >     event->hw.cqm_state |= PERF_HES_STOPPED;
> >
> > -   intel_cqm_event_read(event);
> > +   if (event->attr.config & QOS_L3_OCCUP_EVENT_ID)
> > +           intel_cqm_event_read(event);
> > +
> > +   if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > +       (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> > +           intel_cqm_event_update(event);
> >
> >     if (!--state->rmid_usecnt) {
> >             state->rmid = 0;
> > @@ -1015,8 +1378,18 @@ static void intel_cqm_event_stop(struct
> perf_event *event, int mode)
> >     } else {
> >             WARN_ON_ONCE(!state->rmid);
> >     }
> > +
> > +   if (pmu) {
> > +           WARN_ON_ONCE(pmu->n_active <= 0);
> > +           pmu->n_active--;
> > +           if (pmu->n_active == 0)
> > +                   mbm_stop_hrtimer(pmu);
> > +           list_del(&event->active_entry);
> > +   }
> > +
> >  }
> >
> > +
> 
> Sigh
> 
> >  static int intel_cqm_event_add(struct perf_event *event, int mode)
> >  {
> 
> > +static ssize_t
> > +sliding_window_size_store(struct device *dev,
> > +                     struct device_attribute *attr,
> > +                     const char *buf, size_t count)
> > +{
> > +   unsigned int bytes;
> > +   int ret;
> > +
> > +   ret = kstrtouint(buf, 0, &bytes);
> > +   if (ret)
> > +           return ret;
> > +
> > +   mutex_lock(&cache_mutex);
> > +   if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> > +           mbm_window_size = bytes;
> > +   else
> > +           bytes = MBM_FIFO_SIZE_MIN;
> 
> What kind of twisted logic is that?
> 
> > +
> > +   mutex_unlock(&cache_mutex);
> > +
> > +   return count;
> > +}
> > +
> >  static DEVICE_ATTR_RW(max_recycle_threshold);
> > +static DEVICE_ATTR_RW(sliding_window_size);
> >
> >  static struct attribute *intel_cqm_attrs[] = {
> >     &dev_attr_max_recycle_threshold.attr,
> > +   &dev_attr_sliding_window_size.attr,
> >     NULL,
> >  };
> >
> > @@ -1241,16 +1699,17 @@ static inline void cqm_pick_event_reader(int cpu)
> >
> >     for_each_cpu(i, &cqm_cpumask) {
> >             if (phys_id == topology_physical_package_id(i))
> > -                   return; /* already got reader for this socket */
> > +                   return; /* already got reader for this socket */
> 
> More random crap.
> 
> >     }
> >
> >     cpumask_set_cpu(cpu, &cqm_cpumask);
> >  }
> >
> > -static void intel_cqm_cpu_prepare(unsigned int cpu)
> > +static int intel_cqm_cpu_prepare(unsigned int cpu)
> >  {
> >     struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
> >     struct cpuinfo_x86 *c = &cpu_data(cpu);
> > +   struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> >
> >     state->rmid = 0;
> >     state->closid = 0;
> > @@ -1258,12 +1717,27 @@ static void intel_cqm_cpu_prepare(unsigned int
> cpu)
> >
> >     WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
> >     WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
> > +
> > +   if ((!pmu) && (cqm_mbm)) {
> > +           pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL,
> NUMA_NO_NODE);
> > +           if (!pmu)
> > +                   return  -ENOMEM;
> > +           spin_lock_init(&pmu->lock);
> > +           INIT_LIST_HEAD(&pmu->active_list);
> > +           pmu->pmu = &intel_cqm_pmu;
> > +           pmu->n_active = 0;
> > +           pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> > +           per_cpu(mbm_pmu, cpu) = pmu;
> > +           per_cpu(mbm_pmu_to_free, cpu) = NULL;
> 
> So that mbm_pmu_to_free per cpu variable is NULL already and never
> becomes anything else than NULL. What's the purpose of setting it NULL
> again? And what's the purpose of it in general?
> 
> > +   }
> > +   return 0;
> >  }
> >
> >  static void intel_cqm_cpu_exit(unsigned int cpu)
> >  {
> >     int phys_id = topology_physical_package_id(cpu);
> >     int i;
> > +   struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> >
> >     /*
> >      * Is @cpu a designated cqm reader?
> > @@ -1280,6 +1754,13 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
> >                     break;
> >             }
> >     }
> > +
> > +   /* cancel overflow polling timer for CPU */
> > +   if (pmu)
> > +           mbm_stop_hrtimer(pmu);
> > +   kfree(mbm_local);
> > +   kfree(mbm_total);
> 
> So this frees the module global storage if one cpu is unplugged and
> leaves the pointer initialized so the rest of the code can happily
> dereference it.
> 
> > +
> >  }
> >
> >  static int intel_cqm_cpu_notifier(struct notifier_block *nb,
> > @@ -1289,7 +1770,7 @@ static int intel_cqm_cpu_notifier(struct
> notifier_block *nb,
> >
> >     switch (action & ~CPU_TASKS_FROZEN) {
> >     case CPU_UP_PREPARE:
> > -           intel_cqm_cpu_prepare(cpu);
> > +           return intel_cqm_cpu_prepare(cpu);
> >             break;
> >     case CPU_DOWN_PREPARE:
> >             intel_cqm_cpu_exit(cpu);
> > @@ -1305,17 +1786,74 @@ static int intel_cqm_cpu_notifier(struct
> notifier_block *nb,
> >  static const struct x86_cpu_id intel_cqm_match[] = {
> >     { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_OCCUP_LLC },
> >     {}
> > +   }, intel_mbm_match[] = {
> 
> What's wrong with having a separate
> 
> static const struct x86_cpu_id intel_mbm_match[] = {
> 
> > +   { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_MBM_LOCAL },
> > +   {}
> >  };
> 
> That again would make the change obvious, but that's not on your agenda.
> 
> >
> >  static int __init intel_cqm_init(void)
> >  {
> >     char *str, scale[20];
> > -   int i, cpu, ret;
> > +   int i = 0, cpu, ret;
> 
> Initializing i is required because?
> 
> > -   if (!x86_match_cpu(intel_cqm_match))
> > +   if (!x86_match_cpu(intel_cqm_match) &&
> > +      (!x86_match_cpu(intel_mbm_match)))
> >             return -ENODEV;
> >
> >     cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> > +   cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> > +
> > +   if (x86_match_cpu(intel_cqm_match)) {
> > +           cqm_llc_occ = true;
> > +           intel_cqm_events_group.attrs = intel_cmt_events_attr;
> > +   } else
> > +           cqm_llc_occ = false;
> 
> I see you do not trust the compiler to put cqm_llc_occ into the BSS
> section and neither the kernel to zero it out proper.
> 
> > +
> > +   if (x86_match_cpu(intel_mbm_match)) {
> > +           u32 mbm_scale_rounded = 0;
> > +
> > +           cqm_mbm = true;
> > +           cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> > +           /*
> > +            * MBM counter values are  in bytes. To conver this to MB/sec,
> > +            * we  scale the MBM scale factor by 1000. Another 1000 factor
> > +            * scaling down is done
> > +            * after reading the counter val i.e. in the function
> > +            * __rmid_read_mbm()
> 
> So the event attribute string is in KB/sec and at some other place you
> convert it to MB/sec? There seems to be some hidden logic for this,
> but that's definitely not decodeable for me.
> 
> > +            */
> > +           mbm_scale_rounded =  (cqm_l3_scale + 500) / 1000;
> 
> div_round_up
> 
> > +           cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> > +           snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
> > +           str = kstrdup(scale, GFP_KERNEL);
> > +           if (!str) {
> > +                   ret = -ENOMEM;
> > +                   goto out;
> > +           }
> > +
> > +           if (cqm_llc_occ)
> > +                   intel_cqm_events_group.attrs =
> > +                           intel_cmt_mbm_events_attr;
> 
> And this line break is required because?
> 
> > +           else
> > +                   intel_cqm_events_group.attrs =
> intel_mbm_events_attr;
> > +
> > +           event_attr_intel_cqm_llc_local_bw_scale.event_str
> > +                 = event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
> 
> You really love to write unreadable crap. This is neither perl nor the
> obfuscated c-code contest. What's wrong with two separate lines which
> assign 'str' to each of the event attributes?
> 
> > +           mbm_local = kzalloc_node(sizeof(struct sample) *
> > +                           (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
> 
> Calculating the size in a separate line with a proper variable would
> make it readable.
> 
> > +                                    GFP_KERNEL, NUMA_NO_NODE);
> > +           if (!mbm_local) {
> 
> Leaks str
> 
> > +                   ret = -ENOMEM;
> > +                   goto out;
> > +           }
> > +           mbm_total = kzalloc_node(sizeof(struct sample) *
> > +                           (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
> > +                                    GFP_KERNEL, NUMA_NO_NODE);
> > +           if (!mbm_total) {
> > +                   ret = -ENOMEM;
> 
> Leaks str and mbm_local
> 
> > +                   goto out;
> > +           }
> 
> What about sticking that mess into a separate function?
> 
> > +   } else
> > +           cqm_mbm = false;
> 
> Sigh.
> 
> >     /*
> >      * It's possible that not all resources support the same number
> > @@ -1328,44 +1866,48 @@ static int __init intel_cqm_init(void)
> >      */
> >     cpu_notifier_register_begin();
> >
> > -   for_each_online_cpu(cpu) {
> > -           struct cpuinfo_x86 *c = &cpu_data(cpu);
> > +   if (cqm_llc_occ) {
> > +           for_each_online_cpu(cpu) {
> > +                   struct cpuinfo_x86 *c = &cpu_data(cpu);
> >
> > -           if (c->x86_cache_max_rmid < cqm_max_rmid)
> > -                   cqm_max_rmid = c->x86_cache_max_rmid;
> > +                   if (c->x86_cache_max_rmid < cqm_max_rmid)
> > +                           cqm_max_rmid = c->x86_cache_max_rmid;
> >
> > -           if (c->x86_cache_occ_scale != cqm_l3_scale) {
> > -                   pr_err("Multiple LLC scale values, disabling\n");
> > -                   ret = -EINVAL;
> > -                   goto out;
> > +                   if (c->x86_cache_occ_scale != cqm_l3_scale) {
> > +                           pr_err("Multiple LLC scale values, 
> > disabling\n");
> > +                           ret = -EINVAL;
> 
> More memory leaks
> 
> > +                           goto out;
> > +                   }
> >             }
> > -   }
> >
> > -   /*
> > -    * A reasonable upper limit on the max threshold is the number
> > -    * of lines tagged per RMID if all RMIDs have the same number of
> > -    * lines tagged in the LLC.
> > -    *
> > -    * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> > -    */
> > -   __intel_cqm_max_threshold =
> > +           /*
> > +            * A reasonable upper limit on the max threshold is the number
> > +            * of lines tagged per RMID if all RMIDs have the same number
> of
> > +            * lines tagged in the LLC.
> > +            *
> > +            * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> > +            */
> > +           __intel_cqm_max_threshold =
> >             boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
> >
> > -   snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> > -   str = kstrdup(scale, GFP_KERNEL);
> > -   if (!str) {
> > -           ret = -ENOMEM;
> > -           goto out;
> > -   }
> > +           snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> > +           str = kstrdup(scale, GFP_KERNEL);
> > +           if (!str) {
> > +                   ret = -ENOMEM;
> 
> Ditto.
> 
> > +                   goto out;
> > +           }
> >
> > -   event_attr_intel_cqm_llc_scale.event_str = str;
> > +           event_attr_intel_cqm_llc_scale.event_str = str;
> > +   }
> >
> >     ret = intel_cqm_setup_rmid_cache();
> >     if (ret)
> >             goto out;
> >
> >     for_each_online_cpu(i) {
> > -           intel_cqm_cpu_prepare(i);
> > +           ret = intel_cqm_cpu_prepare(i);
> > +           if (ret)
> > +                   goto out;
> 
> And that leaves even more half initialized stuff around.
> 
> >             cqm_pick_event_reader(i);
> >     }
> 
> I fear your assessment in
> 
> 
> http://lkml.kernel.org/r/06033C969873E840BDE9FC9B584F66B58BB89D@ORS
> MSX116.amr.corp.intel.com
> 
> is slightly wrong.
> 
> That's a completely convoluted mess. Aside of that its broken all over
> the place.
> 
> This wants to be split up into preparatory patches which remodel the
> existing code and move out cqm stuff into helper functions, so the mbm
> stuff can be plugged in with its own helpers nicely.
> 
> Thanks,
> 
>       tglx

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU

Reply via email to