On Thu, Aug 16, 2018 at 01:16:08PM -0700, Reinette Chatre wrote:
> +	l2_miss_event = perf_event_create_kernel_counter(&perf_miss_attr,
> +						      plr->cpu,
> +						      NULL, NULL, NULL);
> +	if (IS_ERR(l2_miss_event))
> +		goto out;
> +
> +	l2_hit_event = perf_event_create_kernel_counter(&perf_hit_attr,
> +						     plr->cpu,
> +						     NULL, NULL, NULL);
> +	if (IS_ERR(l2_hit_event))
> +		goto out_l2_miss;
> +
> +	local_irq_disable();
> +	/*
> +	 * Check any possible error state of events used by performing
> +	 * one local read.
> +	 */
> +	if (perf_event_read_local(l2_miss_event, &tmp, NULL, NULL)) {
> +		local_irq_enable();
> +		goto out_l2_hit;
> +	}
> +	if (perf_event_read_local(l2_hit_event, &tmp, NULL, NULL)) {
> +		local_irq_enable();
> +		goto out_l2_hit;
> +	}
> +
> +	/*
> +	 * Disable hardware prefetchers.
> 	 *
> +	 * Call wrmsr direcly to avoid the local register variables from
> +	 * being overwritten due to reordering of their assignment with
> +	 * the wrmsr calls.
> +	 */
> +	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
> +
> +	/* Initialize rest of local variables */
> +	/*
> +	 * Performance event has been validated right before this with
> +	 * interrupts disabled - it is thus safe to read the counter index.
> +	 */
> +	l2_miss_pmcnum = x86_perf_rdpmc_ctr_get(l2_miss_event);
> +	l2_hit_pmcnum = x86_perf_rdpmc_ctr_get(l2_hit_event);
> +	line_size = plr->line_size;
> +	mem_r = plr->kmem;
> +	size = plr->size;
> +
> +	/*
> +	 * Read counter variables twice - first to load the instructions
> +	 * used in L1 cache, second to capture accurate value that does not
> +	 * include cache misses incurred because of instruction loads.
> +	 */
> +	rdpmcl(l2_hit_pmcnum, l2_hits_before);
> +	rdpmcl(l2_miss_pmcnum, l2_miss_before);
> +	/*
> +	 * From SDM: Performing back-to-back fast reads are not guaranteed
> +	 * to be monotonic. To guarantee monotonicity on back-toback reads,
> +	 * a serializing instruction must be placed between the two
> +	 * RDPMC instructions
> +	 */
> +	rmb();
> +	rdpmcl(l2_hit_pmcnum, l2_hits_before);
> +	rdpmcl(l2_miss_pmcnum, l2_miss_before);
> +	/*
> +	 * rdpmc is not a serializing instruction. Add barrier to prevent
> +	 * instructions that follow to begin executing before reading the
> +	 * counter value.
> +	 */
> +	rmb();
> +	for (i = 0; i < size; i += line_size) {
> +		/*
> +		 * Add a barrier to prevent speculative execution of this
> +		 * loop reading beyond the end of the buffer.
> +		 */
> +		rmb();
> +		asm volatile("mov (%0,%1,1), %%eax\n\t"
> +			     :
> +			     : "r" (mem_r), "r" (i)
> +			     : "%eax", "memory");
> +	}
> +	rdpmcl(l2_hit_pmcnum, l2_hits_after);
> +	rdpmcl(l2_miss_pmcnum, l2_miss_after);
> +	/*
> +	 * rdpmc is not a serializing instruction. Add barrier to ensure
> +	 * events measured have completed and prevent instructions that
> +	 * follow to begin executing before reading the counter value.
> +	 */
> +	rmb();
> +	/* Re-enable hardware prefetchers */
> +	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
> +	local_irq_enable();
> +	trace_pseudo_lock_l2(l2_hits_after - l2_hits_before,
> +			     l2_miss_after - l2_miss_before);
> +
> +out_l2_hit:
> +	perf_event_release_kernel(l2_hit_event);
> +out_l2_miss:
> +	perf_event_release_kernel(l2_miss_event);
> +out:
> +	plr->thread_done = 1;
> +	wake_up_interruptible(&plr->lock_thread_wq);
> +	return 0;
> +}
> +
The above looks a _LOT_ like the below. And while C does suck a little,
I'm sure there's something we can do about this.

> +	l3_miss_event = perf_event_create_kernel_counter(&perf_miss_attr,
> +						      plr->cpu,
> +						      NULL, NULL,
> +						      NULL);
> +	if (IS_ERR(l3_miss_event))
> +		goto out;
> +
> +	l3_hit_event = perf_event_create_kernel_counter(&perf_hit_attr,
> +						     plr->cpu,
> +						     NULL, NULL,
> +						     NULL);
> +	if (IS_ERR(l3_hit_event))
> +		goto out_l3_miss;
> +
> 	local_irq_disable();
> 	/*
> +	 * Check any possible error state of events used by performing
> +	 * one local read.
> +	 */
> +	if (perf_event_read_local(l3_miss_event, &tmp, NULL, NULL)) {
> +		local_irq_enable();
> +		goto out_l3_hit;
> +	}
> +	if (perf_event_read_local(l3_hit_event, &tmp, NULL, NULL)) {
> +		local_irq_enable();
> +		goto out_l3_hit;
> +	}
> +
> +	/*
> +	 * Disable hardware prefetchers.
> +	 *
> 	 * Call wrmsr direcly to avoid the local register variables from
> 	 * being overwritten due to reordering of their assignment with
> 	 * the wrmsr calls.
> 	 */
> 	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
> +
> +	/* Initialize rest of local variables */
> +	/*
> +	 * Performance event has been validated right before this with
> +	 * interrupts disabled - it is thus safe to read the counter index.
> +	 */
> +	l3_hit_pmcnum = x86_perf_rdpmc_ctr_get(l3_hit_event);
> +	l3_miss_pmcnum = x86_perf_rdpmc_ctr_get(l3_miss_event);
> +	line_size = plr->line_size;
> 	mem_r = plr->kmem;
> 	size = plr->size;
> +
> +	/*
> +	 * Read counter variables twice - first to load the instructions
> +	 * used in L1 cache, second to capture accurate value that does not
> +	 * include cache misses incurred because of instruction loads.
> +	 */
> +	rdpmcl(l3_hit_pmcnum, l3_hits_before);
> +	rdpmcl(l3_miss_pmcnum, l3_miss_before);
> +	/*
> +	 * From SDM: Performing back-to-back fast reads are not guaranteed
> +	 * to be monotonic. To guarantee monotonicity on back-toback reads,
> +	 * a serializing instruction must be placed between the two
> +	 * RDPMC instructions
> +	 */
> +	rmb();
> +	rdpmcl(l3_hit_pmcnum, l3_hits_before);
> +	rdpmcl(l3_miss_pmcnum, l3_miss_before);
> +	/*
> +	 * rdpmc is not a serializing instruction. Add barrier to prevent
> +	 * instructions that follow to begin executing before reading the
> +	 * counter value.
> +	 */
> +	rmb();
> 	for (i = 0; i < size; i += line_size) {
> +		/*
> +		 * Add a barrier to prevent speculative execution of this
> +		 * loop reading beyond the end of the buffer.
> +		 */
> +		rmb();
> 		asm volatile("mov (%0,%1,1), %%eax\n\t"
> 			     :
> 			     : "r" (mem_r), "r" (i)
> 			     : "%eax", "memory");
> 	}
> +	rdpmcl(l3_hit_pmcnum, l3_hits_after);
> +	rdpmcl(l3_miss_pmcnum, l3_miss_after);
> 	/*
> +	 * rdpmc is not a serializing instruction. Add barrier to ensure
> +	 * events measured have completed and prevent instructions that
> +	 * follow to begin executing before reading the counter value.
> 	 */
> +	rmb();
> +	/* Re-enable hardware prefetchers */
> 	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
> 	local_irq_enable();
> +	l3_miss_after -= l3_miss_before;
> +	if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
> +		/*
> +		 * On BDW references and misses are counted, need to adjust.
> +		 * Sometimes the "hits" counter is a bit more than the
> +		 * references, for example, x references but x + 1 hits.
> +		 * To not report invalid hit values in this case we treat
> +		 * that as misses equal to references.
> +		 */
> +		/* First compute the number of cache references measured */
> +		l3_hits_after -= l3_hits_before;
> +		/* Next convert references to cache hits */
> +		l3_hits_after -= l3_miss_after > l3_hits_after ?
> +				 l3_hits_after : l3_miss_after;
> +	} else {
> +		l3_hits_after -= l3_hits_before;
> 	}
> +	trace_pseudo_lock_l3(l3_hits_after, l3_miss_after);
>
> +out_l3_hit:
> +	perf_event_release_kernel(l3_hit_event);
> +out_l3_miss:
> +	perf_event_release_kernel(l3_miss_event);
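
Concretely, one way to kill the duplication (a rough sketch only, not a
tested patch): move the whole create-events / disable-prefetch / measure /
release sequence into a single helper that takes the two perf_event_attr
pointers and hands the raw before/after counts back to the caller. The
measure_residency_fn() name, the residency_counts struct and the exact
signature below are invented for illustration; the plr fields,
x86_perf_rdpmc_ctr_get(), prefetch_disable_bits and the asm loop are taken
as-is from the quoted patch, and struct pseudo_lock_region is assumed to be
plr's type.

/* Hypothetical container for the raw measurement results. */
struct residency_counts {
	u64 miss_before, hits_before;
	u64 miss_after, hits_after;
};

static int measure_residency_fn(struct perf_event_attr *miss_attr,
				struct perf_event_attr *hit_attr,
				struct pseudo_lock_region *plr,
				struct residency_counts *counts)
{
	u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
	struct perf_event *miss_event, *hit_event;
	int hit_pmcnum, miss_pmcnum;
	unsigned int line_size, size;
	unsigned long i;
	void *mem_r;
	u64 tmp;

	miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
						      NULL, NULL, NULL);
	if (IS_ERR(miss_event))
		goto out;

	hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
						     NULL, NULL, NULL);
	if (IS_ERR(hit_event))
		goto out_miss;

	local_irq_disable();
	/* Check for any error state of the events with one local read each. */
	if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}
	if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}

	/* Disable hardware prefetchers; __wrmsr() for the same reason as above. */
	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);

	/* Events were just validated with interrupts disabled. */
	hit_pmcnum = x86_perf_rdpmc_ctr_get(hit_event);
	miss_pmcnum = x86_perf_rdpmc_ctr_get(miss_event);
	line_size = plr->line_size;
	mem_r = plr->kmem;
	size = plr->size;

	/* Read twice: the first read pulls this code into L1, the second counts. */
	rdpmcl(hit_pmcnum, hits_before);
	rdpmcl(miss_pmcnum, miss_before);
	rmb();
	rdpmcl(hit_pmcnum, hits_before);
	rdpmcl(miss_pmcnum, miss_before);
	rmb();
	for (i = 0; i < size; i += line_size) {
		/* Barrier against this loop speculating past the buffer. */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	rdpmcl(hit_pmcnum, hits_after);
	rdpmcl(miss_pmcnum, miss_after);
	rmb();
	/* Re-enable hardware prefetchers */
	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
	local_irq_enable();

	counts->miss_before = miss_before;
	counts->hits_before = hits_before;
	counts->miss_after  = miss_after;
	counts->hits_after  = hits_after;

out_hit:
	perf_event_release_kernel(hit_event);
out_miss:
	perf_event_release_kernel(miss_event);
out:
	return 0;
}

The L2 and L3 wrappers would then shrink to: pass their own
perf_{miss,hit}_attr pair, apply the Broadwell reference-to-hit fixup (L3
only) on the returned counts, call their tracepoint, and finally set
plr->thread_done and wake plr->lock_thread_wq as before. The wrappers would
want to zero the counts struct up front since the helper leaves it untouched
on the early error paths.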