Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-31 Thread Vince Weaver
On Tue, 30 May 2017, Peter Zijlstra wrote:

> On Wed, May 24, 2017 at 12:01:50PM -0400, Vince Weaver wrote:
> > I already have people really grumpy that you have to have one mmap() page 
> > per event, meaning you sacrifice one TLB entry for each event you are 
> > measuring.
> 
> So there is space in that page. We could maybe look at having an array
> of stuff (max 32 entries?) which would cover a whole event group.
> 
> Then you only need to mmap() the page for the leading event.

Yes, I think that was the interface they were hoping for.
I've been meaning to run some tests and see if there is a noticeable hit 
with TLB misses but haven't had the chance.

> Something like the below for the uapi changes I suppose. I've not
> tried to actually implement it yet, but would something like that be
> usable?

It looks usable from a quick glance.  Often the problems only turn up once 
you try to implement it.

Vince




> 
> 
> ---
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index b1c0b187acfe..40ff77e52b9d 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -345,7 +345,8 @@ struct perf_event_attr {
> 				context_switch :  1, /* context switch data */
> 				write_backward :  1, /* Write ring buffer from end to beginning */
> 				namespaces     :  1, /* include namespaces data */
> -				__reserved_1   : 35;
> +				group_pmc      :  1, /* use group_pmc in perf_event_mmap_page */
> +				__reserved_1   : 34;
> 
> 	union {
> 		__u32	wakeup_events;	  /* wakeup every n events */
> @@ -469,7 +470,8 @@ struct perf_event_mmap_page {
> 				cap_user_rdpmc		: 1, /* The RDPMC instruction can be used to read counts */
> 				cap_user_time		: 1, /* The time_* fields are used */
> 				cap_user_time_zero	: 1, /* The time_zero field is used */
> -				cap_res			: 59;
> +				cap_group_pmc		: 1, /* The group_pmc field is used, ignore @index and @offset */
> +				cap_res			: 58;
> 		};
> 	};
> 
> @@ -530,11 +532,29 @@ struct perf_event_mmap_page {
> 	__u64	time_zero;
> 	__u32	size;		/* Header size up to __reserved[] fields. */
> 
> +	__u32	__reserved_4byte_hole;
> +
> +	/*
> +	 * If cap_group_pmc this array replaces @index and @offset. The array
> +	 * will contain an entry for each group member lead by the event belonging
> +	 * to this mmap().
> +	 *
> +	 * The @id field can be used to identify which sibling event the respective
> +	 * @index and @offset values belong to. Assuming an immutable group, the
> +	 * array index will stay constant for each event.
> +	 */
> +	struct {
> +		__u32	index;
> +		__u32	__reserved_hole;
> +		__s64	offset;
> +		__u64	id; /* event id */
> +	} group_pmc[32];
> +
> 	/*
> 	 * Hole for extension of the self monitor capabilities
> 	 */
> 
> -	__u8	__reserved[118*8+4];	/* align to 1k. */
> +	__u8	__reserved[22*8];	/* align to 1k. */
> 
> 	/*
> 	 * Control data for the mmap() data buffer.
> 


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Andi Kleen
> > BTW there's an alternative solution in cycling the NMI watchdog over
> > all available CPUs. Then it would eventually cover all. But that's
> > less real time friendly than relying on RCU.
> 
> I don't think we need to worry too much about the watchdog being rt
> friendly. Robustness is the thing that worries me most.

Ok. Then just cycling is the most robust method, and it's very simple.

You're right. Perhaps it's better than relying on the RCU machinery, which
seems to become ever more complex.

People who care extremely about latencies can just turn it off.

The main problem with the proposal is that it depends on the BIOS not
locking the TCO watchdog. On the systems where it is locked we would
either need to continue using the PMU for the watchdog, or find some other
watchdog source that can be programmed to trigger an NMI.

-Andi


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Peter Zijlstra
On Tue, May 30, 2017 at 10:51:51AM -0700, Andi Kleen wrote:
> On Tue, May 30, 2017 at 07:40:14PM +0200, Peter Zijlstra wrote:
> > On Tue, May 30, 2017 at 10:22:08AM -0700, Andi Kleen wrote:
> > > > > You would only need a single one per system however, not one per CPU.
> > > > > RCU already tracks all the CPUs, all we need is a single NMI watchdog
> > > > > that makes sure RCU itself does not get stuck.
> > > > > 
> > > > > So we just have to find a single watchdog somewhere that can trigger
> > > > > NMI.
> > > > 
> > > > But then you have to IPI broadcast the NMI, which is less than ideal.
> > > 
> > > Only when the watchdog times out to print the backtraces.
> > 
> > The current NMI watchdog has a per-cpu state. So that means either doing
> > for_all_cpu() loops or IPI broadcasts from the NMI tickle. Neither is
> > something you really want.
> 
> The normal case is that the RCU stall only prints the backtrace for
> the CPU that stalled.
> 
> The extra NMI watchdog should only kick in when RCU is broken too,
> or the CPU that owns the stall detection stalled too, which should be rare.

Well, if we can drive the RCU watchdog from NMI (using RTC/HPET or
whatever) it might be good enough if we can convince ourselves there's
no holes in it otherwise.

The obvious hole being locked up while RCU isn't considering the CPU
'interesting'.

> In this case it's reasonable to print backtraces for all, like sysrq would do.
> In theory we could try to figure out which CPU currently owns stall
> detection, but it's probably safer to do it for all.
> 
> BTW there's an alternative solution in cycling the NMI watchdog over
> all available CPUs. Then it would eventually cover all. But that's
> less real time friendly than relying on RCU.

I don't think we need to worry too much about the watchdog being rt
friendly. Robustness is the thing that worries me most.

> > > > RCU doesn't have that problem because the quiescent state is a global
> > > > thing. CPU progress, which is what the NMI watchdog tests, is very much
> > > > per logical CPU though.
> > > 
> > > RCU already has a CPU stall detector. It should work (and usually
> > > triggers before the NMI watchdog in my experience unless the
> > > whole system is dead)
> > 
> > It only goes to look at CPU state once it detects the global QS is stalled,
> > I think. But I've not had much luck with the RCU one -- although I think
> > it's been improved since I last had a hard problem.
> 
> I've seen it trigger.

Oh, I've seen it trigger plenty... just not when I needed it and/or it
didn't contain useful bits.


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Andi Kleen
On Tue, May 30, 2017 at 07:40:14PM +0200, Peter Zijlstra wrote:
> On Tue, May 30, 2017 at 10:22:08AM -0700, Andi Kleen wrote:
> > > > You would only need a single one per system however, not one per CPU.
> > > > RCU already tracks all the CPUs, all we need is a single NMI watchdog
> > > > that makes sure RCU itself does not get stuck.
> > > > 
> > > > So we just have to find a single watchdog somewhere that can trigger
> > > > NMI.
> > > 
> > > But then you have to IPI broadcast the NMI, which is less than ideal.
> > 
> > Only when the watchdog times out to print the backtraces.
> 
> The current NMI watchdog has a per-cpu state. So that means either doing
> for_all_cpu() loops or IPI broadcasts from the NMI tickle. Neither is
> something you really want.

The normal case is that the RCU stall only prints the backtrace for
the CPU that stalled.

The extra NMI watchdog should only kick in when RCU is broken too,
or the CPU that owns the stall detection stalled too, which should be rare.

In this case it's reasonable to print backtraces for all, like sysrq would do.
In theory we could try to figure out which CPU currently owns stall
detection, but it's probably safer to do it for all.

BTW there's an alternative solution in cycling the NMI watchdog over
all available CPUs. Then it would eventually cover all. But that's
less real time friendly than relying on RCU.

> > > RCU doesn't have that problem because the quiescent state is a global
> > > thing. CPU progress, which is what the NMI watchdog tests, is very much
> > > per logical CPU though.
> > 
> > RCU already has a CPU stall detector. It should work (and usually
> > triggers before the NMI watchdog in my experience unless the
> > whole system is dead)
> 
> It only goes look at CPU state once it detects the global QS is stalled
> I think. But I've not had much luck with the RCU one -- although I think
> its been improved since I last had a hard problem.

I've seen it trigger.

-Andi


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Peter Zijlstra
On Tue, May 30, 2017 at 10:22:08AM -0700, Andi Kleen wrote:
> > > You would only need a single one per system however, not one per CPU.
> > > RCU already tracks all the CPUs, all we need is a single NMI watchdog
> > > that makes sure RCU itself does not get stuck.
> > > 
> > > So we just have to find a single watchdog somewhere that can trigger
> > > NMI.
> > 
> > But then you have to IPI broadcast the NMI, which is less than ideal.
> 
> Only when the watchdog times out to print the backtraces.

The current NMI watchdog has a per-cpu state. So that means either doing
for_all_cpu() loops or IPI broadcasts from the NMI tickle. Neither is
something you really want.

> > RCU doesn't have that problem because the quiescent state is a global
> > thing. CPU progress, which is what the NMI watchdog tests, is very much
> > per logical CPU though.
> 
> RCU already has a CPU stall detector. It should work (and usually
> triggers before the NMI watchdog in my experience unless the
> whole system is dead)

It only goes to look at CPU state once it detects the global QS is stalled,
I think. But I've not had much luck with the RCU one -- although I think
it's been improved since I last had a hard problem.



Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Peter Zijlstra
On Wed, May 24, 2017 at 12:01:50PM -0400, Vince Weaver wrote:
> I already have people really grumpy that you have to have one mmap() page 
> per event, meaning you sacrifice one TLB entry for each event you are 
> measuring.

So there is space in that page. We could maybe look at having an array
of stuff (max 32 entries?) which would cover a whole event group.

Then you only need to mmap() the page for the leading event.

Looking at the layout it would be slightly awkward to do (or we need to
ref the layout, which will undoubtedly be painful too). But for groups
the time fields at least are all shared.

At the very least we need index and offset, ideally pmc_width would be
the same for all counters (can we assume that?).

Something like the below for the uapi changes I suppose. I've not
tried to actually implement it yet, but would something like that be
usable?


---
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..40ff77e52b9d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -345,7 +345,8 @@ struct perf_event_attr {
				context_switch :  1, /* context switch data */
				write_backward :  1, /* Write ring buffer from end to beginning */
				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+				group_pmc      :  1, /* use group_pmc in perf_event_mmap_page */
+				__reserved_1   : 34;

	union {
		__u32	wakeup_events;	  /* wakeup every n events */
@@ -469,7 +470,8 @@ struct perf_event_mmap_page {
				cap_user_rdpmc		: 1, /* The RDPMC instruction can be used to read counts */
				cap_user_time		: 1, /* The time_* fields are used */
				cap_user_time_zero	: 1, /* The time_zero field is used */
-				cap_res			: 59;
+				cap_group_pmc		: 1, /* The group_pmc field is used, ignore @index and @offset */
+				cap_res			: 58;
		};
	};

@@ -530,11 +532,29 @@ struct perf_event_mmap_page {
	__u64	time_zero;
	__u32	size;		/* Header size up to __reserved[] fields. */

+	__u32	__reserved_4byte_hole;
+
+	/*
+	 * If cap_group_pmc this array replaces @index and @offset. The array
+	 * will contain an entry for each group member lead by the event belonging
+	 * to this mmap().
+	 *
+	 * The @id field can be used to identify which sibling event the respective
+	 * @index and @offset values belong to. Assuming an immutable group, the
+	 * array index will stay constant for each event.
+	 */
+	struct {
+		__u32	index;
+		__u32	__reserved_hole;
+		__s64	offset;
+		__u64	id; /* event id */
+	} group_pmc[32];
+
	/*
	 * Hole for extension of the self monitor capabilities
	 */

-	__u8	__reserved[118*8+4];	/* align to 1k. */
+	__u8	__reserved[22*8];	/* align to 1k. */

	/*
	 * Control data for the mmap() data buffer.
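
To make the intended usage concrete, a rough user-space sketch against the
proposed layout could look like the following. Note that group_pmc[] is only
a proposal at this point, so the struct below simply mirrors the diff above
rather than any installed uapi header, and the usual seqlock retry against
the mmap page's @lock field is omitted for brevity.

#include <stdint.h>

/* Mirrors the proposed group_pmc[] entry; not an installed uapi struct. */
struct group_pmc_entry {
	uint32_t index;			/* HW counter index + 1, 0 = not available */
	uint32_t __reserved_hole;
	int64_t  offset;		/* added to the raw rdpmc() value */
	uint64_t id;			/* event id, e.g. from PERF_EVENT_IOC_ID */
};

static inline uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	__asm__ __volatile__("rdpmc" : "=a"(lo), "=d"(hi) : "c"(counter));
	return (uint64_t)hi << 32 | lo;
}

/* 'pmc' points at the group_pmc[] array inside the leader's mmap()ed page. */
static uint64_t read_sibling_count(const struct group_pmc_entry *pmc,
				   int nr_entries, uint64_t id)
{
	for (int i = 0; i < nr_entries; i++) {
		if (pmc[i].id == id && pmc[i].index)
			return rdpmc(pmc[i].index - 1) + pmc[i].offset;
	}
	return 0;	/* sibling not found or not currently scheduled */
}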


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Andi Kleen
> > You would only need a single one per system however, not one per CPU.
> > RCU already tracks all the CPUs, all we need is a single NMI watchdog
> > that makes sure RCU itself does not get stuck.
> > 
> > So we just have to find a single watchdog somewhere that can trigger
> > NMI.
> 
> But then you have to IPI broadcast the NMI, which is less than ideal.

Only when the watchdog times out to print the backtraces.

> 
> RCU doesn't have that problem because the quiescent state is a global
> thing. CPU progress, which is what the NMI watchdog tests, is very much
> per logical CPU though.

RCU already has a CPU stall detector. It should work (and usually
triggers before the NMI watchdog in my experience unless the
whole system is dead)

-Andi


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Thomas Gleixner
On Tue, 30 May 2017, Stephane Eranian wrote:

> On Tue, May 30, 2017 at 2:25 AM, Peter Zijlstra  wrote:
> > On Sun, May 28, 2017 at 01:31:09PM -0700, Stephane Eranian wrote:
> >> Ultimately, I would like to see the watchdog move out of the PMU. That
> >> is the only sensible solution.
> >> You just need a resource able to interrupt on NMI or you handle
> >> interrupt masking in software as has
> >> been proposed on LKML.
> >
> > So even if we do the soft masking, we still need to deal with regions
> > where the interrupts are disabled. Once an interrupt hits the soft mask
> > we still hardware mask.
> >
> What I was thinking is that you never hardware mask, software always
> catches the hw interrupts and keeps them pending or delivers them
> depending on the sw mask.

That does not work.

  1) You still need some sections where you must have hardware masking.

  2) Software cannot keep them pending in hardware other than disabling
     interrupts at the CPU level and returning to the interrupted sections
     without issuing EOI. Reenabling them at the CPU level will reinject
     them in hardware.

     If we actually want to send the EOI and keep an irq pending mask in
     software then we cannot do that quickly in the low level entry code. We
     need to go up into the generic interrupt code, take a bunch of locks
     which expect interrupts disabled and figure out what to do with the
     interrupt.

     That's required because the low level entry code has no idea how to
     deal with that interrupt, i.e. which physical interrupt controller to
     talk to. Due to our stacked irq chips we can end up with rather
     complex call chains there.

     Of course this involves locking which will deadlock if such a lock has
     been taken at some other place with interrupts just soft disabled.

Thanks,

tglx


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Stephane Eranian
On Tue, May 30, 2017 at 9:28 AM, Peter Zijlstra  wrote:
> On Tue, May 30, 2017 at 06:51:28AM -0700, Andi Kleen wrote:
>> On Tue, May 30, 2017 at 11:25:23AM +0200, Peter Zijlstra wrote:
>> > On Sun, May 28, 2017 at 01:31:09PM -0700, Stephane Eranian wrote:
>> > > Ultimately, I would like to see the watchdog move out of the PMU. That
>> > > is the only sensible solution.
>> > > You just need a resource able to interrupt on NMI or you handle
>> > > interrupt masking in software as has
>> > > been proposed on LKML.
>> >
>> > So even if we do the soft masking, we still need to deal with regions
>> > where the interrupts are disabled. Once an interrupt hits the soft mask
>> > we still hardware mask.
>> >
>> > So to get full and reliable coverage we still need an NMI source.
>>
>> You would only need a single one per system however, not one per CPU.
>> RCU already tracks all the CPUs, all we need is a single NMI watchdog
>> that makes sure RCU itself does not get stuck.
>>
>> So we just have to find a single watchdog somewhere that can trigger
>> NMI.
>
> But then you have to IPI broadcast the NMI, which is less than ideal.
>
> RCU doesn't have that problem because the quiescent state is a global
> thing. CPU progress, which is what the NMI watchdog tests, is very much
> per logical CPU though.
>
>> > I agree that it would be lovely to free up the one counter though.
>>
>> One option is to use the TCO watchdog in the chipset instead.
>> Unfortunately it's not a universal solution because some BIOSes lock
>> the TCO watchdog for their own use. But if you have a BIOS that
>> doesn't do that it should work.
>
> I suppose you could also route the HPET to the NMI vector and other
> similar things. Still, you're then stuck with IPI broadcasts, which
> suck.
>
Can the HPET interrupt (whatever vector) be broadcast to all CPUs by hw?

>> > One other approach is running the watchdog off of _any_ PMI, then all we
>> > need to ensure is that PMIs happen semi regularly. There are two cases
>> > where this becomes 'interesting':
>>
>> Seems fairly complex.
>
> Yes.. :/


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Stephane Eranian
On Tue, May 30, 2017 at 2:25 AM, Peter Zijlstra  wrote:
> On Sun, May 28, 2017 at 01:31:09PM -0700, Stephane Eranian wrote:
>> Ultimately, I would like to see the watchdog move out of the PMU. That
>> is the only sensible solution.
>> You just need a resource able to interrupt on NMI or you handle
>> interrupt masking in software as has
>> been proposed on LKML.
>
> So even if we do the soft masking, we still need to deal with regions
> where the interrupts are disabled. Once an interrupt hits the soft mask
> we still hardware mask.
>
What I was thinking is that you never hardware mask, software always
catches the hw interrupts and keeps them pending or delivers them
depending on the sw mask.

> So to get full and reliable coverage we still need an NMI source.
>
> I agree that it would be lovely to free up the one counter though.
>
>
> One other approach is running the watchdog off of _any_ PMI, then all we
> need to ensure is that PMIs happen semi regularly. There are two cases
> where this becomes 'interesting':
>
>  - we have only !sampling events; in this case we have PMIs but at the
>max period to properly account for counter overflow. This is too
>large a period. We'd have to muck with the max period of at least one
>counter.
>
>  - we have _no_ events; in this case we need to somehow schedule an
>event anyway.
>
> It might be possible to deal with both cases by fudging the state of one
> of the fixed counters. Never clear the EN bit for that counter and
> reduce the max period for that one counter.
>
>
> I think a scheme like that was mentioned before, but I'm also afraid
> that it'll turn into quite the mess if we try it. And by its very nature
> it adds complexity and therefore risks reducing the reliability of the
> thing :/


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Peter Zijlstra
On Tue, May 30, 2017 at 06:51:28AM -0700, Andi Kleen wrote:
> On Tue, May 30, 2017 at 11:25:23AM +0200, Peter Zijlstra wrote:
> > On Sun, May 28, 2017 at 01:31:09PM -0700, Stephane Eranian wrote:
> > > Ultimately, I would like to see the watchdog move out of the PMU. That
> > > is the only sensible solution.
> > > You just need a resource able to interrupt on NMI or you handle
> > > interrupt masking in software as has
> > > been proposed on LKML.
> > 
> > So even if we do the soft masking, we still need to deal with regions
> > where the interrupts are disabled. Once an interrupt hits the soft mask
> > we still hardware mask.
> > 
> > So to get full and reliable coverage we still need an NMI source.
> 
> You would only need a single one per system however, not one per CPU.
> RCU already tracks all the CPUs, all we need is a single NMI watchdog
> that makes sure RCU itself does not get stuck.
> 
> So we just have to find a single watchdog somewhere that can trigger
> NMI.

But then you have to IPI broadcast the NMI, which is less than ideal.

RCU doesn't have that problem because the quiescent state is a global
thing. CPU progress, which is what the NMI watchdog tests, is very much
per logical CPU though.

> > I agree that it would be lovely to free up the one counter though.
> 
> One option is to use the TCO watchdog in the chipset instead.
> Unfortunately it's not a universal solution because some BIOSes lock
> the TCO watchdog for their own use. But if you have a BIOS that
> doesn't do that it should work.

I suppose you could also route the HPET to the NMI vector and other
similar things. Still, you're then stuck with IPI broadcasts, which
suck.

> > One other approach is running the watchdog off of _any_ PMI, then all we
> > need to ensure is that PMIs happen semi regularly. There are two cases
> > where this becomes 'interesting':
> 
> Seems fairly complex.

Yes.. :/


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Andi Kleen
On Tue, May 30, 2017 at 11:25:23AM +0200, Peter Zijlstra wrote:
> On Sun, May 28, 2017 at 01:31:09PM -0700, Stephane Eranian wrote:
> > Ultimately, I would like to see the watchdog move out of the PMU. That
> > is the only sensible solution.
> > You just need a resource able to interrupt on NMI or you handle
> > interrupt masking in software as has
> > been proposed on LKML.
> 
> So even if we do the soft masking, we still need to deal with regions
> where the interrupts are disabled. Once an interrupt hits the soft mask
> we still hardware mask.
> 
> So to get full and reliable coverage we still need an NMI source.

You would only need a single one per system however, not one per CPU.
RCU already tracks all the CPUs, all we need is a single NMI watchdog
that makes sure RCU itself does not get stuck.

So we just have to find a single watchdog somewhere that can trigger
NMI.

> I agree that it would be lovely to free up the one counter though.

One option is to use the TCO watchdog in the chipset instead.
Unfortunately it's not a universal solution because some BIOSes lock
the TCO watchdog for their own use. But if you have a BIOS that
doesn't do that it should work.

> One other approach is running the watchdog off of _any_ PMI, then all we
> need to ensure is that PMIs happen semi regularly. There are two cases
> where this becomes 'interesting':

Seems fairly complex.

-Andi


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-30 Thread Peter Zijlstra
On Sun, May 28, 2017 at 01:31:09PM -0700, Stephane Eranian wrote:
> Ultimately, I would like to see the watchdog move out of the PMU. That
> is the only sensible solution.
> You just need a resource able to interrupt on NMI or you handle
> interrupt masking in software as has
> been proposed on LKML.

So even if we do the soft masking, we still need to deal with regions
where the interrupts are disabled. Once an interrupt hits the soft mask
we still hardware mask.

So to get full and reliable coverage we still need an NMI source.

I agree that it would be lovely to free up the one counter though.


One other approach is running the watchdog off of _any_ PMI, then all we
need to ensure is that PMIs happen semi regularly. There are two cases
where this becomes 'interesting':

 - we have only !sampling events; in this case we have PMIs but at the
   max period to properly account for counter overflow. This is too
   large a period. We'd have to muck with the max period of at least one
   counter.

 - we have _no_ events; in this case we need to somehow schedule an
   event anyway.

It might be possible to deal with both cases by fudging the state of one
of the fixed counters. Never clear the EN bit for that counter and
reduce the max period for that one counter.


I think a scheme like that was mentioned before, but I'm also afraid
that it'll turn into quite the mess if we try it. And by its very nature
it adds complexity and therefore risks reducing the reliability of the
thing :/


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-28 Thread Stephane Eranian
On Wed, May 24, 2017 at 9:01 AM, Vince Weaver  wrote:
>
> On Wed, 24 May 2017, Andi Kleen wrote:
>
> > > Right, I did not even consider the rdpmc, but yeah, you will get a count that
> > > is not relevant to the user visible event. Unless you fake it using the time
> > > scaling fields there but that's ugly.
> >
> > Could add another scaling field to the mmap page for this.
>
> The whole point of the rdpmc() implementation is to be low overhead.
> If you have to parse 10 different mmap() fields it starts to defeat the
> purpose.
>
> I already have people really grumpy that you have to have one mmap() page
> per event, meaning you sacrifice one TLB entry for each event you are
> measuring.
>
>
> If the watchdog counter is constantly running, can't you just modify
> perf_event to just grab start/stop values at context switch time and
> provide the difference to the user?  Sort of like the "always running"
> patchsets that float around? Though I guess that doesn't help much with
> sampling.
>
This goes back to my initial point of simply increasing the period to avoid
false positives and staying with the core-cycle event. It is much simpler.
And again, if you are deadlocked, it should not matter whether you detect it
after 2, 5 or 10 minutes. The accuracy is not so critical. And by choosing a
large enough period, you avoid the issue with Turbo, even if you consider
Turbo being able to double your reference frequency.

Ultimately, I would like to see the watchdog move out of the PMU. That is
the only sensible solution. You just need a resource able to interrupt on
NMI, or you handle interrupt masking in software as has been proposed on
LKML.


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-27 Thread kbuild test robot
Hi Kan,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.12-rc2 next-20170526]
[cannot apply to tip/x86/core]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/kan-liang-intel-com/perf-x86-intel-enable-CPU-ref_cycles-for-GP-counter/20170522-111500
config: i386-randconfig-x0-05280944 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   arch/x86/built-in.o: In function `nhm_ref_cycles_rep':
>> (.text+0x64ff): undefined reference to `__udivdi3'
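
For reference, an undefined `__udivdi3' on i386 means a plain C division of
64-bit operands was compiled into a libgcc helper call that the kernel does
not provide. The usual fix is div_u64() from <linux/math64.h>; a minimal
sketch (the function and variable names here are illustrative, not taken
from the actual patch):

#include <linux/math64.h>

static u64 scale_ref_cycles(u64 cycles, u32 factor)
{
	/* return cycles / factor; */	/* open-coded u64 division emits __udivdi3 on i386 */
	return div_u64(cycles, factor);	/* works on both 32- and 64-bit builds */
}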

---
0-DAY kernel test infrastructure            Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-24 Thread Andi Kleen
> The whole point of the rdpmc() implementation is to be low overhead.
> If you have to parse 10 different mmap() fields it starts to defeat the 
> purpose.

You would only use it with ref-cycles of course. So for the normal
case there is no overhead.

> If the watchdog counter is constantly running, can't you just modify 
> perf_event to just grab start/stop values at context switch time and 
> provide the difference to the user?  Sort of like the "always running" 
> patchsets that float around? Though I guess that doesn't help much with 
> sampling.

This wouldn't work with ring filters unfortunately. 

-Andi


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-24 Thread Vince Weaver
On Wed, 24 May 2017, Andi Kleen wrote:

> > Right, I did not even consider the rdpmc, but yeah, you will get a count that
> > is not relevant to the user visible event. Unless you fake it using the time
> > scaling fields there but that's ugly.
> 
> Could add another scaling field to the mmap page for this.

The whole point of the rdpmc() implementation is to be low overhead.
If you have to parse 10 different mmap() fields it starts to defeat the 
purpose.

I already have people really grumpy that you have to have one mmap() page 
per event, meaning you sacrifice one TLB entry for each event you are 
measuring.


If the watchdog counter is constantly running, can't you just modify 
perf_event to just grab start/stop values at context switch time and 
provide the difference to the user?  Sort of like the "always running" 
patchsets that float around? Though I guess that doesn't help much with 
sampling.

Vince
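
For reference, the low-overhead read being defended here is the documented
mmap()+rdpmc self-monitoring sequence, roughly as below (a sketch following
the uapi perf_event_mmap_page comments; pmc_width sign extension and time
scaling omitted, barriers reduced to compiler builtins):

#include <stdint.h>
#include <linux/perf_event.h>

static inline uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	__asm__ __volatile__("rdpmc" : "=a"(lo), "=d"(hi) : "c"(counter));
	return ((uint64_t)hi << 32) | lo;
}

/* One seqlock retry loop, one index, one offset per event. */
static uint64_t read_self_count(volatile struct perf_event_mmap_page *pc)
{
	uint64_t count;
	uint32_t seq, idx;

	do {
		seq = pc->lock;
		__sync_synchronize();
		idx = pc->index;
		count = pc->offset;
		if (pc->cap_user_rdpmc && idx)
			count += rdpmc(idx - 1);
		__sync_synchronize();
	} while (pc->lock != seq);

	return count;
}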


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-24 Thread Andi Kleen
> Right, I did not even consider the rdpmc, but yeah, you will get a count that
> is not relevant to the user visible event. Unless you fake it using the time
> scaling fields there but that's ugly.

Could add another scaling field to the mmap page for this.

-Andi



Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-23 Thread Stephane Eranian
On Mon, May 22, 2017 at 11:39 PM, Peter Zijlstra  wrote:
> On Mon, May 22, 2017 at 12:28:26PM -0700, Stephane Eranian wrote:
>> On Mon, May 22, 2017 at 12:23 PM, Peter Zijlstra  
>> wrote:
>> > On Mon, May 22, 2017 at 04:55:47PM +, Liang, Kan wrote:
>> >>
>> >>
>> >> > On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
>> >> > > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index
>> >> > > 580b60f..e8b2326 100644
>> >> > > --- a/arch/x86/events/core.c
>> >> > > +++ b/arch/x86/events/core.c
>> >> > > @@ -101,6 +101,10 @@ u64 x86_perf_event_update(struct perf_event
>> >> > *event)
>> >> > >   delta = (new_raw_count << shift) - (prev_raw_count << shift);
>> >> > >   delta >>= shift;
>> >> > >
>> >> > > + /* Correct the count number if applying ref_cycles replacement */
>> >> > > + if (!is_sampling_event(event) &&
>> >> > > + (hwc->flags & PERF_X86_EVENT_REF_CYCLES_REP))
>> >> > > + delta *= x86_pmu.ref_cycles_factor;
>> >> >
>> >> > That condition seems wrong, why only correct for !sampling events?
>> >> >
>> >>
>> >> For sampling, it's either fixed freq mode or fixed period mode.
>> >>  - In the fixed freq mode, we should do nothing, because the adaptive
>> >>frequency algorithm will handle it.
>> >>  - In the fixed period mode, we have already adjusted the period in
>> >> ref_cycles_rep().
>> >> Therefore, we should only handle !sampling events here.
>> >
>> > How so? For sampling events the actual event count should also be
>> > accurate.
>>
>> Yes, it must be. Because you can reconstruct the total number of
>> occurrences of the event by adding
>> all the periods recorded in each sample. So the period in each sample
>> must reflect user event and not
>> kernel event.
>
> Well, that, but you can equally use read() or the mmap()'ed rdpmc stuff
> on a sampling event. The fact that is also generates samples does not
> mean it should not also function as a non-sampling event.

Right, I did not even consider the rdpmc, but yeah, you will get a count that
is not relevant to the user visible event. Unless you fake it using the time
scaling fields there but that's ugly.
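
For context, the time scaling fields mentioned above apply a mult/shift pair
along the lines below (per the uapi header comments, time_offset left out);
pressing them into service to rescale a count rather than time is the ugly
part:

#include <stdint.h>

/* Documented mult/shift scaling; the quotient/remainder split keeps the
 * 64-bit intermediate from overflowing. */
static uint64_t mul_shift_scale(uint64_t cyc, uint32_t mult, uint16_t shift)
{
	uint64_t quot = cyc >> shift;
	uint64_t rem  = cyc & (((uint64_t)1 << shift) - 1);

	return quot * mult + ((rem * mult) >> shift);
}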


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-23 Thread Peter Zijlstra
On Mon, May 22, 2017 at 12:28:26PM -0700, Stephane Eranian wrote:
> On Mon, May 22, 2017 at 12:23 PM, Peter Zijlstra  wrote:
> > On Mon, May 22, 2017 at 04:55:47PM +, Liang, Kan wrote:
> >>
> >>
> >> > On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
> >> > > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index
> >> > > 580b60f..e8b2326 100644
> >> > > --- a/arch/x86/events/core.c
> >> > > +++ b/arch/x86/events/core.c
> >> > > @@ -101,6 +101,10 @@ u64 x86_perf_event_update(struct perf_event
> >> > *event)
> >> > >   delta = (new_raw_count << shift) - (prev_raw_count << shift);
> >> > >   delta >>= shift;
> >> > >
> >> > > + /* Correct the count number if applying ref_cycles replacement */
> >> > > + if (!is_sampling_event(event) &&
> >> > > + (hwc->flags & PERF_X86_EVENT_REF_CYCLES_REP))
> >> > > + delta *= x86_pmu.ref_cycles_factor;
> >> >
> >> > That condition seems wrong, why only correct for !sampling events?
> >> >
> >>
> >> For sampling, it's either fixed freq mode or fixed period mode.
> >>  - In the fixed freq mode, we should do nothing, because the adaptive
> >>frequency algorithm will handle it.
> >>  - In the fixed period mode, we have already adjusted the period in
> >> ref_cycles_rep().
> >> Therefore, we should only handle !sampling events here.
> >
> > How so? For sampling events the actual event count should also be
> > accurate.
> 
> Yes, it must be. Because you can reconstruct the total number of
> occurrences of the event by adding
> all the periods recorded in each sample. So the period in each sample
> must reflect user event and not
> kernel event.

Well, that, but you can equally use read() or the mmap()'ed rdpmc stuff
on a sampling event. The fact that is also generates samples does not
mean it should not also function as a non-sampling event.


RE: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-22 Thread Liang, Kan
> 
> On Mon, May 22, 2017 at 12:23 PM, Peter Zijlstra 
> wrote:
> > On Mon, May 22, 2017 at 04:55:47PM +, Liang, Kan wrote:
> >>
> >>
> >> > On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
> >> > > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> >> > > index
> >> > > 580b60f..e8b2326 100644
> >> > > --- a/arch/x86/events/core.c
> >> > > +++ b/arch/x86/events/core.c
> >> > > @@ -101,6 +101,10 @@ u64 x86_perf_event_update(struct
> perf_event
> >> > *event)
> >> > >   delta = (new_raw_count << shift) - (prev_raw_count << shift);
> >> > >   delta >>= shift;
> >> > >
> >> > > + /* Correct the count number if applying ref_cycles replacement
> >> > > + */ if (!is_sampling_event(event) &&
> >> > > + (hwc->flags & PERF_X86_EVENT_REF_CYCLES_REP))
> >> > > + delta *= x86_pmu.ref_cycles_factor;
> >> >
> >> > That condition seems wrong, why only correct for !sampling events?
> >> >
> >>
> >> For sampling, it's either fixed freq mode or fixed period mode.
> >>  - In the fixed freq mode, we should do nothing, because the adaptive
> >>frequency algorithm will handle it.
> >>  - In the fixed period mode, we have already adjusted the period in
> >> ref_cycles_rep().
> >> Therefore, we should only handle !sampling events here.
> >
> > How so? For sampling events the actual event count should also be
> > accurate.
> 
> Yes, it must be. Because you can reconstruct the total number of occurrences
> of the event by adding all the periods recorded in each sample. So the period
> in each sample must reflect user event and not kernel event.

Peter and Stephane, you are right.
After adjusting the period, I can only make sure the number of samples for
the bus_cycles event is the same as that for the ref-cycles event.
I still need to adjust the number of occurrences of the event accordingly,
to make it accurate.
I will change it in next version.

Thanks,
Kan


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-22 Thread Stephane Eranian
On Mon, May 22, 2017 at 12:23 PM, Peter Zijlstra  wrote:
> On Mon, May 22, 2017 at 04:55:47PM +, Liang, Kan wrote:
>>
>>
>> > On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
>> > > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index
>> > > 580b60f..e8b2326 100644
>> > > --- a/arch/x86/events/core.c
>> > > +++ b/arch/x86/events/core.c
>> > > @@ -101,6 +101,10 @@ u64 x86_perf_event_update(struct perf_event
>> > *event)
>> > >   delta = (new_raw_count << shift) - (prev_raw_count << shift);
>> > >   delta >>= shift;
>> > >
>> > > + /* Correct the count number if applying ref_cycles replacement */
>> > > + if (!is_sampling_event(event) &&
>> > > + (hwc->flags & PERF_X86_EVENT_REF_CYCLES_REP))
>> > > + delta *= x86_pmu.ref_cycles_factor;
>> >
>> > That condition seems wrong, why only correct for !sampling events?
>> >
>>
>> For sampling, it's either fixed freq mode or fixed period mode.
>>  - In the fixed freq mode, we should do nothing, because the adaptive
>>frequency algorithm will handle it.
>>  - In the fixed period mode, we have already adjusted the period in
>> ref_cycles_rep().
>> Therefore, we should only handle !sampling events here.
>
> How so? For sampling events the actual event count should also be
> accurate.

Yes, it must be. Because you can reconstruct the total number of
occurrences of the event by adding
all the periods recorded in each sample. So the period in each sample
must reflect user event and not
kernel event.
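
In other words, a consumer reconstructs the total as a plain sum of the
recorded periods, which is only correct if each recorded period is the
user-visible one (trivial sketch):

#include <stddef.h>
#include <stdint.h>

/* Sum of PERF_SAMPLE_PERIOD values taken from PERF_RECORD_SAMPLE records. */
static uint64_t total_from_periods(const uint64_t *periods, size_t n)
{
	uint64_t total = 0;

	for (size_t i = 0; i < n; i++)
		total += periods[i];
	return total;
}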


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-22 Thread Peter Zijlstra
On Mon, May 22, 2017 at 04:55:47PM +, Liang, Kan wrote:
> 
> 
> > On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
> > > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index
> > > 580b60f..e8b2326 100644
> > > --- a/arch/x86/events/core.c
> > > +++ b/arch/x86/events/core.c
> > > @@ -101,6 +101,10 @@ u64 x86_perf_event_update(struct perf_event
> > *event)
> > >   delta = (new_raw_count << shift) - (prev_raw_count << shift);
> > >   delta >>= shift;
> > >
> > > + /* Correct the count number if applying ref_cycles replacement */
> > > + if (!is_sampling_event(event) &&
> > > + (hwc->flags & PERF_X86_EVENT_REF_CYCLES_REP))
> > > + delta *= x86_pmu.ref_cycles_factor;
> > 
> > That condition seems wrong, why only correct for !sampling events?
> >
> 
> For sampling, it's either fixed freq mode or fixed period mode.
>  - In the fixed freq mode, we should do nothing, because the adaptive
>frequency algorithm will handle it.
>  - In the fixed period mode, we have already adjusted the period in 
> ref_cycles_rep().
> Therefore, we should only handle !sampling events here.

How so? For sampling events the actual event count should also be
accurate.


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-22 Thread Stephane Eranian
Hi,

On Mon, May 22, 2017 at 1:30 AM, Peter Zijlstra  wrote:
> On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
>> From: Kan Liang 
>>
>> The CPU ref_cycles can only be used by one user at the same time,
>> otherwise a "not counted" error will be displaced.
>> [kan]$ sudo perf stat -x, -e ref-cycles,ref-cycles -- sleep 1
>> 1203264,,ref-cycles,513112,100.00
>> ,,ref-cycles,0,0.00
>>
>> CPU ref_cycles can only be counted by fixed counter 2. It uses
>> pseudo-encoding. The GP counter doesn't recognize.
>>
>> BUS_CYCLES (0x013c) is another event which is not affected by core
>> frequency changes. It has a constant ratio with the CPU ref_cycles.
>> BUS_CYCLES could be used as an alternative event for ref_cycles on GP
>> counter.
>> A hook is implemented in x86_schedule_events. If the fixed counter 2 is
>> occupied and a GP counter is assigned, BUS_CYCLES is used to replace
>> ref_cycles. A new flag PERF_X86_EVENT_REF_CYCLES_REP in
>> hw_perf_event is introduced to indicate the replacement.
>> To make the switch transparent, counting and sampling are also specially
>> handled.
>>  - For counting, it multiplies the result with the constant ratio after
>>reading it.
>>  - For sampling with fixed period, the BUS_CYCLES period = ref_cycles
>>period / the constant ratio.
>>  - For sampling with fixed frequency, the adaptive frequency algorithm
>>will figure it out on its own. Do nothing.
>>
>> The constant ratio is model specific.
>> For the model after NEHALEM but before Skylake, the ratio is defined in
>> MSR_PLATFORM_INFO.
>> For the model after Skylake, it can be get from CPUID.15H.
>> For Knights Landing, Goldmont and later, the ratio is always 1.
>>
>> The old Silvermont/Airmont, Core2 and Atom machines are not covered by
>> the patch. The behavior on those machines will not change.
>
> Maybe I missed it, but *why* are we doing this?

Yes, I would like to understand the motivation for this added
complexity as well.

My guess is that you have a situation where ref-cycles is used
constantly, i.e., pinned, and therefore you
lose the ability to count it for any other user. This is the case when
you switch the hard lockup detector
(NMI watchdog) to using ref-cycles instead of core cycles. This is
what you are doing in patch 2/2 actually.
Another scenario could be with virtual machines. KVM makes all guests
events use pinned events on the host. So if the guest is measuring
ref-cycles, then the host cannot. Well, I am hoping this is not the
case because as far as I remember system-wide pinned has
higher priority than per-process pinned.

You cannot make your change transparent in sampling mode. You are
adjusting the period with the ratio. If
the user asks for the period to be recorded in each sample, the
modified period will be captured. If I say I
want to sample every 1M ref-cycles  and I set event_attr.sample_type =
PERF_SAMPLE_PERIOD, then I
expect to see 1M in each sample and not some scaled value. So you need
to address this problem, including
in frequency mode.
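
To make the expectation concrete, a fixed-period user sets up something like
the sketch below (values arbitrary) and expects the period field of every
sample record to read back exactly what was requested:

#include <string.h>
#include <linux/perf_event.h>

/* Fixed-period ref-cycles sampling that records the period per sample. */
static void setup_ref_cycles_sampling(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_REF_CPU_CYCLES;
	attr->sample_period = 1000000;		/* fixed period, not freq */
	attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_PERIOD;
}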


RE: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-22 Thread Liang, Kan


> On Mon, May 22, 2017 at 11:19:16AM +0200, Peter Zijlstra wrote:
> > On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
> > > @@ -934,6 +938,21 @@ int x86_schedule_events(struct cpu_hw_events
> > > *cpuc, int n, int *assign)
> 
> > >   for (i = 0; i < n; i++) {
> > >   e = cpuc->event_list[i];
> > >   e->hw.flags |= PERF_X86_EVENT_COMMITTED;
> > > +
> > > + /*
> > > +  * 0x0300 is pseudo-encoding for REF_CPU_CYCLES.
> > > +  * It indicates that fixed counter 2 should be used.
> > > +  *
> > > +  * If fixed counter 2 is occupied and a GP counter
> > > +  * is assigned, an alternative event which can be
> > > +  * counted in GP counter will be used to replace
> > > +  * the pseudo-encoding REF_CPU_CYCLES event.
> > > +  */
> > > + if (((e->hw.config & X86_RAW_EVENT_MASK) ==
> 0x0300) &&
> > > + (assign[i] < INTEL_PMC_IDX_FIXED) &&
> > > + x86_pmu.ref_cycles_rep)
> > > + x86_pmu.ref_cycles_rep(e);
> > > +
> > >   if (x86_pmu.commit_scheduling)
> > >   x86_pmu.commit_scheduling(cpuc, i,
> assign[i]);
> > >   }
> >
> > This looks dodgy, this is the branch were we managed to schedule all
> > events. Why would we need to consider anything here?
> >
> > I was expecting a retry if there are still unscheduled events and one
> > of the events was our 0x0300 event. In that case you have to reset the
> > event and retry the whole scheduling thing.
> 
> Ah, I see what you've done. That Changelog could use a lot of help, it's 
> barely
> readable.

Thanks for the suggestions.
I will modify the changelog and make it clearer why we need the patch. 

Thanks,
Kan


RE: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-22 Thread Liang, Kan


> On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
> > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index
> > 580b60f..e8b2326 100644
> > --- a/arch/x86/events/core.c
> > +++ b/arch/x86/events/core.c
> > @@ -101,6 +101,10 @@ u64 x86_perf_event_update(struct perf_event
> *event)
> > delta = (new_raw_count << shift) - (prev_raw_count << shift);
> > delta >>= shift;
> >
> > +   /* Correct the count number if applying ref_cycles replacement */
> > +   if (!is_sampling_event(event) &&
> > +   (hwc->flags & PERF_X86_EVENT_REF_CYCLES_REP))
> > +   delta *= x86_pmu.ref_cycles_factor;
> 
> That condition seems wrong, why only correct for !sampling events?
>

For sampling, it's either fixed freq mode or fixed period mode.
 - In the fixed freq mode, we should do nothing, because the adaptive
   frequency algorithm will handle it.
 - In the fixed period mode, we have already adjusted the period in 
ref_cycles_rep().
Therefore, we should only handle !sampling events here.
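
Written out as a standalone sketch, the two corrections discussed here are
just the following (with "factor" standing in for x86_pmu.ref_cycles_factor;
the helper names are made up):

#include <stdint.h>

/* Counting path: scale the raw BUS_CYCLES delta back up to ref-cycles. */
static uint64_t ref_cycles_count(uint64_t bus_cycles_delta, unsigned int factor)
{
	return bus_cycles_delta * factor;
}

/* Fixed-period sampling path: program a proportionally smaller period so
 * that one BUS_CYCLES overflow matches the requested ref-cycles period. */
static uint64_t bus_cycles_period(uint64_t ref_cycles_period, unsigned int factor)
{
	return ref_cycles_period / factor;
}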

 
> > local64_add(delta, &event->count);
> > local64_sub(delta, &hwc->period_left);
> >
> 
> 
> > @@ -934,6 +938,21 @@ int x86_schedule_events(struct cpu_hw_events
> *cpuc, int n, int *assign)
> > for (i = 0; i < n; i++) {
> > e = cpuc->event_list[i];
> > e->hw.flags |= PERF_X86_EVENT_COMMITTED;
> > +
> > +   /*
> > +* 0x0300 is pseudo-encoding for REF_CPU_CYCLES.
> > +* It indicates that fixed counter 2 should be used.
> > +*
> > +* If fixed counter 2 is occupied and a GP counter
> > +* is assigned, an alternative event which can be
> > +* counted in GP counter will be used to replace
> > +* the pseudo-encoding REF_CPU_CYCLES event.
> > +*/
> > +   if (((e->hw.config & X86_RAW_EVENT_MASK) ==
> 0x0300) &&
> > +   (assign[i] < INTEL_PMC_IDX_FIXED) &&
> > +   x86_pmu.ref_cycles_rep)
> > +   x86_pmu.ref_cycles_rep(e);
> > +
> > if (x86_pmu.commit_scheduling)
> > x86_pmu.commit_scheduling(cpuc, i,
> assign[i]);
> > }
> 
> This looks dodgy, this is the branch were we managed to schedule all events.
> Why would we need to consider anything here?
> 
> I was expecting a retry if there are still unscheduled events and one of the
> events was our 0x0300 event. In that case you have to reset the event and
> retry the whole scheduling thing.

Will do it.

Thanks,
Kan


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-22 Thread Peter Zijlstra
On Mon, May 22, 2017 at 11:19:16AM +0200, Peter Zijlstra wrote:
> On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
> > @@ -934,6 +938,21 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, 
> > int n, int *assign)

> > for (i = 0; i < n; i++) {
> > e = cpuc->event_list[i];
> > e->hw.flags |= PERF_X86_EVENT_COMMITTED;
> > +
> > +   /*
> > +* 0x0300 is pseudo-encoding for REF_CPU_CYCLES.
> > +* It indicates that fixed counter 2 should be used.
> > +*
> > +* If fixed counter 2 is occupied and a GP counter
> > +* is assigned, an alternative event which can be
> > +* counted in GP counter will be used to replace
> > +* the pseudo-encoding REF_CPU_CYCLES event.
> > +*/
> > +   if (((e->hw.config & X86_RAW_EVENT_MASK) == 0x0300) &&
> > +   (assign[i] < INTEL_PMC_IDX_FIXED) &&
> > +   x86_pmu.ref_cycles_rep)
> > +   x86_pmu.ref_cycles_rep(e);
> > +
> > if (x86_pmu.commit_scheduling)
> > x86_pmu.commit_scheduling(cpuc, i, assign[i]);
> > }
> 
> This looks dodgy, this is the branch were we managed to schedule all
> events. Why would we need to consider anything here?
> 
> I was expecting a retry if there are still unscheduled events and one of
> the events was our 0x0300 event. In that case you have to reset the
> event and retry the whole scheduling thing.

Ah, I see what you've done. That Changelog could use a lot of help, it's
barely readable.


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-22 Thread Peter Zijlstra
On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 580b60f..e8b2326 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -101,6 +101,10 @@ u64 x86_perf_event_update(struct perf_event *event)
>   delta = (new_raw_count << shift) - (prev_raw_count << shift);
>   delta >>= shift;
>  
> + /* Correct the count number if applying ref_cycles replacement */
> + if (!is_sampling_event(event) &&
> + (hwc->flags & PERF_X86_EVENT_REF_CYCLES_REP))
> + delta *= x86_pmu.ref_cycles_factor;

That condition seems wrong, why only correct for !sampling events?

>   local64_add(delta, &event->count);
>   local64_sub(delta, &hwc->period_left);
>  


> @@ -934,6 +938,21 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int 
> n, int *assign)
>   for (i = 0; i < n; i++) {
>   e = cpuc->event_list[i];
>   e->hw.flags |= PERF_X86_EVENT_COMMITTED;
> +
> + /*
> +  * 0x0300 is pseudo-encoding for REF_CPU_CYCLES.
> +  * It indicates that fixed counter 2 should be used.
> +  *
> +  * If fixed counter 2 is occupied and a GP counter
> +  * is assigned, an alternative event which can be
> +  * counted in GP counter will be used to replace
> +  * the pseudo-encoding REF_CPU_CYCLES event.
> +  */
> + if (((e->hw.config & X86_RAW_EVENT_MASK) == 0x0300) &&
> + (assign[i] < INTEL_PMC_IDX_FIXED) &&
> + x86_pmu.ref_cycles_rep)
> + x86_pmu.ref_cycles_rep(e);
> +
>   if (x86_pmu.commit_scheduling)
>   x86_pmu.commit_scheduling(cpuc, i, assign[i]);
>   }

This looks dodgy, this is the branch were we managed to schedule all
events. Why would we need to consider anything here?

I was expecting a retry if there are still unscheduled events and one of
the events was our 0x0300 event. In that case you have to reset the
event and retry the whole scheduling thing.


Re: [PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-22 Thread Peter Zijlstra
On Fri, May 19, 2017 at 10:06:21AM -0700, kan.li...@intel.com wrote:
> From: Kan Liang 
> 
> The CPU ref_cycles can only be used by one user at the same time,
> otherwise a "not counted" error will be displaced.
> [kan]$ sudo perf stat -x, -e ref-cycles,ref-cycles -- sleep 1
> 1203264,,ref-cycles,513112,100.00
> ,,ref-cycles,0,0.00
> 
> CPU ref_cycles can only be counted by fixed counter 2. It uses
> pseudo-encoding. The GP counter doesn't recognize.
> 
> BUS_CYCLES (0x013c) is another event which is not affected by core
> frequency changes. It has a constant ratio with the CPU ref_cycles.
> BUS_CYCLES could be used as an alternative event for ref_cycles on GP
> counter.
> A hook is implemented in x86_schedule_events. If the fixed counter 2 is
> occupied and a GP counter is assigned, BUS_CYCLES is used to replace
> ref_cycles. A new flag PERF_X86_EVENT_REF_CYCLES_REP in
> hw_perf_event is introduced to indicate the replacement.
> To make the switch transparent, counting and sampling are also specially
> handled.
>  - For counting, it multiplies the result with the constant ratio after
>reading it.
>  - For sampling with fixed period, the BUS_CYCLES period = ref_cycles
>period / the constant ratio.
>  - For sampling with fixed frequency, the adaptive frequency algorithm
>will figure it out on its own. Do nothing.
> 
> The constant ratio is model specific.
> For the model after NEHALEM but before Skylake, the ratio is defined in
> MSR_PLATFORM_INFO.
> For the model after Skylake, it can be get from CPUID.15H.
> For Knights Landing, Goldmont and later, the ratio is always 1.
> 
> The old Silvermont/Airmont, Core2 and Atom machines are not covered by
> the patch. The behavior on those machines will not change.

Maybe I missed it, but *why* are we doing this?


[PATCH 1/2] perf/x86/intel: enable CPU ref_cycles for GP counter

2017-05-19 Thread kan . liang
From: Kan Liang 

The CPU ref_cycles can only be used by one user at the same time,
otherwise a "not counted" error will be displayed.
[kan]$ sudo perf stat -x, -e ref-cycles,ref-cycles -- sleep 1
1203264,,ref-cycles,513112,100.00
,,ref-cycles,0,0.00

CPU ref_cycles can only be counted by fixed counter 2. It uses a
pseudo-encoding, which the GP counters do not recognize.

BUS_CYCLES (0x013c) is another event which is not affected by core
frequency changes. It has a constant ratio with the CPU ref_cycles.
BUS_CYCLES could be used as an alternative event for ref_cycles on GP
counter.
A hook is implemented in x86_schedule_events. If the fixed counter 2 is
occupied and a GP counter is assigned, BUS_CYCLES is used to replace
ref_cycles. A new flag PERF_X86_EVENT_REF_CYCLES_REP in
hw_perf_event is introduced to indicate the replacement.
To make the switch transparent, counting and sampling are also specially
handled.
 - For counting, it multiplies the result with the constant ratio after
   reading it.
 - For sampling with fixed period, the BUS_CYCLES period = ref_cycles
   period / the constant ratio.
 - For sampling with fixed frequency, the adaptive frequency algorithm
   will figure it out on its own. Do nothing.

The constant ratio is model specific.
For models after Nehalem but before Skylake, the ratio is defined in
MSR_PLATFORM_INFO.
For models after Skylake, it can be obtained from CPUID.15H.
For Knights Landing, Goldmont and later, the ratio is always 1.

The old Silvermont/Airmont, Core2 and Atom machines are not covered by
the patch. The behavior on those machines will not change.

Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 19 +
 arch/x86/events/intel/core.c | 93 
 arch/x86/events/perf_event.h |  3 ++
 3 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 580b60f..e8b2326 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -101,6 +101,10 @@ u64 x86_perf_event_update(struct perf_event *event)
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
 
+   /* Correct the count number if applying ref_cycles replacement */
+   if (!is_sampling_event(event) &&
+   (hwc->flags & PERF_X86_EVENT_REF_CYCLES_REP))
+   delta *= x86_pmu.ref_cycles_factor;
local64_add(delta, &event->count);
local64_sub(delta, &hwc->period_left);
 
@@ -934,6 +938,21 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, 
int *assign)
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+
+   /*
+* 0x0300 is pseudo-encoding for REF_CPU_CYCLES.
+* It indicates that fixed counter 2 should be used.
+*
+* If fixed counter 2 is occupied and a GP counter
+* is assigned, an alternative event which can be
+* counted in GP counter will be used to replace
+* the pseudo-encoding REF_CPU_CYCLES event.
+*/
+   if (((e->hw.config & X86_RAW_EVENT_MASK) == 0x0300) &&
+   (assign[i] < INTEL_PMC_IDX_FIXED) &&
+   x86_pmu.ref_cycles_rep)
+   x86_pmu.ref_cycles_rep(e);
+
if (x86_pmu.commit_scheduling)
x86_pmu.commit_scheduling(cpuc, i, assign[i]);
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a6d91d4..af5a464 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3063,6 +3063,55 @@ static unsigned bdw_limit_period(struct perf_event 
*event, unsigned left)
return left;
 }
 
+static __init unsigned int glm_get_ref_cycles_factor(void)
+{
+   return 1;
+}
+
+static __init unsigned int nhm_get_ref_cycles_factor(void)
+{
+   u64 platform_info;
+
+   rdmsrl(MSR_PLATFORM_INFO, platform_info);
+   return (platform_info >> 8) & 0xff;
+}
+
+static __init unsigned int skl_get_ref_cycles_factor(void)
+{
+   unsigned int cpuid21_eax, cpuid21_ebx, cpuid21_ecx, unused;
+
+   cpuid(21, &cpuid21_eax, &cpuid21_ebx, &cpuid21_ecx, &unused);
+   if (!cpuid21_eax || !cpuid21_ebx)
+   return 0;
+
+   return cpuid21_ebx / cpuid21_eax;
+}
+
+/*
+ * BUS_CYCLES (0x013c) is another event which is not affected by core
+ * frequency changes. It has a constant ratio with the CPU ref_cycles.
+ * BUS_CYCLES could be used as an alternative event for ref_cycles on
+ * GP counter.
+ */
+void nhm_ref_cycles_rep(struct perf_event *event)
+{
+   struct hw_perf_event *hwc = &event->hw;
+
+   hwc->config = (hwc->config & ~X86_RAW_EVENT_MASK) |
+   
