[Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-05-03 Thread Umesh Nerlige Ramappa
Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
  register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

v12: (Jason)
- Split cpu timestamp array into timestamp and delta for cleaner API

v13:
- Return the width of cs cycles
- Conform to kernel doc format

Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Lionel Landwerlin 
Reviewed-by: Jason Ekstrand 
---
 drivers/gpu/drm/i915/i915_query.c | 157 ++
 include/uapi/drm/i915_drm.h   |  56 +++
 2 files changed, 213 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..2e7039c71866 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 
 #include 
 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,160 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+   /*
+* Use logic same as the perf subsystem to allow user to select the
+* reference clock id to be used for timestamps.
+*/
+   switch (clk_id) {
+   case CLOCK_MONOTONIC:
+   return _get_ns;
+   case CLOCK_MONOTONIC_RAW:
+   return _get_raw_ns;
+   case CLOCK_REALTIME:
+   return _get_real_ns;
+   case CLOCK_BOOTTIME:
+   return _get_boottime_ns;
+   case CLOCK_TAI:
+   return _get_clocktai_ns;
+   default:
+   return NULL;
+   }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+ i915_reg_t lower_reg,
+ i915_reg_t upper_reg,
+ u64 *cs_ts,
+ u64 *cpu_ts,
+ u64 *cpu_delta,
+ __ktime_func_t cpu_clock)
+{
+   u32 upper, lower, old_upper, loop = 0;
+
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   do {
+   *cpu_delta = local_clock();
+   *cpu_ts = cpu_clock();
+   lower = intel_uncore_read_fw(uncore, lower_reg);
+   *cpu_delta = local_clock() - *cpu_delta;
+   old_upper = upper;
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   } while (upper != old_upper && loop++ < 2);
+
+   *cs_ts = (u64)upper << 32 | lower;
+
+   return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+ u64 *cs_ts, u64 *cpu_ts, u64 *cpu_delta,
+ __ktime_func_t cpu_clock)
+{
+   struct intel_uncore *uncore = engine->uncore;
+   enum forcewake_domains fw_domains;
+   u32 base = engine->mmio_base;
+   intel_wakeref_t wakeref;
+   int ret;
+
+   fw_domains = intel_uncore_forcewake_for_reg(uncore,
+   RING_TIMESTAMP(base),
+   FW_REG_READ);
+
+   with_intel_runtime_pm(uncore->rpm, wakeref) {
+   spin_lock_irq(>lock);
+   intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+   ret = __read_timestamps(uncore,
+   RING_TIMESTAMP(base),
+   RING_TIMESTAMP_UDW(base),
+   cs_ts,
+   cpu_ts,
+   cpu_delta,
+  

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-05-03 Thread Umesh Nerlige Ramappa

On Sat, May 01, 2021 at 10:27:03AM -0500, Jason Ekstrand wrote:

  On April 30, 2021 23:01:44 "Dixit, Ashutosh" 
  wrote:

On Fri, 30 Apr 2021 19:19:59 -0700, Umesh Nerlige Ramappa wrote:

  On Fri, Apr 30, 2021 at 07:35:41PM -0500, Jason Ekstrand wrote:

On April 30, 2021 18:00:58 "Dixit, Ashutosh"

wrote:
On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:
Looks like the engine can be dropped since all timestamps are in
sync.
I
just have one more question here. The timestamp itself is 36 bits.
 Should
the uapi also report the timestamp width to the user OR should I
just
return the lower 32 bits of the timestamp?
Yeah, I think reporting the timestamp width is a good idea since
we're
reporting the period/frequency here.

  Actually, I forgot that we are handling the overflow before returning
  the
  cs_cycles to the user and overflow handling was the only reason I
  thought
  user should know the width. Would you stil recommend returning the
  width in
  the uapi?

The width is needed for userspace to figure out if overflow has occured
between two successive query calls. I don't think I see this happening
in
the code.

  Right... We (UMDs) currently just hard-code it to 36 bits because that's
  what we've had on all platforms since close enough to forever. We bake in
  the frequency based on PCI ID. Returning the number of bits, like I said,
  goes nicely with the frequency. It's not necessary, assuming sufficiently
  smart userspace (neither is frequency), but it seems to go with it. I
  guess I don't care much either way.
  Coming back to the multi-tile issue we discussed internally, I think that
  is something we should care about. Since this works by reading the
  timestamp register on an engine, I think leaving the engine specifier in
  there is fine. Userspace should know that there's actually only one clock
  and just query one of them (probably RCS). For crazy multi-device cases,
  we'll either query per logical device (read tile) or we'll have to make
  them look like a single device and sync the timestamps somehow in the UMD
  by carrying around an offset factor.
  As is, this patch is
  Reviewed-by: Jason Ekstrand 


Thanks, I will add the width here and post the final version.

Regards,
Umesh



  I still need to review the ANV patch before we can land this though.
  --Jason

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-05-01 Thread Jason Ekstrand

On April 30, 2021 23:01:44 "Dixit, Ashutosh"  wrote:

On Fri, 30 Apr 2021 19:19:59 -0700, Umesh Nerlige Ramappa wrote:


On Fri, Apr 30, 2021 at 07:35:41PM -0500, Jason Ekstrand wrote:

On April 30, 2021 18:00:58 "Dixit, Ashutosh" 
wrote:

On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:

Looks like the engine can be dropped since all timestamps are in sync.
I
just have one more question here. The timestamp itself is 36 bits.
Should
the uapi also report the timestamp width to the user OR should I just
return the lower 32 bits of the timestamp?

Yeah, I think reporting the timestamp width is a good idea since we're
reporting the period/frequency here.


Actually, I forgot that we are handling the overflow before returning the
cs_cycles to the user and overflow handling was the only reason I thought
user should know the width. Would you stil recommend returning the width in
the uapi?


The width is needed for userspace to figure out if overflow has occured
between two successive query calls. I don't think I see this happening in
the code.


Right... We (UMDs) currently just hard-code it to 36 bits because that's 
what we've had on all platforms since close enough to forever. We bake in 
the frequency based on PCI ID. Returning the number of bits, like I said, 
goes nicely with the frequency. It's not necessary, assuming sufficiently 
smart userspace (neither is frequency), but it seems to go with it. I guess 
I don't care much either way.


Coming back to the multi-tile issue we discussed internally, I think that 
is something we should care about. Since this works by reading the 
timestamp register on an engine, I think leaving the engine specifier in 
there is fine. Userspace should know that there's actually only one clock 
and just query one of them (probably RCS). For crazy multi-device cases, 
we'll either query per logical device (read tile) or we'll have to make 
them look like a single device and sync the timestamps somehow in the UMD 
by carrying around an offset factor.


As is, this patch is

Reviewed-by: Jason Ekstrand 

I still need to review the ANV patch before we can land this though.

--Jason
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-30 Thread Dixit, Ashutosh
On Fri, 30 Apr 2021 19:19:59 -0700, Umesh Nerlige Ramappa wrote:
>
> On Fri, Apr 30, 2021 at 07:35:41PM -0500, Jason Ekstrand wrote:
> >   On April 30, 2021 18:00:58 "Dixit, Ashutosh" 
> >   wrote:
> >
> > On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:
> >
> >   Looks like the engine can be dropped since all timestamps are in sync.
> >   I
> >   just have one more question here. The timestamp itself is 36 bits.
> >    Should
> >   the uapi also report the timestamp width to the user OR should I just
> >   return the lower 32 bits of the timestamp?
> >
> >   Yeah, I think reporting the timestamp width is a good idea since we're
> >   reporting the period/frequency here.
>
> Actually, I forgot that we are handling the overflow before returning the
> cs_cycles to the user and overflow handling was the only reason I thought
> user should know the width. Would you stil recommend returning the width in
> the uapi?

The width is needed for userspace to figure out if overflow has occured
between two successive query calls. I don't think I see this happening in
the code.
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-30 Thread Umesh Nerlige Ramappa

On Fri, Apr 30, 2021 at 07:35:41PM -0500, Jason Ekstrand wrote:

  On April 30, 2021 18:00:58 "Dixit, Ashutosh" 
  wrote:

On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:

  Looks like the engine can be dropped since all timestamps are in sync.
  I
  just have one more question here. The timestamp itself is 36 bits.
   Should
  the uapi also report the timestamp width to the user OR should I just
  return the lower 32 bits of the timestamp?

  Yeah, I think reporting the timestamp width is a good idea since we're
  reporting the period/frequency here.


Actually, I forgot that we are handling the overflow before returning 
the cs_cycles to the user and overflow handling was the only reason I 
thought user should know the width. Would you stil recommend returning 
the width in the uapi?


Thanks,
Umesh




How would exposing only the lower 32 bits of the timestamp work?
The way to avoid exposing the width would be to expose the timestamp as
a
regular 64 bit value. In the kernel engine state, have a variable for
the
counter and keep on accumulating that (on each query) to full 64 bits in
spite of the 36 bit HW counter overflow.

  That's doesn't actually work since you can query the 64-bit timestamp
  value from the GPU. The way this is handled in Vulkan is that the number
  of timestamp bits is reported to the application as a queue property.
  --Jason

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-30 Thread Jason Ekstrand

On April 30, 2021 18:00:58 "Dixit, Ashutosh"  wrote:


On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:


Looks like the engine can be dropped since all timestamps are in sync. I
just have one more question here. The timestamp itself is 36 bits.  Should
the uapi also report the timestamp width to the user OR should I just
return the lower 32 bits of the timestamp?


Yeah, I think reporting the timestamp width is a good idea since we're 
reporting the period/frequency here.





How would exposing only the lower 32 bits of the timestamp work?

The way to avoid exposing the width would be to expose the timestamp as a
regular 64 bit value. In the kernel engine state, have a variable for the
counter and keep on accumulating that (on each query) to full 64 bits in
spite of the 36 bit HW counter overflow.


That's doesn't actually work since you can query the 64-bit timestamp value 
from the GPU. The way this is handled in Vulkan is that the number of 
timestamp bits is reported to the application as a queue property.


--Jason





___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-30 Thread Dixit, Ashutosh
On Fri, 30 Apr 2021 16:00:46 -0700, Dixit, Ashutosh wrote:
>
> On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:
> >
> > Looks like the engine can be dropped since all timestamps are in sync. I
> > just have one more question here. The timestamp itself is 36 bits.  Should
> > the uapi also report the timestamp width to the user OR should I just
> > return the lower 32 bits of the timestamp?
>
> How would exposing only the lower 32 bits of the timestamp work?

It would work I guess but overflow every few seconds. So if the counters
are sampled at a low frequency (once every few seconds) it would yield
misleading timestamps.

> The way to avoid exposing the width would be to expose the timestamp as a
> regular 64 bit value. In the kernel engine state, have a variable for the
> counter and keep on accumulating that (on each query) to full 64 bits in
> spite of the 36 bit HW counter overflow.
>
> So not exposing the width (or exposing a 64 bit timestamp) is a cleaner
> interface but also more work in the kernel.
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-30 Thread Dixit, Ashutosh
On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:
>
> Looks like the engine can be dropped since all timestamps are in sync. I
> just have one more question here. The timestamp itself is 36 bits.  Should
> the uapi also report the timestamp width to the user OR should I just
> return the lower 32 bits of the timestamp?

How would exposing only the lower 32 bits of the timestamp work?

The way to avoid exposing the width would be to expose the timestamp as a
regular 64 bit value. In the kernel engine state, have a variable for the
counter and keep on accumulating that (on each query) to full 64 bits in
spite of the 36 bit HW counter overflow.

So not exposing the width (or exposing a 64 bit timestamp) is a cleaner
interface but also more work in the kernel.
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-30 Thread Umesh Nerlige Ramappa

On Thu, Apr 29, 2021 at 02:07:58PM -0500, Jason Ekstrand wrote:

On Wed, Apr 28, 2021 at 7:34 PM Umesh Nerlige Ramappa
 wrote:


Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
  register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

v12: (Jason)
- Split cpu timestamp array into timestamp and delta for cleaner API

Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Lionel Landwerlin 
---
 drivers/gpu/drm/i915/i915_query.c | 148 ++
 include/uapi/drm/i915_drm.h   |  52 +++
 2 files changed, 200 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..357c44e8177c 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@

 #include 

+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,151 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
 }

+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+   /*
+* Use logic same as the perf subsystem to allow user to select the
+* reference clock id to be used for timestamps.
+*/
+   switch (clk_id) {
+   case CLOCK_MONOTONIC:
+   return _get_ns;
+   case CLOCK_MONOTONIC_RAW:
+   return _get_raw_ns;
+   case CLOCK_REALTIME:
+   return _get_real_ns;
+   case CLOCK_BOOTTIME:
+   return _get_boottime_ns;
+   case CLOCK_TAI:
+   return _get_clocktai_ns;
+   default:
+   return NULL;
+   }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+ i915_reg_t lower_reg,
+ i915_reg_t upper_reg,
+ u64 *cs_ts,
+ u64 *cpu_ts,
+ u64 *cpu_delta,
+ __ktime_func_t cpu_clock)
+{
+   u32 upper, lower, old_upper, loop = 0;
+
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   do {
+   *cpu_delta = local_clock();
+   *cpu_ts = cpu_clock();
+   lower = intel_uncore_read_fw(uncore, lower_reg);
+   *cpu_delta = local_clock() - *cpu_delta;
+   old_upper = upper;
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   } while (upper != old_upper && loop++ < 2);
+
+   *cs_ts = (u64)upper << 32 | lower;
+
+   return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+ u64 *cs_ts, u64 *cpu_ts, u64 *cpu_delta,
+ __ktime_func_t cpu_clock)
+{
+   struct intel_uncore *uncore = engine->uncore;
+   enum forcewake_domains fw_domains;
+   u32 base = engine->mmio_base;
+   intel_wakeref_t wakeref;
+   int ret;
+
+   fw_domains = intel_uncore_forcewake_for_reg(uncore,
+   RING_TIMESTAMP(base),
+   FW_REG_READ);
+
+   with_intel_runtime_pm(uncore->rpm, wakeref) {
+   spin_lock_irq(>lock);
+   intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+   ret = __read_timestamps(uncore,
+   RING_TIMESTAMP(base),
+   RING_TIMESTAMP_UDW(base),
+   cs_ts,
+   cpu_ts,
+

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-29 Thread Jason Ekstrand
On Wed, Apr 28, 2021 at 7:34 PM Umesh Nerlige Ramappa
 wrote:
>
> Perf measurements rely on CPU and engine timestamps to correlate
> events of interest across these time domains. Current mechanisms get
> these timestamps separately and the calculated delta between these
> timestamps lack enough accuracy.
>
> To improve the accuracy of these time measurements to within a few us,
> add a query that returns the engine and cpu timestamps captured as
> close to each other as possible.
>
> v2: (Tvrtko)
> - document clock reference used
> - return cpu timestamp always
> - capture cpu time just before lower dword of cs timestamp
>
> v3: (Chris)
> - use uncore-rpm
> - use __query_cs_timestamp helper
>
> v4: (Lionel)
> - Kernel perf subsytem allows users to specify the clock id to be used
>   in perf_event_open. This clock id is used by the perf subsystem to
>   return the appropriate cpu timestamp in perf events. Similarly, let
>   the user pass the clockid to this query so that cpu timestamp
>   corresponds to the clock id requested.
>
> v5: (Tvrtko)
> - Use normal ktime accessors instead of fast versions
> - Add more uApi documentation
>
> v6: (Lionel)
> - Move switch out of spinlock
>
> v7: (Chris)
> - cs_timestamp is a misnomer, use cs_cycles instead
> - return the cs cycle frequency as well in the query
>
> v8:
> - Add platform and engine specific checks
>
> v9: (Lionel)
> - Return 2 cpu timestamps in the query - captured before and after the
>   register read
>
> v10: (Chris)
> - Use local_clock() to measure time taken to read lower dword of
>   register and return it to user.
>
> v11: (Jani)
> - IS_GEN deprecated. User GRAPHICS_VER instead.
>
> v12: (Jason)
> - Split cpu timestamp array into timestamp and delta for cleaner API
>
> Signed-off-by: Umesh Nerlige Ramappa 
> Reviewed-by: Lionel Landwerlin 
> ---
>  drivers/gpu/drm/i915/i915_query.c | 148 ++
>  include/uapi/drm/i915_drm.h   |  52 +++
>  2 files changed, 200 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_query.c 
> b/drivers/gpu/drm/i915/i915_query.c
> index fed337ad7b68..357c44e8177c 100644
> --- a/drivers/gpu/drm/i915/i915_query.c
> +++ b/drivers/gpu/drm/i915/i915_query.c
> @@ -6,6 +6,8 @@
>
>  #include 
>
> +#include "gt/intel_engine_pm.h"
> +#include "gt/intel_engine_user.h"
>  #include "i915_drv.h"
>  #include "i915_perf.h"
>  #include "i915_query.h"
> @@ -90,6 +92,151 @@ static int query_topology_info(struct drm_i915_private 
> *dev_priv,
> return total_length;
>  }
>
> +typedef u64 (*__ktime_func_t)(void);
> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> +{
> +   /*
> +* Use logic same as the perf subsystem to allow user to select the
> +* reference clock id to be used for timestamps.
> +*/
> +   switch (clk_id) {
> +   case CLOCK_MONOTONIC:
> +   return _get_ns;
> +   case CLOCK_MONOTONIC_RAW:
> +   return _get_raw_ns;
> +   case CLOCK_REALTIME:
> +   return _get_real_ns;
> +   case CLOCK_BOOTTIME:
> +   return _get_boottime_ns;
> +   case CLOCK_TAI:
> +   return _get_clocktai_ns;
> +   default:
> +   return NULL;
> +   }
> +}
> +
> +static inline int
> +__read_timestamps(struct intel_uncore *uncore,
> + i915_reg_t lower_reg,
> + i915_reg_t upper_reg,
> + u64 *cs_ts,
> + u64 *cpu_ts,
> + u64 *cpu_delta,
> + __ktime_func_t cpu_clock)
> +{
> +   u32 upper, lower, old_upper, loop = 0;
> +
> +   upper = intel_uncore_read_fw(uncore, upper_reg);
> +   do {
> +   *cpu_delta = local_clock();
> +   *cpu_ts = cpu_clock();
> +   lower = intel_uncore_read_fw(uncore, lower_reg);
> +   *cpu_delta = local_clock() - *cpu_delta;
> +   old_upper = upper;
> +   upper = intel_uncore_read_fw(uncore, upper_reg);
> +   } while (upper != old_upper && loop++ < 2);
> +
> +   *cs_ts = (u64)upper << 32 | lower;
> +
> +   return 0;
> +}
> +
> +static int
> +__query_cs_cycles(struct intel_engine_cs *engine,
> + u64 *cs_ts, u64 *cpu_ts, u64 *cpu_delta,
> + __ktime_func_t cpu_clock)
> +{
> +   struct intel_uncore *uncore = engine->uncore;
> +   enum forcewake_domains fw_domains;
> +   u32 base = engine->mmio_base;
> +   intel_wakeref_t wakeref;
> +   int ret;
> +
> +   fw_domains = intel_uncore_forcewake_for_reg(uncore,
> +   RING_TIMESTAMP(base),
> +   FW_REG_READ);
> +
> +   with_intel_runtime_pm(uncore->rpm, wakeref) {
> +   spin_lock_irq(>lock);
> +   intel_uncore_forcewake_get__locked(uncore, fw_domains);
> +
> +   ret = __read_timestamps(uncore,
> +   

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-29 Thread Daniel Vetter
On Wed, Apr 28, 2021 at 11:43:22AM +0300, Jani Nikula wrote:
> On Tue, 27 Apr 2021, Umesh Nerlige Ramappa  
> wrote:
> > Perf measurements rely on CPU and engine timestamps to correlate
> > events of interest across these time domains. Current mechanisms get
> > these timestamps separately and the calculated delta between these
> > timestamps lack enough accuracy.
> >
> > To improve the accuracy of these time measurements to within a few us,
> > add a query that returns the engine and cpu timestamps captured as
> > close to each other as possible.
> 
> Cc: dri-devel, Jason and Daniel for review.

Yeah going forward pls cc: dri-devel on everything touching
gem/core/scheduler and anything related. Review for these is supposed to
be cc: dri-devel, including anything that's in-flight right now.

Thanks, Daniel

> 
> >
> > v2: (Tvrtko)
> > - document clock reference used
> > - return cpu timestamp always
> > - capture cpu time just before lower dword of cs timestamp
> >
> > v3: (Chris)
> > - use uncore-rpm
> > - use __query_cs_timestamp helper
> >
> > v4: (Lionel)
> > - Kernel perf subsytem allows users to specify the clock id to be used
> >   in perf_event_open. This clock id is used by the perf subsystem to
> >   return the appropriate cpu timestamp in perf events. Similarly, let
> >   the user pass the clockid to this query so that cpu timestamp
> >   corresponds to the clock id requested.
> >
> > v5: (Tvrtko)
> > - Use normal ktime accessors instead of fast versions
> > - Add more uApi documentation
> >
> > v6: (Lionel)
> > - Move switch out of spinlock
> >
> > v7: (Chris)
> > - cs_timestamp is a misnomer, use cs_cycles instead
> > - return the cs cycle frequency as well in the query
> >
> > v8:
> > - Add platform and engine specific checks
> >
> > v9: (Lionel)
> > - Return 2 cpu timestamps in the query - captured before and after the
> >   register read
> >
> > v10: (Chris)
> > - Use local_clock() to measure time taken to read lower dword of
> >   register and return it to user.
> >
> > v11: (Jani)
> > - IS_GEN deprecated. User GRAPHICS_VER instead.
> >
> > Signed-off-by: Umesh Nerlige Ramappa 
> > ---
> >  drivers/gpu/drm/i915/i915_query.c | 145 ++
> >  include/uapi/drm/i915_drm.h   |  48 ++
> >  2 files changed, 193 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_query.c 
> > b/drivers/gpu/drm/i915/i915_query.c
> > index fed337ad7b68..2594b93901ac 100644
> > --- a/drivers/gpu/drm/i915/i915_query.c
> > +++ b/drivers/gpu/drm/i915/i915_query.c
> > @@ -6,6 +6,8 @@
> >  
> >  #include 
> >  
> > +#include "gt/intel_engine_pm.h"
> > +#include "gt/intel_engine_user.h"
> >  #include "i915_drv.h"
> >  #include "i915_perf.h"
> >  #include "i915_query.h"
> > @@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
> > *dev_priv,
> > return total_length;
> >  }
> >  
> > +typedef u64 (*__ktime_func_t)(void);
> > +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> > +{
> > +   /*
> > +* Use logic same as the perf subsystem to allow user to select the
> > +* reference clock id to be used for timestamps.
> > +*/
> > +   switch (clk_id) {
> > +   case CLOCK_MONOTONIC:
> > +   return _get_ns;
> > +   case CLOCK_MONOTONIC_RAW:
> > +   return _get_raw_ns;
> > +   case CLOCK_REALTIME:
> > +   return _get_real_ns;
> > +   case CLOCK_BOOTTIME:
> > +   return _get_boottime_ns;
> > +   case CLOCK_TAI:
> > +   return _get_clocktai_ns;
> > +   default:
> > +   return NULL;
> > +   }
> > +}
> > +
> > +static inline int
> > +__read_timestamps(struct intel_uncore *uncore,
> > + i915_reg_t lower_reg,
> > + i915_reg_t upper_reg,
> > + u64 *cs_ts,
> > + u64 *cpu_ts,
> > + __ktime_func_t cpu_clock)
> > +{
> > +   u32 upper, lower, old_upper, loop = 0;
> > +
> > +   upper = intel_uncore_read_fw(uncore, upper_reg);
> > +   do {
> > +   cpu_ts[1] = local_clock();
> > +   cpu_ts[0] = cpu_clock();
> > +   lower = intel_uncore_read_fw(uncore, lower_reg);
> > +   cpu_ts[1] = local_clock() - cpu_ts[1];
> > +   old_upper = upper;
> > +   upper = intel_uncore_read_fw(uncore, upper_reg);
> > +   } while (upper != old_upper && loop++ < 2);
> > +
> > +   *cs_ts = (u64)upper << 32 | lower;
> > +
> > +   return 0;
> > +}
> > +
> > +static int
> > +__query_cs_cycles(struct intel_engine_cs *engine,
> > + u64 *cs_ts, u64 *cpu_ts,
> > + __ktime_func_t cpu_clock)
> > +{
> > +   struct intel_uncore *uncore = engine->uncore;
> > +   enum forcewake_domains fw_domains;
> > +   u32 base = engine->mmio_base;
> > +   intel_wakeref_t wakeref;
> > +   int ret;
> > +
> > +   fw_domains = intel_uncore_forcewake_for_reg(uncore,
> > +   RING_TIMESTAMP(base),
> > +   FW_REG_READ);
> > +
> > +   

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-29 Thread Lionel Landwerlin

On 29/04/2021 03:34, Umesh Nerlige Ramappa wrote:

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
   in perf_event_open. This clock id is used by the perf subsystem to
   return the appropriate cpu timestamp in perf events. Similarly, let
   the user pass the clockid to this query so that cpu timestamp
   corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
   register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
   register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

v12: (Jason)
- Split cpu timestamp array into timestamp and delta for cleaner API

Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Lionel Landwerlin 



Thanks for the update :


Reviewed-by: Lionel Landwerlin 



---
  drivers/gpu/drm/i915/i915_query.c | 148 ++
  include/uapi/drm/i915_drm.h   |  52 +++
  2 files changed, 200 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..357c44e8177c 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
  
  #include 
  
+#include "gt/intel_engine_pm.h"

+#include "gt/intel_engine_user.h"
  #include "i915_drv.h"
  #include "i915_perf.h"
  #include "i915_query.h"
@@ -90,6 +92,151 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
  }
  
+typedef u64 (*__ktime_func_t)(void);

+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+   /*
+* Use logic same as the perf subsystem to allow user to select the
+* reference clock id to be used for timestamps.
+*/
+   switch (clk_id) {
+   case CLOCK_MONOTONIC:
+   return _get_ns;
+   case CLOCK_MONOTONIC_RAW:
+   return _get_raw_ns;
+   case CLOCK_REALTIME:
+   return _get_real_ns;
+   case CLOCK_BOOTTIME:
+   return _get_boottime_ns;
+   case CLOCK_TAI:
+   return _get_clocktai_ns;
+   default:
+   return NULL;
+   }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+ i915_reg_t lower_reg,
+ i915_reg_t upper_reg,
+ u64 *cs_ts,
+ u64 *cpu_ts,
+ u64 *cpu_delta,
+ __ktime_func_t cpu_clock)
+{
+   u32 upper, lower, old_upper, loop = 0;
+
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   do {
+   *cpu_delta = local_clock();
+   *cpu_ts = cpu_clock();
+   lower = intel_uncore_read_fw(uncore, lower_reg);
+   *cpu_delta = local_clock() - *cpu_delta;
+   old_upper = upper;
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   } while (upper != old_upper && loop++ < 2);
+
+   *cs_ts = (u64)upper << 32 | lower;
+
+   return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+ u64 *cs_ts, u64 *cpu_ts, u64 *cpu_delta,
+ __ktime_func_t cpu_clock)
+{
+   struct intel_uncore *uncore = engine->uncore;
+   enum forcewake_domains fw_domains;
+   u32 base = engine->mmio_base;
+   intel_wakeref_t wakeref;
+   int ret;
+
+   fw_domains = intel_uncore_forcewake_for_reg(uncore,
+   RING_TIMESTAMP(base),
+   FW_REG_READ);
+
+   with_intel_runtime_pm(uncore->rpm, wakeref) {
+   spin_lock_irq(>lock);
+   intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+   ret = __read_timestamps(uncore,
+   RING_TIMESTAMP(base),
+   RING_TIMESTAMP_UDW(base),
+   cs_ts,
+   cpu_ts,
+

[Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-28 Thread Umesh Nerlige Ramappa
Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
  register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

v12: (Jason)
- Split cpu timestamp array into timestamp and delta for cleaner API

Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Lionel Landwerlin 
---
 drivers/gpu/drm/i915/i915_query.c | 148 ++
 include/uapi/drm/i915_drm.h   |  52 +++
 2 files changed, 200 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..357c44e8177c 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 
 #include 
 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,151 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+   /*
+* Use logic same as the perf subsystem to allow user to select the
+* reference clock id to be used for timestamps.
+*/
+   switch (clk_id) {
+   case CLOCK_MONOTONIC:
+   return _get_ns;
+   case CLOCK_MONOTONIC_RAW:
+   return _get_raw_ns;
+   case CLOCK_REALTIME:
+   return _get_real_ns;
+   case CLOCK_BOOTTIME:
+   return _get_boottime_ns;
+   case CLOCK_TAI:
+   return _get_clocktai_ns;
+   default:
+   return NULL;
+   }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+ i915_reg_t lower_reg,
+ i915_reg_t upper_reg,
+ u64 *cs_ts,
+ u64 *cpu_ts,
+ u64 *cpu_delta,
+ __ktime_func_t cpu_clock)
+{
+   u32 upper, lower, old_upper, loop = 0;
+
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   do {
+   *cpu_delta = local_clock();
+   *cpu_ts = cpu_clock();
+   lower = intel_uncore_read_fw(uncore, lower_reg);
+   *cpu_delta = local_clock() - *cpu_delta;
+   old_upper = upper;
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   } while (upper != old_upper && loop++ < 2);
+
+   *cs_ts = (u64)upper << 32 | lower;
+
+   return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+ u64 *cs_ts, u64 *cpu_ts, u64 *cpu_delta,
+ __ktime_func_t cpu_clock)
+{
+   struct intel_uncore *uncore = engine->uncore;
+   enum forcewake_domains fw_domains;
+   u32 base = engine->mmio_base;
+   intel_wakeref_t wakeref;
+   int ret;
+
+   fw_domains = intel_uncore_forcewake_for_reg(uncore,
+   RING_TIMESTAMP(base),
+   FW_REG_READ);
+
+   with_intel_runtime_pm(uncore->rpm, wakeref) {
+   spin_lock_irq(>lock);
+   intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+   ret = __read_timestamps(uncore,
+   RING_TIMESTAMP(base),
+   RING_TIMESTAMP_UDW(base),
+   cs_ts,
+   cpu_ts,
+   cpu_delta,
+   cpu_clock);
+
+   

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-28 Thread Lionel Landwerlin

On 28/04/2021 23:45, Jason Ekstrand wrote:

On Wed, Apr 28, 2021 at 3:14 PM Lionel Landwerlin
 wrote:

On 28/04/2021 22:54, Jason Ekstrand wrote:

On Wed, Apr 28, 2021 at 2:50 PM Lionel Landwerlin
 wrote:

On 28/04/2021 22:24, Jason Ekstrand wrote:

On Wed, Apr 28, 2021 at 3:43 AM Jani Nikula  wrote:

On Tue, 27 Apr 2021, Umesh Nerlige Ramappa  
wrote:

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

Cc: dri-devel, Jason and Daniel for review.

Thanks!

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
in perf_event_open. This clock id is used by the perf subsystem to
return the appropriate cpu timestamp in perf events. Similarly, let
the user pass the clockid to this query so that cpu timestamp
corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

Signed-off-by: Umesh Nerlige Ramappa 
---
   drivers/gpu/drm/i915/i915_query.c | 145 ++
   include/uapi/drm/i915_drm.h   |  48 ++
   2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..2594b93901ac 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@

   #include 

+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
   #include "i915_drv.h"
   #include "i915_perf.h"
   #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
   }

+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+ /*
+  * Use logic same as the perf subsystem to allow user to select the
+  * reference clock id to be used for timestamps.
+  */
+ switch (clk_id) {
+ case CLOCK_MONOTONIC:
+ return _get_ns;
+ case CLOCK_MONOTONIC_RAW:
+ return _get_raw_ns;
+ case CLOCK_REALTIME:
+ return _get_real_ns;
+ case CLOCK_BOOTTIME:
+ return _get_boottime_ns;
+ case CLOCK_TAI:
+ return _get_clocktai_ns;
+ default:
+ return NULL;
+ }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+   i915_reg_t lower_reg,
+   i915_reg_t upper_reg,
+   u64 *cs_ts,
+   u64 *cpu_ts,
+   __ktime_func_t cpu_clock)
+{
+ u32 upper, lower, old_upper, loop = 0;
+
+ upper = intel_uncore_read_fw(uncore, upper_reg);
+ do {
+ cpu_ts[1] = local_clock();
+ cpu_ts[0] = cpu_clock();
+ lower = intel_uncore_read_fw(uncore, lower_reg);
+ cpu_ts[1] = local_clock() - cpu_ts[1];
+ old_upper = upper;
+ upper = intel_uncore_read_fw(uncore, upper_reg);
+ } while (upper != old_upper && loop++ < 2);
+
+ *cs_ts = (u64)upper << 32 | lower;
+
+ return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+   u64 *cs_ts, u64 *cpu_ts,
+   __ktime_func_t cpu_clock)
+{
+ struct intel_uncore *uncore = engine->uncore;
+ enum forcewake_domains fw_domains;
+ u32 base = engine->mmio_base;
+ intel_wakeref_t wakeref;
+ int ret;
+
+ fw_domains = intel_uncore_forcewake_for_reg(uncore,
+ RING_TIMESTAMP(base),
+ FW_REG_READ);
+
+ with_intel_runtime_pm(uncore->rpm, wakeref) {
+ spin_lock_irq(>lock);
+ intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+ ret = __read_timestamps(uncore,
+ RING_TIMESTAMP(base),
+ RING_TIMESTAMP_UDW(base),
+ cs_ts,
+ 

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-28 Thread Jason Ekstrand
On Wed, Apr 28, 2021 at 3:14 PM Lionel Landwerlin
 wrote:
>
> On 28/04/2021 22:54, Jason Ekstrand wrote:
> > On Wed, Apr 28, 2021 at 2:50 PM Lionel Landwerlin
> >  wrote:
> >> On 28/04/2021 22:24, Jason Ekstrand wrote:
> >>
> >> On Wed, Apr 28, 2021 at 3:43 AM Jani Nikula  
> >> wrote:
> >>
> >> On Tue, 27 Apr 2021, Umesh Nerlige Ramappa 
> >>  wrote:
> >>
> >> Perf measurements rely on CPU and engine timestamps to correlate
> >> events of interest across these time domains. Current mechanisms get
> >> these timestamps separately and the calculated delta between these
> >> timestamps lack enough accuracy.
> >>
> >> To improve the accuracy of these time measurements to within a few us,
> >> add a query that returns the engine and cpu timestamps captured as
> >> close to each other as possible.
> >>
> >> Cc: dri-devel, Jason and Daniel for review.
> >>
> >> Thanks!
> >>
> >> v2: (Tvrtko)
> >> - document clock reference used
> >> - return cpu timestamp always
> >> - capture cpu time just before lower dword of cs timestamp
> >>
> >> v3: (Chris)
> >> - use uncore-rpm
> >> - use __query_cs_timestamp helper
> >>
> >> v4: (Lionel)
> >> - Kernel perf subsytem allows users to specify the clock id to be used
> >>in perf_event_open. This clock id is used by the perf subsystem to
> >>return the appropriate cpu timestamp in perf events. Similarly, let
> >>the user pass the clockid to this query so that cpu timestamp
> >>corresponds to the clock id requested.
> >>
> >> v5: (Tvrtko)
> >> - Use normal ktime accessors instead of fast versions
> >> - Add more uApi documentation
> >>
> >> v6: (Lionel)
> >> - Move switch out of spinlock
> >>
> >> v7: (Chris)
> >> - cs_timestamp is a misnomer, use cs_cycles instead
> >> - return the cs cycle frequency as well in the query
> >>
> >> v8:
> >> - Add platform and engine specific checks
> >>
> >> v9: (Lionel)
> >> - Return 2 cpu timestamps in the query - captured before and after the
> >>register read
> >>
> >> v10: (Chris)
> >> - Use local_clock() to measure time taken to read lower dword of
> >>register and return it to user.
> >>
> >> v11: (Jani)
> >> - IS_GEN deprecated. User GRAPHICS_VER instead.
> >>
> >> Signed-off-by: Umesh Nerlige Ramappa 
> >> ---
> >>   drivers/gpu/drm/i915/i915_query.c | 145 ++
> >>   include/uapi/drm/i915_drm.h   |  48 ++
> >>   2 files changed, 193 insertions(+)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_query.c 
> >> b/drivers/gpu/drm/i915/i915_query.c
> >> index fed337ad7b68..2594b93901ac 100644
> >> --- a/drivers/gpu/drm/i915/i915_query.c
> >> +++ b/drivers/gpu/drm/i915/i915_query.c
> >> @@ -6,6 +6,8 @@
> >>
> >>   #include 
> >>
> >> +#include "gt/intel_engine_pm.h"
> >> +#include "gt/intel_engine_user.h"
> >>   #include "i915_drv.h"
> >>   #include "i915_perf.h"
> >>   #include "i915_query.h"
> >> @@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
> >> *dev_priv,
> >>return total_length;
> >>   }
> >>
> >> +typedef u64 (*__ktime_func_t)(void);
> >> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> >> +{
> >> + /*
> >> +  * Use logic same as the perf subsystem to allow user to select the
> >> +  * reference clock id to be used for timestamps.
> >> +  */
> >> + switch (clk_id) {
> >> + case CLOCK_MONOTONIC:
> >> + return _get_ns;
> >> + case CLOCK_MONOTONIC_RAW:
> >> + return _get_raw_ns;
> >> + case CLOCK_REALTIME:
> >> + return _get_real_ns;
> >> + case CLOCK_BOOTTIME:
> >> + return _get_boottime_ns;
> >> + case CLOCK_TAI:
> >> + return _get_clocktai_ns;
> >> + default:
> >> + return NULL;
> >> + }
> >> +}
> >> +
> >> +static inline int
> >> +__read_timestamps(struct intel_uncore *uncore,
> >> +   i915_reg_t lower_reg,
> >> +   i915_reg_t upper_reg,
> >> +   u64 *cs_ts,
> >> +   u64 *cpu_ts,
> >> +   __ktime_func_t cpu_clock)
> >> +{
> >> + u32 upper, lower, old_upper, loop = 0;
> >> +
> >> + upper = intel_uncore_read_fw(uncore, upper_reg);
> >> + do {
> >> + cpu_ts[1] = local_clock();
> >> + cpu_ts[0] = cpu_clock();
> >> + lower = intel_uncore_read_fw(uncore, lower_reg);
> >> + cpu_ts[1] = local_clock() - cpu_ts[1];
> >> + old_upper = upper;
> >> + upper = intel_uncore_read_fw(uncore, upper_reg);
> >> + } while (upper != old_upper && loop++ < 2);
> >> +
> >> + *cs_ts = (u64)upper << 32 | lower;
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +static int
> >> +__query_cs_cycles(struct intel_engine_cs *engine,
> >> +   u64 *cs_ts, u64 *cpu_ts,
> >> +   __ktime_func_t cpu_clock)
> >> +{
> >> + struct intel_uncore *uncore = engine->uncore;
> >> + enum forcewake_domains fw_domains;
> >> + u32 base = engine->mmio_base;
> >> + 

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-28 Thread Lionel Landwerlin

On 28/04/2021 23:14, Lionel Landwerlin wrote:

On 28/04/2021 22:54, Jason Ekstrand wrote:

On Wed, Apr 28, 2021 at 2:50 PM Lionel Landwerlin
 wrote:

On 28/04/2021 22:24, Jason Ekstrand wrote:

On Wed, Apr 28, 2021 at 3:43 AM Jani Nikula 
 wrote:


On Tue, 27 Apr 2021, Umesh Nerlige Ramappa 
 wrote:


Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

Cc: dri-devel, Jason and Daniel for review.

Thanks!

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
   in perf_event_open. This clock id is used by the perf subsystem to
   return the appropriate cpu timestamp in perf events. Similarly, let
   the user pass the clockid to this query so that cpu timestamp
   corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
   register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
   register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

Signed-off-by: Umesh Nerlige Ramappa 
---
  drivers/gpu/drm/i915/i915_query.c | 145 
++

  include/uapi/drm/i915_drm.h   |  48 ++
  2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c

index fed337ad7b68..2594b93901ac 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@

  #include 

+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
  #include "i915_drv.h"
  #include "i915_perf.h"
  #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct 
drm_i915_private *dev_priv,

   return total_length;
  }

+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+ /*
+  * Use logic same as the perf subsystem to allow user to 
select the

+  * reference clock id to be used for timestamps.
+  */
+ switch (clk_id) {
+ case CLOCK_MONOTONIC:
+ return _get_ns;
+ case CLOCK_MONOTONIC_RAW:
+ return _get_raw_ns;
+ case CLOCK_REALTIME:
+ return _get_real_ns;
+ case CLOCK_BOOTTIME:
+ return _get_boottime_ns;
+ case CLOCK_TAI:
+ return _get_clocktai_ns;
+ default:
+ return NULL;
+ }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+   i915_reg_t lower_reg,
+   i915_reg_t upper_reg,
+   u64 *cs_ts,
+   u64 *cpu_ts,
+   __ktime_func_t cpu_clock)
+{
+ u32 upper, lower, old_upper, loop = 0;
+
+ upper = intel_uncore_read_fw(uncore, upper_reg);
+ do {
+ cpu_ts[1] = local_clock();
+ cpu_ts[0] = cpu_clock();
+ lower = intel_uncore_read_fw(uncore, lower_reg);
+ cpu_ts[1] = local_clock() - cpu_ts[1];
+ old_upper = upper;
+ upper = intel_uncore_read_fw(uncore, upper_reg);
+ } while (upper != old_upper && loop++ < 2);
+
+ *cs_ts = (u64)upper << 32 | lower;
+
+ return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+   u64 *cs_ts, u64 *cpu_ts,
+   __ktime_func_t cpu_clock)
+{
+ struct intel_uncore *uncore = engine->uncore;
+ enum forcewake_domains fw_domains;
+ u32 base = engine->mmio_base;
+ intel_wakeref_t wakeref;
+ int ret;
+
+ fw_domains = intel_uncore_forcewake_for_reg(uncore,
+ RING_TIMESTAMP(base),
+ FW_REG_READ);
+
+ with_intel_runtime_pm(uncore->rpm, wakeref) {
+ spin_lock_irq(>lock);
+ intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+ ret = __read_timestamps(uncore,
+ RING_TIMESTAMP(base),
+ RING_TIMESTAMP_UDW(base),
+ cs_ts,
+ cpu_ts,
+ cpu_clock);
+
+ intel_uncore_forcewake_put__locked(uncore, fw_domains);
+ spin_unlock_irq(>lock);
+ }
+
+ return ret;
+}

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-28 Thread Lionel Landwerlin

On 28/04/2021 22:54, Jason Ekstrand wrote:

On Wed, Apr 28, 2021 at 2:50 PM Lionel Landwerlin
 wrote:

On 28/04/2021 22:24, Jason Ekstrand wrote:

On Wed, Apr 28, 2021 at 3:43 AM Jani Nikula  wrote:

On Tue, 27 Apr 2021, Umesh Nerlige Ramappa  
wrote:

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

Cc: dri-devel, Jason and Daniel for review.

Thanks!

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
   in perf_event_open. This clock id is used by the perf subsystem to
   return the appropriate cpu timestamp in perf events. Similarly, let
   the user pass the clockid to this query so that cpu timestamp
   corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
   register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
   register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

Signed-off-by: Umesh Nerlige Ramappa 
---
  drivers/gpu/drm/i915/i915_query.c | 145 ++
  include/uapi/drm/i915_drm.h   |  48 ++
  2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..2594b93901ac 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@

  #include 

+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
  #include "i915_drv.h"
  #include "i915_perf.h"
  #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
   return total_length;
  }

+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+ /*
+  * Use logic same as the perf subsystem to allow user to select the
+  * reference clock id to be used for timestamps.
+  */
+ switch (clk_id) {
+ case CLOCK_MONOTONIC:
+ return _get_ns;
+ case CLOCK_MONOTONIC_RAW:
+ return _get_raw_ns;
+ case CLOCK_REALTIME:
+ return _get_real_ns;
+ case CLOCK_BOOTTIME:
+ return _get_boottime_ns;
+ case CLOCK_TAI:
+ return _get_clocktai_ns;
+ default:
+ return NULL;
+ }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+   i915_reg_t lower_reg,
+   i915_reg_t upper_reg,
+   u64 *cs_ts,
+   u64 *cpu_ts,
+   __ktime_func_t cpu_clock)
+{
+ u32 upper, lower, old_upper, loop = 0;
+
+ upper = intel_uncore_read_fw(uncore, upper_reg);
+ do {
+ cpu_ts[1] = local_clock();
+ cpu_ts[0] = cpu_clock();
+ lower = intel_uncore_read_fw(uncore, lower_reg);
+ cpu_ts[1] = local_clock() - cpu_ts[1];
+ old_upper = upper;
+ upper = intel_uncore_read_fw(uncore, upper_reg);
+ } while (upper != old_upper && loop++ < 2);
+
+ *cs_ts = (u64)upper << 32 | lower;
+
+ return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+   u64 *cs_ts, u64 *cpu_ts,
+   __ktime_func_t cpu_clock)
+{
+ struct intel_uncore *uncore = engine->uncore;
+ enum forcewake_domains fw_domains;
+ u32 base = engine->mmio_base;
+ intel_wakeref_t wakeref;
+ int ret;
+
+ fw_domains = intel_uncore_forcewake_for_reg(uncore,
+ RING_TIMESTAMP(base),
+ FW_REG_READ);
+
+ with_intel_runtime_pm(uncore->rpm, wakeref) {
+ spin_lock_irq(>lock);
+ intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+ ret = __read_timestamps(uncore,
+ RING_TIMESTAMP(base),
+ RING_TIMESTAMP_UDW(base),
+ cs_ts,
+ cpu_ts,
+ cpu_clock);
+
+ intel_uncore_forcewake_put__locked(uncore, 

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-28 Thread Jason Ekstrand
On Wed, Apr 28, 2021 at 2:50 PM Lionel Landwerlin
 wrote:
>
> On 28/04/2021 22:24, Jason Ekstrand wrote:
>
> On Wed, Apr 28, 2021 at 3:43 AM Jani Nikula  
> wrote:
>
> On Tue, 27 Apr 2021, Umesh Nerlige Ramappa  
> wrote:
>
> Perf measurements rely on CPU and engine timestamps to correlate
> events of interest across these time domains. Current mechanisms get
> these timestamps separately and the calculated delta between these
> timestamps lack enough accuracy.
>
> To improve the accuracy of these time measurements to within a few us,
> add a query that returns the engine and cpu timestamps captured as
> close to each other as possible.
>
> Cc: dri-devel, Jason and Daniel for review.
>
> Thanks!
>
> v2: (Tvrtko)
> - document clock reference used
> - return cpu timestamp always
> - capture cpu time just before lower dword of cs timestamp
>
> v3: (Chris)
> - use uncore-rpm
> - use __query_cs_timestamp helper
>
> v4: (Lionel)
> - Kernel perf subsytem allows users to specify the clock id to be used
>   in perf_event_open. This clock id is used by the perf subsystem to
>   return the appropriate cpu timestamp in perf events. Similarly, let
>   the user pass the clockid to this query so that cpu timestamp
>   corresponds to the clock id requested.
>
> v5: (Tvrtko)
> - Use normal ktime accessors instead of fast versions
> - Add more uApi documentation
>
> v6: (Lionel)
> - Move switch out of spinlock
>
> v7: (Chris)
> - cs_timestamp is a misnomer, use cs_cycles instead
> - return the cs cycle frequency as well in the query
>
> v8:
> - Add platform and engine specific checks
>
> v9: (Lionel)
> - Return 2 cpu timestamps in the query - captured before and after the
>   register read
>
> v10: (Chris)
> - Use local_clock() to measure time taken to read lower dword of
>   register and return it to user.
>
> v11: (Jani)
> - IS_GEN deprecated. User GRAPHICS_VER instead.
>
> Signed-off-by: Umesh Nerlige Ramappa 
> ---
>  drivers/gpu/drm/i915/i915_query.c | 145 ++
>  include/uapi/drm/i915_drm.h   |  48 ++
>  2 files changed, 193 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_query.c 
> b/drivers/gpu/drm/i915/i915_query.c
> index fed337ad7b68..2594b93901ac 100644
> --- a/drivers/gpu/drm/i915/i915_query.c
> +++ b/drivers/gpu/drm/i915/i915_query.c
> @@ -6,6 +6,8 @@
>
>  #include 
>
> +#include "gt/intel_engine_pm.h"
> +#include "gt/intel_engine_user.h"
>  #include "i915_drv.h"
>  #include "i915_perf.h"
>  #include "i915_query.h"
> @@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
> *dev_priv,
>   return total_length;
>  }
>
> +typedef u64 (*__ktime_func_t)(void);
> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> +{
> + /*
> +  * Use logic same as the perf subsystem to allow user to select the
> +  * reference clock id to be used for timestamps.
> +  */
> + switch (clk_id) {
> + case CLOCK_MONOTONIC:
> + return _get_ns;
> + case CLOCK_MONOTONIC_RAW:
> + return _get_raw_ns;
> + case CLOCK_REALTIME:
> + return _get_real_ns;
> + case CLOCK_BOOTTIME:
> + return _get_boottime_ns;
> + case CLOCK_TAI:
> + return _get_clocktai_ns;
> + default:
> + return NULL;
> + }
> +}
> +
> +static inline int
> +__read_timestamps(struct intel_uncore *uncore,
> +   i915_reg_t lower_reg,
> +   i915_reg_t upper_reg,
> +   u64 *cs_ts,
> +   u64 *cpu_ts,
> +   __ktime_func_t cpu_clock)
> +{
> + u32 upper, lower, old_upper, loop = 0;
> +
> + upper = intel_uncore_read_fw(uncore, upper_reg);
> + do {
> + cpu_ts[1] = local_clock();
> + cpu_ts[0] = cpu_clock();
> + lower = intel_uncore_read_fw(uncore, lower_reg);
> + cpu_ts[1] = local_clock() - cpu_ts[1];
> + old_upper = upper;
> + upper = intel_uncore_read_fw(uncore, upper_reg);
> + } while (upper != old_upper && loop++ < 2);
> +
> + *cs_ts = (u64)upper << 32 | lower;
> +
> + return 0;
> +}
> +
> +static int
> +__query_cs_cycles(struct intel_engine_cs *engine,
> +   u64 *cs_ts, u64 *cpu_ts,
> +   __ktime_func_t cpu_clock)
> +{
> + struct intel_uncore *uncore = engine->uncore;
> + enum forcewake_domains fw_domains;
> + u32 base = engine->mmio_base;
> + intel_wakeref_t wakeref;
> + int ret;
> +
> + fw_domains = intel_uncore_forcewake_for_reg(uncore,
> + RING_TIMESTAMP(base),
> + FW_REG_READ);
> +
> + with_intel_runtime_pm(uncore->rpm, wakeref) {
> + spin_lock_irq(>lock);
> + intel_uncore_forcewake_get__locked(uncore, fw_domains);
> +
> + ret = __read_timestamps(uncore,
> + RING_TIMESTAMP(base),
> +  

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-28 Thread Lionel Landwerlin

On 28/04/2021 22:24, Jason Ekstrand wrote:

On Wed, Apr 28, 2021 at 3:43 AM Jani Nikula  wrote:

On Tue, 27 Apr 2021, Umesh Nerlige Ramappa  
wrote:

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

Cc: dri-devel, Jason and Daniel for review.

Thanks!


v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
   in perf_event_open. This clock id is used by the perf subsystem to
   return the appropriate cpu timestamp in perf events. Similarly, let
   the user pass the clockid to this query so that cpu timestamp
   corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
   register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
   register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

Signed-off-by: Umesh Nerlige Ramappa 
---
  drivers/gpu/drm/i915/i915_query.c | 145 ++
  include/uapi/drm/i915_drm.h   |  48 ++
  2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..2594b93901ac 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@

  #include 

+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
  #include "i915_drv.h"
  #include "i915_perf.h"
  #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
   return total_length;
  }

+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+ /*
+  * Use logic same as the perf subsystem to allow user to select the
+  * reference clock id to be used for timestamps.
+  */
+ switch (clk_id) {
+ case CLOCK_MONOTONIC:
+ return _get_ns;
+ case CLOCK_MONOTONIC_RAW:
+ return _get_raw_ns;
+ case CLOCK_REALTIME:
+ return _get_real_ns;
+ case CLOCK_BOOTTIME:
+ return _get_boottime_ns;
+ case CLOCK_TAI:
+ return _get_clocktai_ns;
+ default:
+ return NULL;
+ }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+   i915_reg_t lower_reg,
+   i915_reg_t upper_reg,
+   u64 *cs_ts,
+   u64 *cpu_ts,
+   __ktime_func_t cpu_clock)
+{
+ u32 upper, lower, old_upper, loop = 0;
+
+ upper = intel_uncore_read_fw(uncore, upper_reg);
+ do {
+ cpu_ts[1] = local_clock();
+ cpu_ts[0] = cpu_clock();
+ lower = intel_uncore_read_fw(uncore, lower_reg);
+ cpu_ts[1] = local_clock() - cpu_ts[1];
+ old_upper = upper;
+ upper = intel_uncore_read_fw(uncore, upper_reg);
+ } while (upper != old_upper && loop++ < 2);
+
+ *cs_ts = (u64)upper << 32 | lower;
+
+ return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+   u64 *cs_ts, u64 *cpu_ts,
+   __ktime_func_t cpu_clock)
+{
+ struct intel_uncore *uncore = engine->uncore;
+ enum forcewake_domains fw_domains;
+ u32 base = engine->mmio_base;
+ intel_wakeref_t wakeref;
+ int ret;
+
+ fw_domains = intel_uncore_forcewake_for_reg(uncore,
+ RING_TIMESTAMP(base),
+ FW_REG_READ);
+
+ with_intel_runtime_pm(uncore->rpm, wakeref) {
+ spin_lock_irq(>lock);
+ intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+ ret = __read_timestamps(uncore,
+ RING_TIMESTAMP(base),
+ RING_TIMESTAMP_UDW(base),
+ cs_ts,
+ cpu_ts,
+ cpu_clock);
+
+ intel_uncore_forcewake_put__locked(uncore, fw_domains);
+ spin_unlock_irq(>lock);
+ }
+
+ return ret;
+}
+
+static int

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-28 Thread Jason Ekstrand
On Wed, Apr 28, 2021 at 3:43 AM Jani Nikula  wrote:
>
> On Tue, 27 Apr 2021, Umesh Nerlige Ramappa  
> wrote:
> > Perf measurements rely on CPU and engine timestamps to correlate
> > events of interest across these time domains. Current mechanisms get
> > these timestamps separately and the calculated delta between these
> > timestamps lack enough accuracy.
> >
> > To improve the accuracy of these time measurements to within a few us,
> > add a query that returns the engine and cpu timestamps captured as
> > close to each other as possible.
>
> Cc: dri-devel, Jason and Daniel for review.

Thanks!

> >
> > v2: (Tvrtko)
> > - document clock reference used
> > - return cpu timestamp always
> > - capture cpu time just before lower dword of cs timestamp
> >
> > v3: (Chris)
> > - use uncore-rpm
> > - use __query_cs_timestamp helper
> >
> > v4: (Lionel)
> > - Kernel perf subsytem allows users to specify the clock id to be used
> >   in perf_event_open. This clock id is used by the perf subsystem to
> >   return the appropriate cpu timestamp in perf events. Similarly, let
> >   the user pass the clockid to this query so that cpu timestamp
> >   corresponds to the clock id requested.
> >
> > v5: (Tvrtko)
> > - Use normal ktime accessors instead of fast versions
> > - Add more uApi documentation
> >
> > v6: (Lionel)
> > - Move switch out of spinlock
> >
> > v7: (Chris)
> > - cs_timestamp is a misnomer, use cs_cycles instead
> > - return the cs cycle frequency as well in the query
> >
> > v8:
> > - Add platform and engine specific checks
> >
> > v9: (Lionel)
> > - Return 2 cpu timestamps in the query - captured before and after the
> >   register read
> >
> > v10: (Chris)
> > - Use local_clock() to measure time taken to read lower dword of
> >   register and return it to user.
> >
> > v11: (Jani)
> > - IS_GEN deprecated. User GRAPHICS_VER instead.
> >
> > Signed-off-by: Umesh Nerlige Ramappa 
> > ---
> >  drivers/gpu/drm/i915/i915_query.c | 145 ++
> >  include/uapi/drm/i915_drm.h   |  48 ++
> >  2 files changed, 193 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_query.c 
> > b/drivers/gpu/drm/i915/i915_query.c
> > index fed337ad7b68..2594b93901ac 100644
> > --- a/drivers/gpu/drm/i915/i915_query.c
> > +++ b/drivers/gpu/drm/i915/i915_query.c
> > @@ -6,6 +6,8 @@
> >
> >  #include 
> >
> > +#include "gt/intel_engine_pm.h"
> > +#include "gt/intel_engine_user.h"
> >  #include "i915_drv.h"
> >  #include "i915_perf.h"
> >  #include "i915_query.h"
> > @@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
> > *dev_priv,
> >   return total_length;
> >  }
> >
> > +typedef u64 (*__ktime_func_t)(void);
> > +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> > +{
> > + /*
> > +  * Use logic same as the perf subsystem to allow user to select the
> > +  * reference clock id to be used for timestamps.
> > +  */
> > + switch (clk_id) {
> > + case CLOCK_MONOTONIC:
> > + return _get_ns;
> > + case CLOCK_MONOTONIC_RAW:
> > + return _get_raw_ns;
> > + case CLOCK_REALTIME:
> > + return _get_real_ns;
> > + case CLOCK_BOOTTIME:
> > + return _get_boottime_ns;
> > + case CLOCK_TAI:
> > + return _get_clocktai_ns;
> > + default:
> > + return NULL;
> > + }
> > +}
> > +
> > +static inline int
> > +__read_timestamps(struct intel_uncore *uncore,
> > +   i915_reg_t lower_reg,
> > +   i915_reg_t upper_reg,
> > +   u64 *cs_ts,
> > +   u64 *cpu_ts,
> > +   __ktime_func_t cpu_clock)
> > +{
> > + u32 upper, lower, old_upper, loop = 0;
> > +
> > + upper = intel_uncore_read_fw(uncore, upper_reg);
> > + do {
> > + cpu_ts[1] = local_clock();
> > + cpu_ts[0] = cpu_clock();
> > + lower = intel_uncore_read_fw(uncore, lower_reg);
> > + cpu_ts[1] = local_clock() - cpu_ts[1];
> > + old_upper = upper;
> > + upper = intel_uncore_read_fw(uncore, upper_reg);
> > + } while (upper != old_upper && loop++ < 2);
> > +
> > + *cs_ts = (u64)upper << 32 | lower;
> > +
> > + return 0;
> > +}
> > +
> > +static int
> > +__query_cs_cycles(struct intel_engine_cs *engine,
> > +   u64 *cs_ts, u64 *cpu_ts,
> > +   __ktime_func_t cpu_clock)
> > +{
> > + struct intel_uncore *uncore = engine->uncore;
> > + enum forcewake_domains fw_domains;
> > + u32 base = engine->mmio_base;
> > + intel_wakeref_t wakeref;
> > + int ret;
> > +
> > + fw_domains = intel_uncore_forcewake_for_reg(uncore,
> > + RING_TIMESTAMP(base),
> > + FW_REG_READ);
> > +
> > + with_intel_runtime_pm(uncore->rpm, wakeref) {
> > + spin_lock_irq(>lock);
> > + 

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-28 Thread Jani Nikula
On Tue, 27 Apr 2021, Umesh Nerlige Ramappa  
wrote:
> Perf measurements rely on CPU and engine timestamps to correlate
> events of interest across these time domains. Current mechanisms get
> these timestamps separately and the calculated delta between these
> timestamps lack enough accuracy.
>
> To improve the accuracy of these time measurements to within a few us,
> add a query that returns the engine and cpu timestamps captured as
> close to each other as possible.

Cc: dri-devel, Jason and Daniel for review.

>
> v2: (Tvrtko)
> - document clock reference used
> - return cpu timestamp always
> - capture cpu time just before lower dword of cs timestamp
>
> v3: (Chris)
> - use uncore-rpm
> - use __query_cs_timestamp helper
>
> v4: (Lionel)
> - Kernel perf subsytem allows users to specify the clock id to be used
>   in perf_event_open. This clock id is used by the perf subsystem to
>   return the appropriate cpu timestamp in perf events. Similarly, let
>   the user pass the clockid to this query so that cpu timestamp
>   corresponds to the clock id requested.
>
> v5: (Tvrtko)
> - Use normal ktime accessors instead of fast versions
> - Add more uApi documentation
>
> v6: (Lionel)
> - Move switch out of spinlock
>
> v7: (Chris)
> - cs_timestamp is a misnomer, use cs_cycles instead
> - return the cs cycle frequency as well in the query
>
> v8:
> - Add platform and engine specific checks
>
> v9: (Lionel)
> - Return 2 cpu timestamps in the query - captured before and after the
>   register read
>
> v10: (Chris)
> - Use local_clock() to measure time taken to read lower dword of
>   register and return it to user.
>
> v11: (Jani)
> - IS_GEN deprecated. User GRAPHICS_VER instead.
>
> Signed-off-by: Umesh Nerlige Ramappa 
> ---
>  drivers/gpu/drm/i915/i915_query.c | 145 ++
>  include/uapi/drm/i915_drm.h   |  48 ++
>  2 files changed, 193 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_query.c 
> b/drivers/gpu/drm/i915/i915_query.c
> index fed337ad7b68..2594b93901ac 100644
> --- a/drivers/gpu/drm/i915/i915_query.c
> +++ b/drivers/gpu/drm/i915/i915_query.c
> @@ -6,6 +6,8 @@
>  
>  #include 
>  
> +#include "gt/intel_engine_pm.h"
> +#include "gt/intel_engine_user.h"
>  #include "i915_drv.h"
>  #include "i915_perf.h"
>  #include "i915_query.h"
> @@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
> *dev_priv,
>   return total_length;
>  }
>  
> +typedef u64 (*__ktime_func_t)(void);
> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> +{
> + /*
> +  * Use logic same as the perf subsystem to allow user to select the
> +  * reference clock id to be used for timestamps.
> +  */
> + switch (clk_id) {
> + case CLOCK_MONOTONIC:
> + return _get_ns;
> + case CLOCK_MONOTONIC_RAW:
> + return _get_raw_ns;
> + case CLOCK_REALTIME:
> + return _get_real_ns;
> + case CLOCK_BOOTTIME:
> + return _get_boottime_ns;
> + case CLOCK_TAI:
> + return _get_clocktai_ns;
> + default:
> + return NULL;
> + }
> +}
> +
> +static inline int
> +__read_timestamps(struct intel_uncore *uncore,
> +   i915_reg_t lower_reg,
> +   i915_reg_t upper_reg,
> +   u64 *cs_ts,
> +   u64 *cpu_ts,
> +   __ktime_func_t cpu_clock)
> +{
> + u32 upper, lower, old_upper, loop = 0;
> +
> + upper = intel_uncore_read_fw(uncore, upper_reg);
> + do {
> + cpu_ts[1] = local_clock();
> + cpu_ts[0] = cpu_clock();
> + lower = intel_uncore_read_fw(uncore, lower_reg);
> + cpu_ts[1] = local_clock() - cpu_ts[1];
> + old_upper = upper;
> + upper = intel_uncore_read_fw(uncore, upper_reg);
> + } while (upper != old_upper && loop++ < 2);
> +
> + *cs_ts = (u64)upper << 32 | lower;
> +
> + return 0;
> +}
> +
> +static int
> +__query_cs_cycles(struct intel_engine_cs *engine,
> +   u64 *cs_ts, u64 *cpu_ts,
> +   __ktime_func_t cpu_clock)
> +{
> + struct intel_uncore *uncore = engine->uncore;
> + enum forcewake_domains fw_domains;
> + u32 base = engine->mmio_base;
> + intel_wakeref_t wakeref;
> + int ret;
> +
> + fw_domains = intel_uncore_forcewake_for_reg(uncore,
> + RING_TIMESTAMP(base),
> + FW_REG_READ);
> +
> + with_intel_runtime_pm(uncore->rpm, wakeref) {
> + spin_lock_irq(>lock);
> + intel_uncore_forcewake_get__locked(uncore, fw_domains);
> +
> + ret = __read_timestamps(uncore,
> + RING_TIMESTAMP(base),
> + RING_TIMESTAMP_UDW(base),
> + cs_ts,
> + cpu_ts,
> + cpu_clock);
> +

[Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-27 Thread Umesh Nerlige Ramappa
Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
  register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Lionel Landwerlin 
---
 drivers/gpu/drm/i915/i915_query.c | 145 ++
 include/uapi/drm/i915_drm.h   |  48 ++
 2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..2594b93901ac 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 
 #include 
 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+   /*
+* Use logic same as the perf subsystem to allow user to select the
+* reference clock id to be used for timestamps.
+*/
+   switch (clk_id) {
+   case CLOCK_MONOTONIC:
+   return _get_ns;
+   case CLOCK_MONOTONIC_RAW:
+   return _get_raw_ns;
+   case CLOCK_REALTIME:
+   return _get_real_ns;
+   case CLOCK_BOOTTIME:
+   return _get_boottime_ns;
+   case CLOCK_TAI:
+   return _get_clocktai_ns;
+   default:
+   return NULL;
+   }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+ i915_reg_t lower_reg,
+ i915_reg_t upper_reg,
+ u64 *cs_ts,
+ u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   u32 upper, lower, old_upper, loop = 0;
+
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   do {
+   cpu_ts[1] = local_clock();
+   cpu_ts[0] = cpu_clock();
+   lower = intel_uncore_read_fw(uncore, lower_reg);
+   cpu_ts[1] = local_clock() - cpu_ts[1];
+   old_upper = upper;
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   } while (upper != old_upper && loop++ < 2);
+
+   *cs_ts = (u64)upper << 32 | lower;
+
+   return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+ u64 *cs_ts, u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   struct intel_uncore *uncore = engine->uncore;
+   enum forcewake_domains fw_domains;
+   u32 base = engine->mmio_base;
+   intel_wakeref_t wakeref;
+   int ret;
+
+   fw_domains = intel_uncore_forcewake_for_reg(uncore,
+   RING_TIMESTAMP(base),
+   FW_REG_READ);
+
+   with_intel_runtime_pm(uncore->rpm, wakeref) {
+   spin_lock_irq(>lock);
+   intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+   ret = __read_timestamps(uncore,
+   RING_TIMESTAMP(base),
+   RING_TIMESTAMP_UDW(base),
+   cs_ts,
+   cpu_ts,
+   cpu_clock);
+
+   intel_uncore_forcewake_put__locked(uncore, fw_domains);
+   spin_unlock_irq(>lock);
+   }
+
+   return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+   struct 

[Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-27 Thread Umesh Nerlige Ramappa
Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
  register and return it to user.

v11: (Jani)
- IS_GEN deprecated. User GRAPHICS_VER instead.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_query.c | 145 ++
 include/uapi/drm/i915_drm.h   |  48 ++
 2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..2594b93901ac 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 
 #include 
 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+   /*
+* Use logic same as the perf subsystem to allow user to select the
+* reference clock id to be used for timestamps.
+*/
+   switch (clk_id) {
+   case CLOCK_MONOTONIC:
+   return _get_ns;
+   case CLOCK_MONOTONIC_RAW:
+   return _get_raw_ns;
+   case CLOCK_REALTIME:
+   return _get_real_ns;
+   case CLOCK_BOOTTIME:
+   return _get_boottime_ns;
+   case CLOCK_TAI:
+   return _get_clocktai_ns;
+   default:
+   return NULL;
+   }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+ i915_reg_t lower_reg,
+ i915_reg_t upper_reg,
+ u64 *cs_ts,
+ u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   u32 upper, lower, old_upper, loop = 0;
+
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   do {
+   cpu_ts[1] = local_clock();
+   cpu_ts[0] = cpu_clock();
+   lower = intel_uncore_read_fw(uncore, lower_reg);
+   cpu_ts[1] = local_clock() - cpu_ts[1];
+   old_upper = upper;
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   } while (upper != old_upper && loop++ < 2);
+
+   *cs_ts = (u64)upper << 32 | lower;
+
+   return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+ u64 *cs_ts, u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   struct intel_uncore *uncore = engine->uncore;
+   enum forcewake_domains fw_domains;
+   u32 base = engine->mmio_base;
+   intel_wakeref_t wakeref;
+   int ret;
+
+   fw_domains = intel_uncore_forcewake_for_reg(uncore,
+   RING_TIMESTAMP(base),
+   FW_REG_READ);
+
+   with_intel_runtime_pm(uncore->rpm, wakeref) {
+   spin_lock_irq(>lock);
+   intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+   ret = __read_timestamps(uncore,
+   RING_TIMESTAMP(base),
+   RING_TIMESTAMP_UDW(base),
+   cs_ts,
+   cpu_ts,
+   cpu_clock);
+
+   intel_uncore_forcewake_put__locked(uncore, fw_domains);
+   spin_unlock_irq(>lock);
+   }
+
+   return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+   struct drm_i915_query_item *query_item)
+{
+

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-26 Thread Jani Nikula
On Fri, 23 Apr 2021, Lionel Landwerlin  wrote:
> On 21/04/2021 20:28, Umesh Nerlige Ramappa wrote:
>> +static int
>> +query_cs_cycles(struct drm_i915_private *i915,
>> +struct drm_i915_query_item *query_item)
>> +{
>> +struct drm_i915_query_cs_cycles __user *query_ptr;
>> +struct drm_i915_query_cs_cycles query;
>> +struct intel_engine_cs *engine;
>> +__ktime_func_t cpu_clock;
>> +int ret;
>> +
>> +if (INTEL_GEN(i915) < 6)
>> +return -ENODEV;
>> +

[...]

>> +
>> +if (IS_GEN(i915, 6) &&
>> +query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
>> +return -ENODEV;
>
>
> Thanks a bunch for rebasing this.
>
> My only comment on this patch would be : don't we want 
> IS_GEN_RANGE(i915, 1, 6) instead of IS_GEN(i915, 6) ?

Please see the new deprecation comments in i915_drv.h. We're moving from
GEN to VER. In short, please use the new VER macros for individual
components instead of the generic GEN.

Thanks,
Jani.


-- 
Jani Nikula, Intel Open Source Graphics Center
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-23 Thread Lionel Landwerlin

On 23/04/2021 18:11, Umesh Nerlige Ramappa wrote:

On Fri, Apr 23, 2021 at 10:05:34AM +0300, Lionel Landwerlin wrote:

On 21/04/2021 20:28, Umesh Nerlige Ramappa wrote:

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
  register and return it to user.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_query.c | 145 ++
 include/uapi/drm/i915_drm.h   |  48 ++
 2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c

index fed337ad7b68..25b96927ab92 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 #include 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct 
drm_i915_private *dev_priv,

 return total_length;
 }
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+    /*
+ * Use logic same as the perf subsystem to allow user to select 
the

+ * reference clock id to be used for timestamps.
+ */
+    switch (clk_id) {
+    case CLOCK_MONOTONIC:
+    return _get_ns;
+    case CLOCK_MONOTONIC_RAW:
+    return _get_raw_ns;
+    case CLOCK_REALTIME:
+    return _get_real_ns;
+    case CLOCK_BOOTTIME:
+    return _get_boottime_ns;
+    case CLOCK_TAI:
+    return _get_clocktai_ns;
+    default:
+    return NULL;
+    }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+  i915_reg_t lower_reg,
+  i915_reg_t upper_reg,
+  u64 *cs_ts,
+  u64 *cpu_ts,
+  __ktime_func_t cpu_clock)
+{
+    u32 upper, lower, old_upper, loop = 0;
+
+    upper = intel_uncore_read_fw(uncore, upper_reg);
+    do {
+    cpu_ts[1] = local_clock();
+    cpu_ts[0] = cpu_clock();
+    lower = intel_uncore_read_fw(uncore, lower_reg);
+    cpu_ts[1] = local_clock() - cpu_ts[1];
+    old_upper = upper;
+    upper = intel_uncore_read_fw(uncore, upper_reg);
+    } while (upper != old_upper && loop++ < 2);
+
+    *cs_ts = (u64)upper << 32 | lower;
+
+    return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+  u64 *cs_ts, u64 *cpu_ts,
+  __ktime_func_t cpu_clock)
+{
+    struct intel_uncore *uncore = engine->uncore;
+    enum forcewake_domains fw_domains;
+    u32 base = engine->mmio_base;
+    intel_wakeref_t wakeref;
+    int ret;
+
+    fw_domains = intel_uncore_forcewake_for_reg(uncore,
+    RING_TIMESTAMP(base),
+    FW_REG_READ);
+
+    with_intel_runtime_pm(uncore->rpm, wakeref) {
+    spin_lock_irq(>lock);
+    intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+    ret = __read_timestamps(uncore,
+    RING_TIMESTAMP(base),
+    RING_TIMESTAMP_UDW(base),
+    cs_ts,
+    cpu_ts,
+    cpu_clock);
+
+    intel_uncore_forcewake_put__locked(uncore, fw_domains);
+    spin_unlock_irq(>lock);
+    }
+
+    return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+    struct drm_i915_query_item *query_item)
+{
+    struct drm_i915_query_cs_cycles __user *query_ptr;
+    struct drm_i915_query_cs_cycles query;
+    struct intel_engine_cs *engine;
+    __ktime_func_t cpu_clock;
+    int ret;
+
+    if (INTEL_GEN(i915) < 6)
+    return -ENODEV;


Less than gen6 is handled here early on.


+
+    

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-23 Thread Umesh Nerlige Ramappa

On Fri, Apr 23, 2021 at 10:05:34AM +0300, Lionel Landwerlin wrote:

On 21/04/2021 20:28, Umesh Nerlige Ramappa wrote:

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
  register and return it to user.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_query.c | 145 ++
 include/uapi/drm/i915_drm.h   |  48 ++
 2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..25b96927ab92 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 #include 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
 }
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+   /*
+* Use logic same as the perf subsystem to allow user to select the
+* reference clock id to be used for timestamps.
+*/
+   switch (clk_id) {
+   case CLOCK_MONOTONIC:
+   return _get_ns;
+   case CLOCK_MONOTONIC_RAW:
+   return _get_raw_ns;
+   case CLOCK_REALTIME:
+   return _get_real_ns;
+   case CLOCK_BOOTTIME:
+   return _get_boottime_ns;
+   case CLOCK_TAI:
+   return _get_clocktai_ns;
+   default:
+   return NULL;
+   }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+ i915_reg_t lower_reg,
+ i915_reg_t upper_reg,
+ u64 *cs_ts,
+ u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   u32 upper, lower, old_upper, loop = 0;
+
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   do {
+   cpu_ts[1] = local_clock();
+   cpu_ts[0] = cpu_clock();
+   lower = intel_uncore_read_fw(uncore, lower_reg);
+   cpu_ts[1] = local_clock() - cpu_ts[1];
+   old_upper = upper;
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   } while (upper != old_upper && loop++ < 2);
+
+   *cs_ts = (u64)upper << 32 | lower;
+
+   return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+ u64 *cs_ts, u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   struct intel_uncore *uncore = engine->uncore;
+   enum forcewake_domains fw_domains;
+   u32 base = engine->mmio_base;
+   intel_wakeref_t wakeref;
+   int ret;
+
+   fw_domains = intel_uncore_forcewake_for_reg(uncore,
+   RING_TIMESTAMP(base),
+   FW_REG_READ);
+
+   with_intel_runtime_pm(uncore->rpm, wakeref) {
+   spin_lock_irq(>lock);
+   intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+   ret = __read_timestamps(uncore,
+   RING_TIMESTAMP(base),
+   RING_TIMESTAMP_UDW(base),
+   cs_ts,
+   cpu_ts,
+   cpu_clock);
+
+   intel_uncore_forcewake_put__locked(uncore, fw_domains);
+   spin_unlock_irq(>lock);
+   }
+
+   return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+  

Re: [Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-23 Thread Lionel Landwerlin

On 21/04/2021 20:28, Umesh Nerlige Ramappa wrote:

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
   in perf_event_open. This clock id is used by the perf subsystem to
   return the appropriate cpu timestamp in perf events. Similarly, let
   the user pass the clockid to this query so that cpu timestamp
   corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
   register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
   register and return it to user.

Signed-off-by: Umesh Nerlige Ramappa 
---
  drivers/gpu/drm/i915/i915_query.c | 145 ++
  include/uapi/drm/i915_drm.h   |  48 ++
  2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..25b96927ab92 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
  
  #include 
  
+#include "gt/intel_engine_pm.h"

+#include "gt/intel_engine_user.h"
  #include "i915_drv.h"
  #include "i915_perf.h"
  #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
  }
  
+typedef u64 (*__ktime_func_t)(void);

+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+   /*
+* Use logic same as the perf subsystem to allow user to select the
+* reference clock id to be used for timestamps.
+*/
+   switch (clk_id) {
+   case CLOCK_MONOTONIC:
+   return _get_ns;
+   case CLOCK_MONOTONIC_RAW:
+   return _get_raw_ns;
+   case CLOCK_REALTIME:
+   return _get_real_ns;
+   case CLOCK_BOOTTIME:
+   return _get_boottime_ns;
+   case CLOCK_TAI:
+   return _get_clocktai_ns;
+   default:
+   return NULL;
+   }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+ i915_reg_t lower_reg,
+ i915_reg_t upper_reg,
+ u64 *cs_ts,
+ u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   u32 upper, lower, old_upper, loop = 0;
+
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   do {
+   cpu_ts[1] = local_clock();
+   cpu_ts[0] = cpu_clock();
+   lower = intel_uncore_read_fw(uncore, lower_reg);
+   cpu_ts[1] = local_clock() - cpu_ts[1];
+   old_upper = upper;
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   } while (upper != old_upper && loop++ < 2);
+
+   *cs_ts = (u64)upper << 32 | lower;
+
+   return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+ u64 *cs_ts, u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   struct intel_uncore *uncore = engine->uncore;
+   enum forcewake_domains fw_domains;
+   u32 base = engine->mmio_base;
+   intel_wakeref_t wakeref;
+   int ret;
+
+   fw_domains = intel_uncore_forcewake_for_reg(uncore,
+   RING_TIMESTAMP(base),
+   FW_REG_READ);
+
+   with_intel_runtime_pm(uncore->rpm, wakeref) {
+   spin_lock_irq(>lock);
+   intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+   ret = __read_timestamps(uncore,
+   RING_TIMESTAMP(base),
+   RING_TIMESTAMP_UDW(base),
+   cs_ts,
+   cpu_ts,
+   cpu_clock);
+
+   intel_uncore_forcewake_put__locked(uncore, fw_domains);
+   spin_unlock_irq(>lock);
+   }
+
+   return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+   struct drm_i915_query_item 

[Intel-gfx] [PATCH 1/1] i915/query: Correlate engine and cpu timestamps with better accuracy

2021-04-21 Thread Umesh Nerlige Ramappa
Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
  register and return it to user.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_query.c | 145 ++
 include/uapi/drm/i915_drm.h   |  48 ++
 2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..25b96927ab92 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 
 #include 
 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private 
*dev_priv,
return total_length;
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+   /*
+* Use logic same as the perf subsystem to allow user to select the
+* reference clock id to be used for timestamps.
+*/
+   switch (clk_id) {
+   case CLOCK_MONOTONIC:
+   return _get_ns;
+   case CLOCK_MONOTONIC_RAW:
+   return _get_raw_ns;
+   case CLOCK_REALTIME:
+   return _get_real_ns;
+   case CLOCK_BOOTTIME:
+   return _get_boottime_ns;
+   case CLOCK_TAI:
+   return _get_clocktai_ns;
+   default:
+   return NULL;
+   }
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+ i915_reg_t lower_reg,
+ i915_reg_t upper_reg,
+ u64 *cs_ts,
+ u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   u32 upper, lower, old_upper, loop = 0;
+
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   do {
+   cpu_ts[1] = local_clock();
+   cpu_ts[0] = cpu_clock();
+   lower = intel_uncore_read_fw(uncore, lower_reg);
+   cpu_ts[1] = local_clock() - cpu_ts[1];
+   old_upper = upper;
+   upper = intel_uncore_read_fw(uncore, upper_reg);
+   } while (upper != old_upper && loop++ < 2);
+
+   *cs_ts = (u64)upper << 32 | lower;
+
+   return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+ u64 *cs_ts, u64 *cpu_ts,
+ __ktime_func_t cpu_clock)
+{
+   struct intel_uncore *uncore = engine->uncore;
+   enum forcewake_domains fw_domains;
+   u32 base = engine->mmio_base;
+   intel_wakeref_t wakeref;
+   int ret;
+
+   fw_domains = intel_uncore_forcewake_for_reg(uncore,
+   RING_TIMESTAMP(base),
+   FW_REG_READ);
+
+   with_intel_runtime_pm(uncore->rpm, wakeref) {
+   spin_lock_irq(>lock);
+   intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+   ret = __read_timestamps(uncore,
+   RING_TIMESTAMP(base),
+   RING_TIMESTAMP_UDW(base),
+   cs_ts,
+   cpu_ts,
+   cpu_clock);
+
+   intel_uncore_forcewake_put__locked(uncore, fw_domains);
+   spin_unlock_irq(>lock);
+   }
+
+   return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+   struct drm_i915_query_item *query_item)
+{
+   struct drm_i915_query_cs_cycles __user *query_ptr;
+