[PATCH] i915/perf: Remove code to update PWR_CLK_STATE for gen12

2024-06-28 Thread Umesh Nerlige Ramappa
PWR_CLK_STATE only needs to be modified up until gen11. For gen12 this
code is not applicable. Remove code to update context image with
PWR_CLK_STATE for gen12.

Fixes: 00a7f0d7155c ("drm/i915/tgl: Add perf support on TGL")
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 33 
 1 file changed, 33 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 0b1cd4c7a525..025a79fe5920 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -2748,26 +2748,6 @@ oa_configure_all_contexts(struct i915_perf_stream 
*stream,
return 0;
 }
 
-static int
-gen12_configure_all_contexts(struct i915_perf_stream *stream,
-const struct i915_oa_config *oa_config,
-struct i915_active *active)
-{
-   struct flex regs[] = {
-   {
-   GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE),
-   CTX_R_PWR_CLK_STATE,
-   },
-   };
-
-   if (stream->engine->class != RENDER_CLASS)
-   return 0;
-
-   return oa_configure_all_contexts(stream,
-regs, ARRAY_SIZE(regs),
-active);
-}
-
 static int
 lrc_configure_all_contexts(struct i915_perf_stream *stream,
   const struct i915_oa_config *oa_config,
@@ -2874,7 +2854,6 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
 {
struct drm_i915_private *i915 = stream->perf->i915;
struct intel_uncore *uncore = stream->uncore;
-   struct i915_oa_config *oa_config = stream->oa_config;
bool periodic = stream->periodic;
u32 period_exponent = stream->period_exponent;
u32 sqcnt1;
@@ -2918,15 +2897,6 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
 
intel_uncore_rmw(uncore, GEN12_SQCNT1, 0, sqcnt1);
 
-   /*
-* Update all contexts prior writing the mux configurations as we need
-* to make sure all slices/subslices are ON before writing to NOA
-* registers.
-*/
-   ret = gen12_configure_all_contexts(stream, oa_config, active);
-   if (ret)
-   return ret;
-
/*
 * For Gen12, performance counters are context
 * saved/restored. Only enable it for the context that
@@ -2980,9 +2950,6 @@ static void gen12_disable_metric_set(struct 
i915_perf_stream *stream)
   
_MASKED_BIT_DISABLE(GEN12_DISABLE_DOP_GATING));
}
 
-   /* Reset all contexts' slices/subslices configurations. */
-   gen12_configure_all_contexts(stream, NULL, NULL);
-
/* disable the context save/restore or OAR counters */
if (stream->ctx)
gen12_configure_oar_context(stream, NULL);
-- 
2.38.1



Re: [PATCH] drm/i915/gt: Fix CCS id's calculation for CCS mode setting

2024-05-17 Thread Umesh Nerlige Ramappa

On Fri, May 17, 2024 at 11:06:16AM +0200, Andi Shyti wrote:

The whole point of the previous fixes has been to change the CCS
hardware configuration to generate only one stream available to
the compute users. We did this by changing the info.engine_mask
that is set during device probe, reset during the detection of
the fused engines, and finally reset again when choosing the CCS
mode.

We can't use the engine_mask variable anymore, as with the
current configuration, it imposes only one CCS no matter what the
hardware configuration is.

Before changing the engine_mask for the third time, save it and
use it for calculating the CCS mode.

After the previous changes, the user reported a performance drop
to around 1/4. We have tested that the compute operations, with
the current patch, have improved by the same factor.

Fixes: 6db31251bb26 ("drm/i915/gt: Enable only one CCS for compute workload")
Cc: Chris Wilson 
Cc: Gnattu OC 
Cc: Joonas Lahtinen 
Cc: Matt Roper 
Tested-by: Jian Ye 
---
Hi,

This ensures that all four CCS engines work properly. However,
during the tests, Jian detected that the performance during
memory copy assigned to the CCS engines is negatively impacted.

I believe this might be expected, considering that based on the
engines' availability, the media user might decide to reduce the
copy in multitasking.

With the upcoming work that will give the user the chance to
configure the CCS mode, this might improve.

Gnattu, can I use your kindness to ask for a test on this patch
and check whether the performance improves on your side as well?

Thanks,
Andi

drivers/gpu/drm/i915/gt/intel_engine_cs.c   | 6 ++
drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c | 2 +-
drivers/gpu/drm/i915/gt/intel_gt_types.h| 8 
3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 5c8e9ee3b008..3b740ca25000 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -885,6 +885,12 @@ static intel_engine_mask_t init_engine_mask(struct 
intel_gt *gt)
if (IS_DG2(gt->i915)) {
u8 first_ccs = __ffs(CCS_MASK(gt));

+   /*
+* Store the number of active cslices before
+* changing the CCS engine configuration
+*/
+   gt->ccs.cslices = CCS_MASK(gt);
+
/* Mask off all the CCS engine */
info->engine_mask &= ~GENMASK(CCS3, CCS0);
/* Put back in the first CCS engine */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c 
b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c
index 99b71bb7da0a..3c62a44e9106 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c
@@ -19,7 +19,7 @@ unsigned int intel_gt_apply_ccs_mode(struct intel_gt *gt)

/* Build the value for the fixed CCS load balancing */
for (cslice = 0; cslice < I915_MAX_CCS; cslice++) {
-   if (CCS_MASK(gt) & BIT(cslice))
+   if (gt->ccs.cslices & BIT(cslice))
/*
 * If available, assign the cslice
 * to the first available engine...
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h 
b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index def7dd0eb6f1..cfdd2ad5e954 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -207,6 +207,14 @@ struct intel_gt {
[MAX_ENGINE_INSTANCE + 1];
enum intel_submission_method submission_method;

+   struct {
+   /*
+* Mask of the non fused CCS slices
+* to be used for the load balancing
+*/
+   intel_engine_mask_t cslices;
+   } ccs;
+


LGTM,

Reviewed-by: Umesh Nerlige Ramappa 


/*
 * Default address space (either GGTT or ppGTT depending on arch).
 *
--
2.43.0



Re: [PATCH 2/2] i915/pmu: Cleanup pending events on unbind

2024-02-14 Thread Umesh Nerlige Ramappa

On Wed, Feb 14, 2024 at 08:21:21AM +0000, Tvrtko Ursulin wrote:


On 13/02/2024 18:03, Umesh Nerlige Ramappa wrote:

Once a user opens an fd for a perf event, if the driver undergoes a
function level reset (FLR), the resources are not cleaned up as
expected. For this discussion FLR is defined as a PCI unbind followed by
a bind. perf_pmu_unregister() would cleanup everything, but when the user
closes the perf fd, perf_release is executed and we encounter null
pointer dereferences and/or list corruption in that path which require a
reboot to recover.

The only approach that worked to resolve this was to close the file
associated with the event such that the relevant cleanup happens w.r.t.
the open file. To do so, use the event->owner task and find the file
relevant to the event and close it. This relies on the
file->private_data matching the event object.

Note:
- Closing the event file is a delayed work that gets queued to system_wq.
The close is seen to happen when kernel returns to user space following
the unbind.

- perf framework will access the pmu object after the last event has
been destroyed. The drm device is refcounted in the init and destroy
hooks, so this causes a use after free if we are releasing the drm
device reference after unbind has been called. To work around this, we
take an extra reference in the unbind path and release it using a
delayed work in the destroy path. The delayed work is queued to
system_wq.

Ref: 
https://lore.kernel.org/lkml/20240115170120.662220-1-tvrtko.ursu...@linux.intel.com/T/#me72abfa2771e6fc94b167ce47efdbf391cc313ab

Opens:
- Synchronization may be needed between i915_pmu_unregister and
i915_pmu_event_destroy to avoid any races.

- If unbind and bind happen from the same process the event fd is closed
after bind completes. This means that the cleanup would not happen
until bind completes. In this case, i915 loads fine, but pmu
registration fails with an error that the sysfs entries are already
present. There is no feasible solution here. Since this is not a fatal
error (reloading i915 works fine) and the usual case is to have bind and
unbind in separate processes, there is no intention to solve this.

Other solutions/aspects tried:
- Call perf_event_disable() followed by perf_event_release_kernel() in
the unbind path to clean up the events. This still causes issues when
user closes the fd since perf_event_release_kernel() is called again and
fails requiring reboot.

- Close all event fds in unbind and wait for the close to complete by
checking if list is empty. This wait does not work since the files
are actually closed when unbind returns to user space.

Testing:
- New IGT tests have been added for this and are run with KASAN and
  kmemleak enabled.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 96 -
 drivers/gpu/drm/i915/i915_pmu.h | 15 ++
 2 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 4d2a289f848a..2f365c7f5db7 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -4,6 +4,8 @@
  * Copyright © 2017-2018 Intel Corporation
  */
+#include 
+#include 
 #include 
 #include "gt/intel_engine.h"
@@ -573,9 +575,21 @@ static void i915_pmu_event_destroy(struct perf_event 
*event)
 {
struct i915_pmu *pmu = event_to_pmu(event);
struct drm_i915_private *i915 = pmu_to_i915(pmu);
+   struct i915_event *e = event->pmu_private;
drm_WARN_ON(&i915->drm, event->parent);
+   if (e) {
+   event->pmu_private = NULL;
+   list_del(&e->link);
+   kfree(e);
+   }
+
+   if (i915->pmu.closed && list_empty(&i915->pmu.initialized_events)) {
+   pmu_teardown(&i915->pmu);
+   mod_delayed_work(system_wq, &i915->pmu.work, 50);
+   }
+
drm_dev_put(&i915->drm);
 }
@@ -684,6 +698,14 @@ static int i915_pmu_event_init(struct perf_event *event)
return ret;
if (!event->parent) {
+   struct i915_event *e = kzalloc(sizeof(*e), GFP_KERNEL);
+
+   if (!e)
+   return -ENOMEM;
+
+   e->event = event;
+   list_add(&e->link, &pmu->initialized_events);
+   event->pmu_private = e;
drm_dev_get(&i915->drm);
event->destroy = i915_pmu_event_destroy;
}
@@ -1256,6 +1278,14 @@ void i915_pmu_exit(void)
cpuhp_remove_multi_state(cpuhp_slot);
 }
+static void i915_pmu_release(struct work_struct *work)
+{
+   struct i915_pmu *pmu = container_of(work, typeof(*pmu), work.work);
+   struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
+
+   drm_dev_put(&i915->drm);
+}
+
 void i915_pmu_register(struct drm_i915_private *i915)
 {

Re: [PATCH 2/2] i915/pmu: Cleanup pending events on unbind

2024-02-13 Thread Umesh Nerlige Ramappa

On Tue, Feb 13, 2024 at 08:36:43PM +0200, Jani Nikula wrote:

On Tue, 13 Feb 2024, Umesh Nerlige Ramappa  
wrote:

Once a user opens an fd for a perf event, if the driver undergoes a
function level reset (FLR), the resources are not cleaned up as
expected. For this discussion FLR is defined as a PCI unbind followed by
a bind. perf_pmu_unregister() would cleanup everything, but when the user
closes the perf fd, perf_release is executed and we encounter null
pointer dereferences and/or list corruption in that path which require a
reboot to recover.

The only approach that worked to resolve this was to close the file
associated with the event such that the relevant cleanup happens w.r.t.
the open file. To do so, use the event->owner task and find the file
relevant to the event and close it. This relies on the
file->private_data matching the event object.

Note:
- Closing the event file is a delayed work that gets queued to system_wq.
The close is seen to happen when kernel returns to user space following
the unbind.

- perf framework will access the pmu object after the last event has
been destroyed. The drm device is refcounted in the init and destroy
hooks, so this causes a use after free if we are releasing the drm
device reference after unbind has been called. To work around this, we
take an extra reference in the unbind path and release it using a
delayed work in the destroy path. The delayed work is queued to
system_wq.

Ref: 
https://lore.kernel.org/lkml/20240115170120.662220-1-tvrtko.ursu...@linux.intel.com/T/#me72abfa2771e6fc94b167ce47efdbf391cc313ab

Opens:
- Synchronization may be needed between i915_pmu_unregister and
i915_pmu_event_destroy to avoid any races.

- If unbind and bind happen from the same process the event fd is closed
after bind completes. This means that the cleanup would not happen
until bind completes. In this case, i915 loads fine, but pmu
registration fails with an error that the sysfs entries are already
present. There is no feasible solution here. Since this is not a fatal
error (reloading i915 works fine) and the usual case is to have bind and
unbind in separate processes, there is no intention to solve this.

Other solutions/aspects tried:
- Call perf_event_disable() followed by perf_event_release_kernel() in
the unbind path to clean up the events. This still causes issues when
user closes the fd since perf_event_release_kernel() is called again and
fails requiring reboot.

- Close all event fds in unbind and wait for the close to complete by
checking if list is empty. This wait does not work since the files
are actually closed when unbind returns to user space.

Testing:
- New IGT tests have been added for this and are run with KASAN and
  kmemleak enabled.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 96 -
 drivers/gpu/drm/i915/i915_pmu.h | 15 ++
 2 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 4d2a289f848a..2f365c7f5db7 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -4,6 +4,8 @@
  * Copyright © 2017-2018 Intel Corporation
  */

+#include 
+#include 
 #include 

 #include "gt/intel_engine.h"
@@ -573,9 +575,21 @@ static void i915_pmu_event_destroy(struct perf_event 
*event)
 {
struct i915_pmu *pmu = event_to_pmu(event);
struct drm_i915_private *i915 = pmu_to_i915(pmu);
+   struct i915_event *e = event->pmu_private;

drm_WARN_ON(&i915->drm, event->parent);

+   if (e) {
+   event->pmu_private = NULL;
+   list_del(&e->link);
+   kfree(e);
+   }
+
+   if (i915->pmu.closed && list_empty(&i915->pmu.initialized_events)) {
+   pmu_teardown(&i915->pmu);
+   mod_delayed_work(system_wq, &i915->pmu.work, 50);
+   }
+
drm_dev_put(&i915->drm);
 }

@@ -684,6 +698,14 @@ static int i915_pmu_event_init(struct perf_event *event)
return ret;

if (!event->parent) {
+   struct i915_event *e = kzalloc(sizeof(*e), GFP_KERNEL);
+
+   if (!e)
+   return -ENOMEM;
+
+   e->event = event;
+   list_add(&e->link, &pmu->initialized_events);
+   event->pmu_private = e;
drm_dev_get(&i915->drm);
event->destroy = i915_pmu_event_destroy;
}
@@ -1256,6 +1278,14 @@ void i915_pmu_exit(void)
cpuhp_remove_multi_state(cpuhp_slot);
 }

+static void i915_pmu_release(struct work_struct *work)
+{
+   struct i915_pmu *pmu = container_of(work, typeof(*pmu), work.work);
+   struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
+
+   drm_dev_put(&i915->drm);
+}
+
 void i915_pmu_register(struct drm_i915_private *i9

Re: [PATCH 0/2] Fix crash due to open pmu events during unbind

2024-02-13 Thread Umesh Nerlige Ramappa

Resending to include patch 2/2. Please ignore this series.

On Mon, Feb 12, 2024 at 10:46:48PM -0800, Umesh Nerlige Ramappa wrote:

Once a user opens an fd for a perf event, if the driver undergoes a
function level reset (FLR), the resources are not cleaned up as
expected. For this discussion FLR is defined as a PCI unbind followed by
a bind. perf_pmu_unregister() would cleanup everything, but when the
user closes the perf fd much later, perf_release() is called and we
encounter null pointer dereferences and/or list corruption in that path
which require a reboot to recover.

The only approach that worked to resolve this was to close the file
associated with the event such that the relevant cleanup happens w.r.t.
the open file. To do so, use the event->owner task and find the file
relevant to the event and close it. This relies on the
file->private_data matching the event object.

Test-with: 20240213062948.32735-1-umesh.nerlige.rama...@intel.com
Signed-off-by: Umesh Nerlige Ramappa 

Umesh Nerlige Ramappa (2):
 i915/pmu: Add pmu_teardown helper
 INTEL_DII: i915/pmu: Cleanup pending events on unbind

drivers/gpu/drm/i915/i915_pmu.c | 192 
drivers/gpu/drm/i915/i915_pmu.h |  15 +++
2 files changed, 161 insertions(+), 46 deletions(-)

--
2.34.1



[PATCH 0/2] Fix crash due to open pmu events during unbind

2024-02-13 Thread Umesh Nerlige Ramappa
Once a user opens an fd for a perf event, if the driver undergoes a
function level reset (FLR), the resources are not cleaned up as
expected. For this discussion FLR is defined as a PCI unbind followed by
a bind. perf_pmu_unregister() would cleanup everything, but when the
user closes the perf fd much later, perf_release() is called and we
encounter null pointer dereferences and/or list corruption in that path
which require a reboot to recover.

The only approach that worked to resolve this was to close the file
associated with the event such that the relevant cleanup happens w.r.t.
the open file. To do so, use the event->owner task and find the file
relevant to the event and close it. This relies on the
file->private_data matching the event object.

Test-with: 20240213062948.32735-1-umesh.nerlige.rama...@intel.com
Signed-off-by: Umesh Nerlige Ramappa 

Umesh Nerlige Ramappa (2):
  i915/pmu: Add pmu_teardown helper
  i915/pmu: Cleanup pending events on unbind

 drivers/gpu/drm/i915/i915_pmu.c | 192 
 drivers/gpu/drm/i915/i915_pmu.h |  15 +++
 2 files changed, 161 insertions(+), 46 deletions(-)

-- 
2.34.1



[PATCH 1/2] i915/pmu: Add pmu_teardown helper

2024-02-13 Thread Umesh Nerlige Ramappa
Move pmu teardown to a helper and place it above the destroy hook so
that teardown can also happen inside destroy when events are closed
after i915 pmu is unregistered.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 106 +---
 1 file changed, 56 insertions(+), 50 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 21eb0c5b320d..4d2a289f848a 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -514,6 +514,61 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
return HRTIMER_RESTART;
 }
 
+static enum cpuhp_state cpuhp_slot = CPUHP_INVALID;
+
+static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu)
+{
+   if (cpuhp_slot == CPUHP_INVALID)
+   return -EINVAL;
+
+   return cpuhp_state_add_instance(cpuhp_slot, &pmu->cpuhp.node);
+}
+
+static void i915_pmu_unregister_cpuhp_state(struct i915_pmu *pmu)
+{
+   cpuhp_state_remove_instance(cpuhp_slot, &pmu->cpuhp.node);
+}
+
+static void free_event_attributes(struct i915_pmu *pmu)
+{
+   struct attribute **attr_iter = pmu->events_attr_group.attrs;
+
+   for (; *attr_iter; attr_iter++)
+   kfree((*attr_iter)->name);
+
+   kfree(pmu->events_attr_group.attrs);
+   kfree(pmu->i915_attr);
+   kfree(pmu->pmu_attr);
+
+   pmu->events_attr_group.attrs = NULL;
+   pmu->i915_attr = NULL;
+   pmu->pmu_attr = NULL;
+}
+
+static bool is_igp(struct drm_i915_private *i915)
+{
+   struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
+
+   /* IGP is :00:02.0 */
+   return pci_domain_nr(pdev->bus) == 0 &&
+  pdev->bus->number == 0 &&
+  PCI_SLOT(pdev->devfn) == 2 &&
+  PCI_FUNC(pdev->devfn) == 0;
+}
+
+static void pmu_teardown(struct i915_pmu *pmu)
+{
+   struct drm_i915_private *i915 = pmu_to_i915(pmu);
+
+   i915_pmu_unregister_cpuhp_state(pmu);
+   perf_pmu_unregister(&pmu->base);
+   pmu->base.event_init = NULL;
+   kfree(pmu->base.attr_groups);
+   if (!is_igp(i915))
+   kfree(pmu->name);
+   free_event_attributes(pmu);
+}
+
 static void i915_pmu_event_destroy(struct perf_event *event)
 {
struct i915_pmu *pmu = event_to_pmu(event);
@@ -1133,22 +1188,6 @@ err:;
return NULL;
 }
 
-static void free_event_attributes(struct i915_pmu *pmu)
-{
-   struct attribute **attr_iter = pmu->events_attr_group.attrs;
-
-   for (; *attr_iter; attr_iter++)
-   kfree((*attr_iter)->name);
-
-   kfree(pmu->events_attr_group.attrs);
-   kfree(pmu->i915_attr);
-   kfree(pmu->pmu_attr);
-
-   pmu->events_attr_group.attrs = NULL;
-   pmu->i915_attr = NULL;
-   pmu->pmu_attr = NULL;
-}
-
 static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
 {
struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
@@ -1194,8 +1233,6 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct 
hlist_node *node)
return 0;
 }
 
-static enum cpuhp_state cpuhp_slot = CPUHP_INVALID;
-
 int i915_pmu_init(void)
 {
int ret;
@@ -1219,30 +1256,6 @@ void i915_pmu_exit(void)
cpuhp_remove_multi_state(cpuhp_slot);
 }
 
-static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu)
-{
-   if (cpuhp_slot == CPUHP_INVALID)
-   return -EINVAL;
-
-   return cpuhp_state_add_instance(cpuhp_slot, &pmu->cpuhp.node);
-}
-
-static void i915_pmu_unregister_cpuhp_state(struct i915_pmu *pmu)
-{
-   cpuhp_state_remove_instance(cpuhp_slot, &pmu->cpuhp.node);
-}
-
-static bool is_igp(struct drm_i915_private *i915)
-{
-   struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
-
-   /* IGP is :00:02.0 */
-   return pci_domain_nr(pdev->bus) == 0 &&
-  pdev->bus->number == 0 &&
-  PCI_SLOT(pdev->devfn) == 2 &&
-  PCI_FUNC(pdev->devfn) == 0;
-}
-
 void i915_pmu_register(struct drm_i915_private *i915)
 {
struct i915_pmu *pmu = &i915->pmu;
@@ -1341,12 +1354,5 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
 
hrtimer_cancel(&pmu->timer);
 
-   i915_pmu_unregister_cpuhp_state(pmu);
-
-   perf_pmu_unregister(&pmu->base);
-   pmu->base.event_init = NULL;
-   kfree(pmu->base.attr_groups);
-   if (!is_igp(i915))
-   kfree(pmu->name);
-   free_event_attributes(pmu);
+   pmu_teardown(pmu);
 }
-- 
2.34.1



[PATCH 2/2] i915/pmu: Cleanup pending events on unbind

2024-02-13 Thread Umesh Nerlige Ramappa
Once a user opens an fd for a perf event, if the driver undergoes a
function level reset (FLR), the resources are not cleaned up as
expected. For this discussion FLR is defined as a PCI unbind followed by
a bind. perf_pmu_unregister() would cleanup everything, but when the user
closes the perf fd, perf_release is executed and we encounter null
pointer dereferences and/or list corruption in that path which require a
reboot to recover.

The only approach that worked to resolve this was to close the file
associated with the event such that the relevant cleanup happens w.r.t.
the open file. To do so, use the event->owner task and find the file
relevant to the event and close it. This relies on the
file->private_data matching the event object.

Note:
- Closing the event file is a delayed work that gets queued to system_wq.
The close is seen to happen when kernel returns to user space following
the unbind.

- perf framework will access the pmu object after the last event has
been destroyed. The drm device is refcounted in the init and destroy
hooks, so this causes a use after free if we are releasing the drm
device reference after unbind has been called. To work around this, we
take an extra reference in the unbind path and release it using a
delayed work in the destroy path. The delayed work is queued to
system_wq.

Ref: 
https://lore.kernel.org/lkml/20240115170120.662220-1-tvrtko.ursu...@linux.intel.com/T/#me72abfa2771e6fc94b167ce47efdbf391cc313ab

Opens:
- Synchronization may be needed between i915_pmu_unregister and
i915_pmu_event_destroy to avoid any races.

- If unbind and bind happen from the same process the event fd is closed
after bind completes. This means that the cleanup would not happen
until bind completes. In this case, i915 loads fine, but pmu
registration fails with an error that the sysfs entries are already
present. There is no feasible solution here. Since this is not a fatal
error (reloading i915 works fine) and the usual case is to have bind and
unbind in separate processes, there is no intention to solve this.

Other solutions/aspects tried:
- Call perf_event_disable() followed by perf_event_release_kernel() in
the unbind path to clean up the events. This still causes issues when
user closes the fd since perf_event_release_kernel() is called again and
fails requiring reboot.

- Close all event fds in unbind and wait for the close to complete by
checking if list is empty. This wait does not work since the files
are actually closed when unbind returns to user space.

Testing:
- New IGT tests have been added for this and are run with KASAN and
  kmemleak enabled.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 96 -
 drivers/gpu/drm/i915/i915_pmu.h | 15 ++
 2 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 4d2a289f848a..2f365c7f5db7 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -4,6 +4,8 @@
  * Copyright © 2017-2018 Intel Corporation
  */
 
+#include 
+#include 
 #include 
 
 #include "gt/intel_engine.h"
@@ -573,9 +575,21 @@ static void i915_pmu_event_destroy(struct perf_event 
*event)
 {
struct i915_pmu *pmu = event_to_pmu(event);
struct drm_i915_private *i915 = pmu_to_i915(pmu);
+   struct i915_event *e = event->pmu_private;
 
drm_WARN_ON(&i915->drm, event->parent);
 
+   if (e) {
+   event->pmu_private = NULL;
+   list_del(&e->link);
+   kfree(e);
+   }
+
+   if (i915->pmu.closed && list_empty(&i915->pmu.initialized_events)) {
+   pmu_teardown(&i915->pmu);
+   mod_delayed_work(system_wq, &i915->pmu.work, 50);
+   }
+
drm_dev_put(&i915->drm);
 }
 
@@ -684,6 +698,14 @@ static int i915_pmu_event_init(struct perf_event *event)
return ret;
 
if (!event->parent) {
+   struct i915_event *e = kzalloc(sizeof(*e), GFP_KERNEL);
+
+   if (!e)
+   return -ENOMEM;
+
+   e->event = event;
+   list_add(&e->link, &pmu->initialized_events);
+   event->pmu_private = e;
drm_dev_get(&i915->drm);
event->destroy = i915_pmu_event_destroy;
}
@@ -1256,6 +1278,14 @@ void i915_pmu_exit(void)
cpuhp_remove_multi_state(cpuhp_slot);
 }
 
+static void i915_pmu_release(struct work_struct *work)
+{
+   struct i915_pmu *pmu = container_of(work, typeof(*pmu), work.work);
+   struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
+
+   drm_dev_put(&i915->drm);
+}
+
 void i915_pmu_register(struct drm_i915_private *i915)
 {
struct i915_pmu *pmu = &i915->pmu;
@@ -1313,6 +1343,9 @@ void i915_p

[PATCH 1/2] i915/pmu: Add pmu_teardown helper

2024-02-12 Thread Umesh Nerlige Ramappa
Move pmu teardown to a helper and place it above the destroy hook so
that teardown can also happen inside destroy when events are closed
after i915 pmu is unregistered.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 106 +---
 1 file changed, 56 insertions(+), 50 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 21eb0c5b320d..4d2a289f848a 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -514,6 +514,61 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
return HRTIMER_RESTART;
 }
 
+static enum cpuhp_state cpuhp_slot = CPUHP_INVALID;
+
+static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu)
+{
+   if (cpuhp_slot == CPUHP_INVALID)
+   return -EINVAL;
+
+   return cpuhp_state_add_instance(cpuhp_slot, &pmu->cpuhp.node);
+}
+
+static void i915_pmu_unregister_cpuhp_state(struct i915_pmu *pmu)
+{
+   cpuhp_state_remove_instance(cpuhp_slot, &pmu->cpuhp.node);
+}
+
+static void free_event_attributes(struct i915_pmu *pmu)
+{
+   struct attribute **attr_iter = pmu->events_attr_group.attrs;
+
+   for (; *attr_iter; attr_iter++)
+   kfree((*attr_iter)->name);
+
+   kfree(pmu->events_attr_group.attrs);
+   kfree(pmu->i915_attr);
+   kfree(pmu->pmu_attr);
+
+   pmu->events_attr_group.attrs = NULL;
+   pmu->i915_attr = NULL;
+   pmu->pmu_attr = NULL;
+}
+
+static bool is_igp(struct drm_i915_private *i915)
+{
+   struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
+
+   /* IGP is :00:02.0 */
+   return pci_domain_nr(pdev->bus) == 0 &&
+  pdev->bus->number == 0 &&
+  PCI_SLOT(pdev->devfn) == 2 &&
+  PCI_FUNC(pdev->devfn) == 0;
+}
+
+static void pmu_teardown(struct i915_pmu *pmu)
+{
+   struct drm_i915_private *i915 = pmu_to_i915(pmu);
+
+   i915_pmu_unregister_cpuhp_state(pmu);
+   perf_pmu_unregister(&pmu->base);
+   pmu->base.event_init = NULL;
+   kfree(pmu->base.attr_groups);
+   if (!is_igp(i915))
+   kfree(pmu->name);
+   free_event_attributes(pmu);
+}
+
 static void i915_pmu_event_destroy(struct perf_event *event)
 {
struct i915_pmu *pmu = event_to_pmu(event);
@@ -1133,22 +1188,6 @@ err:;
return NULL;
 }
 
-static void free_event_attributes(struct i915_pmu *pmu)
-{
-   struct attribute **attr_iter = pmu->events_attr_group.attrs;
-
-   for (; *attr_iter; attr_iter++)
-   kfree((*attr_iter)->name);
-
-   kfree(pmu->events_attr_group.attrs);
-   kfree(pmu->i915_attr);
-   kfree(pmu->pmu_attr);
-
-   pmu->events_attr_group.attrs = NULL;
-   pmu->i915_attr = NULL;
-   pmu->pmu_attr = NULL;
-}
-
 static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
 {
struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
@@ -1194,8 +1233,6 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct 
hlist_node *node)
return 0;
 }
 
-static enum cpuhp_state cpuhp_slot = CPUHP_INVALID;
-
 int i915_pmu_init(void)
 {
int ret;
@@ -1219,30 +1256,6 @@ void i915_pmu_exit(void)
cpuhp_remove_multi_state(cpuhp_slot);
 }
 
-static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu)
-{
-   if (cpuhp_slot == CPUHP_INVALID)
-   return -EINVAL;
-
-   return cpuhp_state_add_instance(cpuhp_slot, &pmu->cpuhp.node);
-}
-
-static void i915_pmu_unregister_cpuhp_state(struct i915_pmu *pmu)
-{
-   cpuhp_state_remove_instance(cpuhp_slot, &pmu->cpuhp.node);
-}
-
-static bool is_igp(struct drm_i915_private *i915)
-{
-   struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
-
-   /* IGP is :00:02.0 */
-   return pci_domain_nr(pdev->bus) == 0 &&
-  pdev->bus->number == 0 &&
-  PCI_SLOT(pdev->devfn) == 2 &&
-  PCI_FUNC(pdev->devfn) == 0;
-}
-
 void i915_pmu_register(struct drm_i915_private *i915)
 {
struct i915_pmu *pmu = &i915->pmu;
@@ -1341,12 +1354,5 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
 
hrtimer_cancel(&pmu->timer);
 
-   i915_pmu_unregister_cpuhp_state(pmu);
-
-   perf_pmu_unregister(&pmu->base);
-   pmu->base.event_init = NULL;
-   kfree(pmu->base.attr_groups);
-   if (!is_igp(i915))
-   kfree(pmu->name);
-   free_event_attributes(pmu);
+   pmu_teardown(pmu);
 }
-- 
2.34.1



[PATCH 0/2] Fix crash due to open pmu events during unbind

2024-02-12 Thread Umesh Nerlige Ramappa
Once a user opens an fd for a perf event, if the driver undergoes a
function level reset (FLR), the resources are not cleaned up as
expected. For this discussion FLR is defined as a PCI unbind followed by
a bind. perf_pmu_unregister() would cleanup everything, but when the
user closes the perf fd much later, perf_release() is called and we
encounter null pointer dereferences and/or list corruption in that path
which require a reboot to recover.

The only approach that worked to resolve this was to close the file
associated with the event such that the relevant cleanup happens w.r.t.
the open file. To do so, use the event->owner task and find the file
relevant to the event and close it. This relies on the
file->private_data matching the event object.

Test-with: 20240213062948.32735-1-umesh.nerlige.rama...@intel.com
Signed-off-by: Umesh Nerlige Ramappa 

Umesh Nerlige Ramappa (2):
  i915/pmu: Add pmu_teardown helper
  INTEL_DII: i915/pmu: Cleanup pending events on unbind

 drivers/gpu/drm/i915/i915_pmu.c | 192 
 drivers/gpu/drm/i915/i915_pmu.h |  15 +++
 2 files changed, 161 insertions(+), 46 deletions(-)

-- 
2.34.1



Re: [PATCH] drm/i915/perf: Update handling of MMIO triggered reports

2023-12-22 Thread Umesh Nerlige Ramappa

On Mon, Dec 18, 2023 at 04:05:43PM -0800, Umesh Nerlige Ramappa wrote:

On XEHP platforms user is not able to find MMIO triggered reports in the
OA buffer since i915 squashes the context ID fields. These context ID
fields hold the MMIO trigger markers.

Update logic to not squash the context ID fields of MMIO triggered
reports.

Fixes: cba94bbcff08 ("drm/i915/perf: Determine context valid in OA reports")
Signed-off-by: Umesh Nerlige Ramappa 
---



Fi.CI.IGT failures.

Possible new issues
Here are the unknown changes that may have been introduced in 
Patchwork_127946v2_full:

IGT changes
Possible regressions
igt@gem_exec_suspend@basic-s3-devices@smem:
shard-mtlp: NOTRUN -> ABORT

igt@i915_selftest@live@gt_pm:
shard-rkl: PASS -> DMESG-FAIL


The above are unrelated and do not exercise the code path that has the fix.

Thanks,
Umesh



Known issues


Re: ✗ Fi.CI.BAT: failure for drm/i915/perf: Update handling of MMIO triggered reports

2023-12-20 Thread Umesh Nerlige Ramappa

On Tue, Dec 19, 2023 at 04:57:10AM +0000, Patchwork wrote:

  Patch Details

Series:  drm/i915/perf: Update handling of MMIO triggered reports
URL: [1]https://patchwork.freedesktop.org/series/127946/
State:   failure
Details: 
[2]https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_127946v1/index.html

 CI Bug Log - changes from CI_DRM_14041 -> Patchwork_127946v1

Summary

  FAILURE

  Serious unknown changes coming with Patchwork_127946v1 absolutely need to
  be
  verified manually.

  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_127946v1, please notify your bug team
  (i915-ci-in...@lists.freedesktop.org) to allow them
  to document this new failure mode, which will reduce false positives in
  CI.

  External URL:
  https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_127946v1/index.html

Participating hosts (36 -> 34)

  Additional (1): bat-mtlp-8
  Missing (3): fi-bsw-nick fi-snb-2520m fi-pnv-d510

Possible new issues

  Here are the unknown changes that may have been introduced in
  Patchwork_127946v1:

 IGT changes

   Possible regressions

* igt@i915_selftest@live@hugepages:

 * bat-jsl-3: [3]PASS -> [4]INCOMPLETE


This is unrelated since the OA specific change is not exercised in BAT 
tests.


Umesh



Known issues



Re: [PATCH] drm/i915/perf: Update handling of MMIO triggered reports

2023-12-18 Thread Umesh Nerlige Ramappa

On Mon, Dec 18, 2023 at 09:48:39PM -0800, Dixit, Ashutosh wrote:

On Mon, 18 Dec 2023 21:28:33 -0800, Dixit, Ashutosh wrote:


On Mon, 18 Dec 2023 16:05:43 -0800, Umesh Nerlige Ramappa wrote:
>

Hi Umesh,

> On XEHP platforms user is not able to find MMIO triggered reports in the
> OA buffer since i915 squashes the context ID fields. These context ID
> fields hold the MMIO trigger markers.
>
> Update logic to not squash the context ID fields of MMIO triggered
> reports.
>
> Fixes: cba94bbcff08 ("drm/i915/perf: Determine context valid in OA reports")
> Signed-off-by: Umesh Nerlige Ramappa 
> ---
>  drivers/gpu/drm/i915/i915_perf.c | 39 
>  1 file changed, 34 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_perf.c 
b/drivers/gpu/drm/i915/i915_perf.c
> index 7b1c8de2f9cb..2d695818f006 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -772,10 +772,6 @@ static int gen8_append_oa_reports(struct 
i915_perf_stream *stream,
> * The reason field includes flags identifying what
> * triggered this specific report (mostly timer
> * triggered or e.g. due to a context switch).
> -   *
> -   * In MMIO triggered reports, some platforms do not set the
> -   * reason bit in this field and it is valid to have a reason
> -   * field of zero.
> */
>reason = oa_report_reason(stream, report);
>ctx_id = oa_context_id(stream, report32);
> @@ -787,8 +783,41 @@ static int gen8_append_oa_reports(struct 
i915_perf_stream *stream,
> *
> * Note: that we don't clear the valid_ctx_bit so userspace can
> * understand that the ID has been squashed by the kernel.
> +   *
> +   * Update:
> +   *
> +   * On XEHP platforms the behavior of context id valid bit has
> +   * changed compared to prior platforms. To describe this, we
> +   * define a few terms:
> +   *
> +   * context-switch-report: This is a report with the reason type
> +   * being context-switch. It is generated when a context switches
> +   * out.
> +   *
> +   * context-valid-bit: A bit that is set in the report ID field
> +   * to indicate that a valid context has been loaded.
> +   *
> +   * gpu-idle: A condition characterized by a
> +   * context-switch-report with context-valid-bit set to 0.
> +   *
> +   * On prior platforms, context-id-valid bit is set to 0 only
> +   * when GPU goes idle. In all other reports, it is set to 1.
> +   *
> +   * On XEHP platforms, context-valid-bit is set to 1 in a context
> +   * switch report if a new context switched in. For all other
> +   * reports it is set to 0.
> +   *
> +   * This change in behavior causes an issue with MMIO triggered
> +   * reports. MMIO triggered reports have the markers in the
> +   * context ID field and the context-valid-bit is 0. The logic
> +   * below to squash the context ID would render the report
> +   * useless since the user will not be able to find it in the OA
> +   * buffer. Since MMIO triggered reports exist only on XEHP,
> +   * we should avoid squashing these for XEHP platforms.

Hmm I am wondering if this is over-information and this comment should be
made brief.


Let me try: "For Gen's >= 12.50, the context id valid bit is reset when a
context switches out, but the context id is still valid. Because of this we
cannot squash the context id in this case".

So this should affect both the regular as well as MMIO triggered cases
afaiu.

Anyway, please do what you think is right with the comment. I just thought
I'll chime in.


The long and descriptive comment is entirely for my benefit. There is a 
very good chance I will forget this, so putting it down in the code.  
Also, I don't see this described in the spec, so thinking that we will 
benefit from it by having it here. I can put it in the commit msg 
instead if that helps.


Thanks,
Umesh




For the record, here's the explanation of what is happening from Robert
Krzemien's email (which at least makes it simpler for me to understand
what is happening):

For Gen12HP+ (ATS/DG2/PVC/MTL+) platforms, context id valid bit is
set only for context switch reports and when a context is being
loaded. When exiting a context, a context switch report is
generated, ctx id is not zero, but the bit is not set. It allows us
to distinguish whether context switch reports are generated due to
ente

[PATCH] drm/i915/perf: Update handling of MMIO triggered reports

2023-12-18 Thread Umesh Nerlige Ramappa
On XEHP platforms user is not able to find MMIO triggered reports in the
OA buffer since i915 squashes the context ID fields. These context ID
fields hold the MMIO trigger markers.

Update logic to not squash the context ID fields of MMIO triggered
reports.

Fixes: cba94bbcff08 ("drm/i915/perf: Determine context valid in OA reports")
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 39 
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 7b1c8de2f9cb..2d695818f006 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -772,10 +772,6 @@ static int gen8_append_oa_reports(struct i915_perf_stream 
*stream,
 * The reason field includes flags identifying what
 * triggered this specific report (mostly timer
 * triggered or e.g. due to a context switch).
-*
-* In MMIO triggered reports, some platforms do not set the
-* reason bit in this field and it is valid to have a reason
-* field of zero.
 */
reason = oa_report_reason(stream, report);
ctx_id = oa_context_id(stream, report32);
@@ -787,8 +783,41 @@ static int gen8_append_oa_reports(struct i915_perf_stream 
*stream,
 *
 * Note: that we don't clear the valid_ctx_bit so userspace can
 * understand that the ID has been squashed by the kernel.
+*
+* Update:
+*
+* On XEHP platforms the behavior of context id valid bit has
+* changed compared to prior platforms. To describe this, we
+* define a few terms:
+*
+* context-switch-report: This is a report with the reason type
+* being context-switch. It is generated when a context switches
+* out.
+*
+* context-valid-bit: A bit that is set in the report ID field
+* to indicate that a valid context has been loaded.
+*
+* gpu-idle: A condition characterized by a
+* context-switch-report with context-valid-bit set to 0.
+*
+* On prior platforms, context-id-valid bit is set to 0 only
+* when GPU goes idle. In all other reports, it is set to 1.
+*
+* On XEHP platforms, context-valid-bit is set to 1 in a context
+* switch report if a new context switched in. For all other
+* reports it is set to 0.
+*
+* This change in behavior causes an issue with MMIO triggered
+* reports. MMIO triggered reports have the markers in the
+* context ID field and the context-valid-bit is 0. The logic
+* below to squash the context ID would render the report
+* useless since the user will not be able to find it in the OA
+* buffer. Since MMIO triggered reports exist only on XEHP,
+* we should avoid squashing these for XEHP platforms.
 */
-   if (oa_report_ctx_invalid(stream, report)) {
+
+   if (oa_report_ctx_invalid(stream, report) &&
+   GRAPHICS_VER_FULL(stream->engine->i915) < IP_VER(12, 50)) {
ctx_id = INVALID_CTX_ID;
oa_context_id_squash(stream, report32);
}
-- 
2.38.1



Re: [Intel-gfx] [PATCH] drm/i915/pmu: Check if pmu is closed before stopping event

2023-10-25 Thread Umesh Nerlige Ramappa

On Tue, Oct 24, 2023 at 02:20:33PM +0200, Andi Shyti wrote:

Hi Umesh,

On Fri, Oct 20, 2023 at 08:24:41AM -0700, Umesh Nerlige Ramappa wrote:

When the driver unbinds, pmu is unregistered and i915->uabi_engines is
set to RB_ROOT. Due to this, when i915 PMU tries to stop the engine
events, it issues a warn_on because engine lookup fails.

All perf hooks are taking care of this using a pmu->closed flag that is
set when PMU unregisters. The stop event seems to have been left out.

Check for pmu->closed in pmu_event_stop as well.

Based on discussion here -
https://patchwork.freedesktop.org/patch/492079/?series=105790&rev=2

v2: s/is/if/ in commit title
v3: Add fixes tag and cc stable

Cc:  # v5.11+
Fixes: b00bccb3f0bb ("drm/i915/pmu: Handle PCI unbind")
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Tvrtko Ursulin 


argh! 4th time that this patch has been sent. Please next time
use:

  git format-patch -v 

Reviewed-by: Andi Shyti 

No need to resend :-)


Sorry about the versioning. Will keep it in mind next time.

Thanks,
Umesh


Andi


---
 drivers/gpu/drm/i915/i915_pmu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 108b675088ba..f861863eb7c1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -831,9 +831,18 @@ static void i915_pmu_event_start(struct perf_event *event, 
int flags)

 static void i915_pmu_event_stop(struct perf_event *event, int flags)
 {
+   struct drm_i915_private *i915 =
+   container_of(event->pmu, typeof(*i915), pmu.base);
+   struct i915_pmu *pmu = &i915->pmu;
+
+   if (pmu->closed)
+   goto out;
+
if (flags & PERF_EF_UPDATE)
i915_pmu_event_read(event);
i915_pmu_disable(event);
+
+out:
event->hw.state = PERF_HES_STOPPED;
 }

--
2.38.1


[Intel-gfx] [PATCH] drm/i915/pmu: Check if pmu is closed before stopping event

2023-10-20 Thread Umesh Nerlige Ramappa
When the driver unbinds, pmu is unregistered and i915->uabi_engines is
set to RB_ROOT. Due to this, when i915 PMU tries to stop the engine
events, it issues a warn_on because engine lookup fails.

All perf hooks are taking care of this using a pmu->closed flag that is
set when PMU unregisters. The stop event seems to have been left out.

Check for pmu->closed in pmu_event_stop as well.

Based on discussion here -
https://patchwork.freedesktop.org/patch/492079/?series=105790&rev=2

v2: s/is/if/ in commit title
v3: Add fixes tag and cc stable

Cc:  # v5.11+
Fixes: b00bccb3f0bb ("drm/i915/pmu: Handle PCI unbind")
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_pmu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 108b675088ba..f861863eb7c1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -831,9 +831,18 @@ static void i915_pmu_event_start(struct perf_event *event, 
int flags)
 
 static void i915_pmu_event_stop(struct perf_event *event, int flags)
 {
+   struct drm_i915_private *i915 =
+   container_of(event->pmu, typeof(*i915), pmu.base);
+   struct i915_pmu *pmu = &i915->pmu;
+
+   if (pmu->closed)
+   goto out;
+
if (flags & PERF_EF_UPDATE)
i915_pmu_event_read(event);
i915_pmu_disable(event);
+
+out:
event->hw.state = PERF_HES_STOPPED;
 }
 
-- 
2.38.1



[Intel-gfx] [PATCH] drm/i915/pmu: Check if pmu is closed before stopping event

2023-10-19 Thread Umesh Nerlige Ramappa
When the driver unbinds, pmu is unregistered and i915->uabi_engines is
set to RB_ROOT. Due to this, when i915 PMU tries to stop the engine
events, it issues a warn_on because engine lookup fails.

All perf hooks are taking care of this using a pmu->closed flag that is
set when PMU unregisters. The stop event seems to have been left out.

Check for pmu->closed in pmu_event_stop as well.

Based on discussion here -
https://patchwork.freedesktop.org/patch/492079/?series=105790&rev=2

v2: s/is/if/ in commit title

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 108b675088ba..f861863eb7c1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -831,9 +831,18 @@ static void i915_pmu_event_start(struct perf_event *event, 
int flags)
 
 static void i915_pmu_event_stop(struct perf_event *event, int flags)
 {
+   struct drm_i915_private *i915 =
+   container_of(event->pmu, typeof(*i915), pmu.base);
+   struct i915_pmu *pmu = &i915->pmu;
+
+   if (pmu->closed)
+   goto out;
+
if (flags & PERF_EF_UPDATE)
i915_pmu_event_read(event);
i915_pmu_disable(event);
+
+out:
event->hw.state = PERF_HES_STOPPED;
 }
 
-- 
2.38.1



[Intel-gfx] [PATCH] drm/i915/pmu: Check is pmu is closed before stopping event

2023-10-19 Thread Umesh Nerlige Ramappa
When the driver unbinds, pmu is unregistered and i915->uabi_engines is
set to RB_ROOT. Due to this, when i915 PMU tries to stop the engine
events, it issues a warn_on because engine lookup fails.

All perf hooks are taking care of this using a pmu->closed flag that is
set when PMU unregisters. The stop event seems to have been left out.

Check for pmu->closed in pmu_event_stop as well.

Based on discussion here -
https://patchwork.freedesktop.org/patch/492079/?series=105790&rev=2

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 108b675088ba..f861863eb7c1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -831,9 +831,18 @@ static void i915_pmu_event_start(struct perf_event *event, 
int flags)
 
 static void i915_pmu_event_stop(struct perf_event *event, int flags)
 {
+   struct drm_i915_private *i915 =
+   container_of(event->pmu, typeof(*i915), pmu.base);
+   struct i915_pmu *pmu = &i915->pmu;
+
+   if (pmu->closed)
+   goto out;
+
if (flags & PERF_EF_UPDATE)
i915_pmu_event_read(event);
i915_pmu_disable(event);
+
+out:
event->hw.state = PERF_HES_STOPPED;
 }
 
-- 
2.38.1



[Intel-gfx] [PATCH] drm/i915/pmu: Check is pmu is closed before stopping event

2023-10-19 Thread Umesh Nerlige Ramappa
When the driver unbinds, pmu is unregistered and i915->uabi_engines is
set to RB_ROOT. Due to this, when i915 PMU tries to stop the engine
events, it issues a warn_on because engine lookup fails.

All perf hooks are taking care of this using a pmu->closed flag that is
set when PMU unregisters. The stop event seems to have been left out.

Check for pmu->closed in pmu_event_stop as well.

Based on discussion here -
https://patchwork.freedesktop.org/patch/492079/?series=105790&rev=2

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 108b675088ba..f861863eb7c1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -831,9 +831,18 @@ static void i915_pmu_event_start(struct perf_event *event, 
int flags)
 
 static void i915_pmu_event_stop(struct perf_event *event, int flags)
 {
+   struct drm_i915_private *i915 =
+   container_of(event->pmu, typeof(*i915), pmu.base);
+   struct i915_pmu *pmu = &i915->pmu;
+
+   if (pmu->closed)
+   goto out;
+
if (flags & PERF_EF_UPDATE)
i915_pmu_event_read(event);
i915_pmu_disable(event);
+
+out:
event->hw.state = PERF_HES_STOPPED;
 }
 
-- 
2.38.1



Re: [Intel-gfx] [PATCH 1/3] drm/i915/guc: Support new and improved engine busyness

2023-10-03 Thread Umesh Nerlige Ramappa

On Fri, Sep 22, 2023 at 03:25:08PM -0700, john.c.harri...@intel.com wrote:

From: John Harrison 

The GuC has been extended to support a much more friendly engine
busyness interface. So partition the old interface into a 'busy_v1'
space and add 'busy_v2' support alongside. And if v2 is available, use
that in preference to v1. Note that v2 provides extra features over
and above v1 which will be exposed via PMU in subsequent patches.


Since we are thinking of using the existing busyness counter to expose 
the v2 values, we can drop the last sentence from above.




Signed-off-by: John Harrison 
---
drivers/gpu/drm/i915/gt/intel_engine_types.h  |   4 +-
.../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   4 +-
drivers/gpu/drm/i915/gt/uc/intel_guc.h|  82 ++--
drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c|  55 ++-
drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h|   9 +-
drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
.../gpu/drm/i915/gt/uc/intel_guc_submission.c | 381 ++
7 files changed, 427 insertions(+), 131 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index a7e6775980043..40fd8f984d64b 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -323,7 +323,7 @@ struct intel_engine_execlists_stats {
ktime_t start;
};

-struct intel_engine_guc_stats {
+struct intel_engine_guc_stats_v1 {
/**
 * @running: Active state of the engine when busyness was last sampled.
 */
@@ -603,7 +603,7 @@ struct intel_engine_cs {
struct {
union {
struct intel_engine_execlists_stats execlists;
-   struct intel_engine_guc_stats guc;
+   struct intel_engine_guc_stats_v1 guc_v1;
};


Overall, I would suggest having the renames as a separate patch. Would 
make the review easier.




/**
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h 
b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index f359bef046e0b..c190a99a36c38 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -137,7 +137,9 @@ enum intel_guc_action {
INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
INTEL_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
-   INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
+   INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF_V1 = 0x550A,
+   INTEL_GUC_ACTION_SET_DEVICE_ENGINE_UTILIZATION_V2 = 0x550C,
+   INTEL_GUC_ACTION_SET_FUNCTION_ENGINE_UTILIZATION_V2 = 0x550D,
INTEL_GUC_ACTION_STATE_CAPTURE_NOTIFICATION = 0x8002,
INTEL_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE = 0x8003,
INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED = 0x8004,
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 6c392bad29c19..e6502ab5f049f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -226,45 +226,61 @@ struct intel_guc {
struct mutex send_mutex;

/**
-* @timestamp: GT timestamp object that stores a copy of the timestamp
-* and adjusts it for overflow using a worker.
+* @busy: Data used by the different versions of engine busyness 
implementations.
 */
-   struct {
-   /**
-* @lock: Lock protecting the below fields and the engine stats.
-*/
-   spinlock_t lock;
-
-   /**
-* @gt_stamp: 64 bit extended value of the GT timestamp.
-*/
-   u64 gt_stamp;
-
-   /**
-* @ping_delay: Period for polling the GT timestamp for
-* overflow.
-*/
-   unsigned long ping_delay;
-
-   /**
-* @work: Periodic work to adjust GT timestamp, engine and
-* context usage for overflows.
-*/
-   struct delayed_work work;
-
+   union {
/**
-* @shift: Right shift value for the gpm timestamp
+* @v1: Data used by v1 engine busyness implementation. Mostly 
a copy
+* of the GT timestamp extended to 64 bits and the worker for 
maintaining it.
 */
-   u32 shift;
+   struct {
+   /**
+* @lock: Lock protecting the below fields and the 
engine stats.
+*/
+   spinlock_t lock;
+
+   /**
+* @gt_stamp: 64 bit extended value of the GT timestamp.
+*/
+   u64 gt_stamp;
+
+   /**
+* @ping_delay: Period for polling the GT timestam

Re: [Intel-gfx] [PATCH i-g-t] tools/intel_gpu_top: Restore user friendly error message

2023-09-29 Thread Umesh Nerlige Ramappa

On Fri, Sep 29, 2023 at 12:09:49PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

We have a nice error message displayed when a user with insufficient
permissions tries to run the tool, but that got lost while Meteorlake
support was added. Bring it back in.

v2:
* Propagate unexpected errno on multi-tile systems too. (Umesh)

Signed-off-by: Tvrtko Ursulin 
Cc: Umesh Nerlige Ramappa 
---
tools/intel_gpu_top.c | 8 +---
1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 87e9681e53b4..10601e66b18e 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -554,9 +554,9 @@ static int get_num_gts(uint64_t type)

close(fd);
}
-   assert(!errno || errno == ENOENT);
-   assert(cnt > 0);
-   errno = 0;
+
+   if (!cnt || (errno && errno != ENOENT))
+   cnt = -errno;


Reviewed-by: Umesh Nerlige Ramappa 

Thanks,
Umesh


return cnt;
}
@@ -590,6 +590,8 @@ static int pmu_init(struct engines *engines)
engines->fd = -1;
engines->num_counters = 0;
engines->num_gts = get_num_gts(type);
+   if (engines->num_gts <= 0)
+   return -1;

engines->irq.config = I915_PMU_INTERRUPTS;
fd = _open_pmu(type, engines->num_counters, &engines->irq, engines->fd);
--
2.39.2



Re: [Intel-gfx] [PATCH i-g-t 03/12] tools/intel_gpu_top: Restore user friendly error message

2023-09-28 Thread Umesh Nerlige Ramappa

On Thu, Sep 28, 2023 at 09:16:23AM +0100, Tvrtko Ursulin wrote:


On 27/09/2023 21:13, Umesh Nerlige Ramappa wrote:

On Fri, Sep 22, 2023 at 02:44:28PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

We have a nice error message displayed when a user with insufficient
permissions tries to run the tool, but that got lost while Meteorlake
support was added. Bring it back in.

Signed-off-by: Tvrtko Ursulin 
Cc: Umesh Nerlige Ramappa 
---
tools/intel_gpu_top.c | 10 +++---
1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 87e9681e53b4..e01355f90458 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -554,9 +554,11 @@ static int get_num_gts(uint64_t type)

    close(fd);
}
-    assert(!errno || errno == ENOENT);
-    assert(cnt > 0);
-    errno = 0;
+
+    if (!cnt)
+    cnt = errno;
+    else
+    errno = 0;


ENOENT is the only way this logic is checking for num_gts.

In this case error is propagated only if cnt == 0. What if cnt=1 and 
we get an error (other than ENOENT)? Should we ignore that?


If cnt >= 1 then at least one tile was found so the errno happened 
while probing for further tiles. So on single tile parts it can be 
ignored.


I am actually only referring to single tile parts. The for loop iterates 
over MAX_GTs (4), so I am expecting an ENOENT from a single tile part 
when cnt >= 1. Anything else is an error/failure that we should flag.


On multi-tile parts it cannot really happen, or even if it happens 
situation would simply be "why is only one tile showing". If we want to 
cover this impossible/unlikely case then maybe like this:


if (!cnt || (errno && errno != ENOENT))
cnt = -errno;


If you agree to the above logic, then this condition should do the 
trick.


Regards,
Umesh


I had something like this in mind for the regression (and sorry this 
fell through the cracks)


https://patchwork.freedesktop.org/patch/541406/?series=118973&rev=1


Oh back in June!

I think yours works too, in which case it's a matter of a style choice 
with which one to go. I don't have a strong preference - above would 
be a bit more compact, while I think it still succinctly expresses the 
failure condition ("nothing found or unexpected error while probing 
for remote tiles").


Regards,

Tvrtko



Regards,
Umesh



return cnt;
}
@@ -590,6 +592,8 @@ static int pmu_init(struct engines *engines)
engines->fd = -1;
engines->num_counters = 0;
engines->num_gts = get_num_gts(type);
+    if (engines->num_gts <= 0)
+    return -1;

engines->irq.config = I915_PMU_INTERRUPTS;
fd = _open_pmu(type, engines->num_counters, &engines->irq, 
engines->fd);

--
2.39.2



Re: [Intel-gfx] [PATCH 2/3] drm/i915/mtl: Add a PMU counter for total active ticks

2023-09-27 Thread Umesh Nerlige Ramappa

On Mon, Sep 25, 2023 at 09:40:46AM +0100, Tvrtko Ursulin wrote:


On 22/09/2023 23:25, john.c.harri...@intel.com wrote:

From: Umesh Nerlige Ramappa 

Current engine busyness interface exposed by GuC has a few issues:

- The busyness of active engine is calculated using 2 values provided by
  GuC and is prone to race between CPU reading those values and GuC
  updating them. Any sort of HW synchronization would be at the cost of
  scheduling latencies.

- GuC provides only 32 bit values for busyness and KMD has to run a
  worker to extend the values to 64 bit. In addition KMD also needs to
  extend the GT timestamp to 64 bits so that it can be used to calculate
  active busyness for an engine.

To address these issues, GuC provides a new interface to calculate
engine busyness. GuC accumulates the busyness ticks in a 64 bit value
and also internally updates the busyness for an active context using a
periodic timer. This simplifies the KMD implementation such that KMD
only needs to relay the busyness value to the user.

In addition to fixing the interface, GuC also provides a periodically
updated count of the total active ticks that the GT has been running for.
This counter is exposed to the user so that the % busyness can be
calculated as follows:

busyness % = (engine active ticks/total active ticks) * 100.


AFAIU I915_PMU_TOTAL_ACTIVE_TICKS only runs when GT is awake, right?

So if GT is awake 10% of the time, and engine is busy 100% of 
that time, which is 10% of the real/wall time, the busyness by this 
formula comes up as 100%. Which wouldn't be useful for intel_gpu_top 
and alike.


How to scale it back to wall time? Again AFAIU there is no info about 
tick frequency, so how does one know what a delta in total active 
ticks means?


Looks like I got this wrong. The implementation is actually updating the 
total active ticks even when idle and that addresses the concern above.




Going back on the higher level, I am not convinced we need to add a 
new uapi just for MTL. If the tick period is known internally we could 
just use v2 internally and expose the current uapi using it.


We did plan to support the total active ticks in future platforms for 
other use cases and thought this would be a good place to initiate the 
support. At the same time, I agree that existing interface can still 
work with the v2 GuC interface. I will post that.




Any timebase conversion error is unlikely to be relevant because 
userspace only looks at deltas over relatively short periods 
(seconds). Ie. I don't think that the clock drift error would 
accumulate so it would need to be really huge to be relevant over 
short sampling periods.


At some point we may need to think about long running workloads, but 
that may require a different counter anyways, so I would not address it 
here.


Thanks,
Umesh



Regards,

Tvrtko



Implement the new interface and start by adding a new counter for total
active ticks.

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: John Harrison 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 24 +++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |  1 +
 drivers/gpu/drm/i915/i915_pmu.c   |  6 +
 include/uapi/drm/i915_drm.h   |  2 ++
 4 files changed, 33 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 88465d701c278..0c1fee5360777 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1607,6 +1607,30 @@ static ktime_t busy_v2_guc_engine_busyness(struct 
intel_engine_cs *engine, ktime
return ns_to_ktime(total);
 }
+static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc)
+{
+   return guc->busy.v1.gt_stamp;
+}
+
+static u64 busy_v2_intel_guc_total_active_ticks(struct intel_guc *guc)
+{
+   u64 ticks_gt;
+
+   __busy_v2_get_engine_usage_record(guc, NULL, NULL, NULL, &ticks_gt);
+
+   return ticks_gt;
+}
+
+u64 intel_guc_total_active_ticks(struct intel_gt *gt)
+{
+   struct intel_guc *guc = &gt->uc.guc;
+
+   if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1))
+   return busy_v1_intel_guc_total_active_ticks(guc);
+   else
+   return busy_v2_intel_guc_total_active_ticks(guc);
+}
+
 static int busy_v2_guc_action_enable_usage_stats_device(struct intel_guc *guc)
 {
u32 offset = guc_engine_usage_offset_v2_device(guc);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c57b29cdb1a64..f6d42838825f2 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -30,6 +30,7 @@ void intel_guc_dump_active_requests(struct intel_engine_cs 
*engine,
struct drm_printer *m);
 void intel_guc_busyness_park(struct intel_gt *gt);
 void intel_guc_busyness_

Re: [Intel-gfx] [PATCH i-g-t 03/12] tools/intel_gpu_top: Restore user friendly error message

2023-09-27 Thread Umesh Nerlige Ramappa

On Fri, Sep 22, 2023 at 02:44:28PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

We have a nice error message displayed when a user with insufficient
permissions tries to run the tool, but that got lost while Meteorlake
support was added. Bring it back in.

Signed-off-by: Tvrtko Ursulin 
Cc: Umesh Nerlige Ramappa 
---
tools/intel_gpu_top.c | 10 +++---
1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 87e9681e53b4..e01355f90458 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -554,9 +554,11 @@ static int get_num_gts(uint64_t type)

close(fd);
}
-   assert(!errno || errno == ENOENT);
-   assert(cnt > 0);
-   errno = 0;
+
+   if (!cnt)
+   cnt = errno;
+   else
+   errno = 0;


ENOENT is the only way this logic is checking for num_gts.

In this case the error is propagated only if cnt == 0. What if cnt == 1 and we 
get an error (other than ENOENT)? Should we ignore that?


I had something like this in mind for the regression (and sorry this 
fell through the cracks)


https://patchwork.freedesktop.org/patch/541406/?series=118973&rev=1

Regards,
Umesh



return cnt;
}
@@ -590,6 +592,8 @@ static int pmu_init(struct engines *engines)
engines->fd = -1;
engines->num_counters = 0;
engines->num_gts = get_num_gts(type);
+   if (engines->num_gts <= 0)
+   return -1;

engines->irq.config = I915_PMU_INTERRUPTS;
fd = _open_pmu(type, engines->num_counters, &engines->irq, engines->fd);
--
2.39.2



[Intel-gfx] [PATCH] i915/guc: Get runtime pm in busyness worker only if already active

2023-09-25 Thread Umesh Nerlige Ramappa
Ideally the busyness worker should take a gt pm wakeref because the
worker only needs to be active while gt is awake. However, the gt_park
path cancels the worker synchronously and this complicates the flow if
the worker is also running at the same time. The cancel waits for the
worker and when the worker releases the wakeref, that would call gt_park
and would lead to a deadlock.

The resolution is to take the global pm wakeref if runtime pm is already
active. If not, we don't need to update the busyness stats as the stats
would already be updated when the gt was parked.

Note:
- We do not requeue the worker if we cannot take a reference to runtime
  pm since intel_guc_busyness_unpark would requeue the worker in the
  resume path.

- If the gt was parked longer than time taken for GT timestamp to roll
  over, we ignore those rollovers since we don't care about tracking the
  exact GT time. We only care about roll overs when the gt is active and
  running workloads.

- There is a window of time between gt_park and runtime suspend, where
  the worker may run. This is acceptable since the worker will not find
  any new data to update busyness.

v2: (Daniele)
- Edit commit message and code comment
- Use runtime pm in the worker
- Put runtime pm after enabling the worker
- Use Link tag and add Fixes tag

v3: (Daniele)
- Reword commit and comments and add details

Link: https://gitlab.freedesktop.org/drm/intel/-/issues/7077
Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to 
pmu")
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Daniele Ceraolo Spurio 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 38 +--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index cabdc645fcdd..ae3495a9c814 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1432,6 +1432,36 @@ static void guc_timestamp_ping(struct work_struct *wrk)
unsigned long index;
int srcu, ret;
 
+   /*
+* Ideally the busyness worker should take a gt pm wakeref because the
+* worker only needs to be active while gt is awake. However, the
+* gt_park path cancels the worker synchronously and this complicates
+* the flow if the worker is also running at the same time. The cancel
+* waits for the worker and when the worker releases the wakeref, that
+* would call gt_park and would lead to a deadlock.
+*
+* The resolution is to take the global pm wakeref if runtime pm is
+* already active. If not, we don't need to update the busyness stats as
+* the stats would already be updated when the gt was parked.
+*
+* Note:
+* - We do not requeue the worker if we cannot take a reference to 
runtime
+*   pm since intel_guc_busyness_unpark would requeue the worker in the
+*   resume path.
+*
+* - If the gt was parked longer than time taken for GT timestamp to 
roll
+*   over, we ignore those rollovers since we don't care about tracking
+*   the exact GT time. We only care about roll overs when the gt is
+*   active and running workloads.
+*
+* - There is a window of time between gt_park and runtime suspend,
+*   where the worker may run. This is acceptable since the worker will
+*   not find any new data to update busyness.
+*/
+   wakeref = intel_runtime_pm_get_if_active(&gt->i915->runtime_pm);
+   if (!wakeref)
+   return;
+
/*
 * Synchronize with gt reset to make sure the worker does not
 * corrupt the engine/guc stats. NB: can't actually block waiting
@@ -1440,10 +1470,9 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 */
ret = intel_gt_reset_trylock(gt, &srcu);
if (ret)
-   return;
+   goto err_trylock;
 
-   with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
-   __update_guc_busyness_stats(guc);
+   __update_guc_busyness_stats(guc);
 
/* adjust context stats for overflow */
xa_for_each(&guc->context_lookup, index, ce)
@@ -1452,6 +1481,9 @@ static void guc_timestamp_ping(struct work_struct *wrk)
intel_gt_reset_unlock(gt, srcu);
 
guc_enable_busyness_worker(guc);
+
+err_trylock:
+   intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);
 }
 
 static int guc_action_enable_usage_stats(struct intel_guc *guc)
-- 
2.38.1



Re: [Intel-gfx] ✗ Fi.CI.BAT: failure for i915/guc: Get runtime pm in busyness worker only if already active

2023-09-15 Thread Umesh Nerlige Ramappa

On Fri, Sep 15, 2023 at 05:37:53AM +0000, Patchwork wrote:

  Patch Details

Series:  i915/guc: Get runtime pm in busyness worker only if already active
URL: [1]https://patchwork.freedesktop.org/series/123744/
State:   failure
Details: 
[2]https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_123744v1/index.html

 CI Bug Log - changes from CI_DRM_13635 -> Patchwork_123744v1

Summary

  FAILURE

  Serious unknown changes coming with Patchwork_123744v1 absolutely need to
  be
  verified manually.

  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_123744v1, please notify your bug team
  (lgci.bug.fil...@intel.com) to allow them
  to document this new failure mode, which will reduce false positives in
  CI.

  External URL:
  https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_123744v1/index.html

Participating hosts (40 -> 40)

  Additional (1): fi-kbl-soraka
  Missing (1): fi-snb-2520m

Possible new issues

  Here are the unknown changes that may have been introduced in
  Patchwork_123744v1:

 IGT changes

   Possible regressions

* igt@i915_selftest@live@hangcheck:

 * fi-skl-guc: [3]PASS -> [4]DMESG-FAIL


Not related to changes in this patch. Ran the test a couple times on MTL 
and not seeing any failures.




* igt@i915_selftest@live@perf:

 * fi-kbl-soraka: NOTRUN -> [5]ABORT


Unrelated since this does not use GuC.

Umesh



Known issues



Re: [Intel-gfx] ✗ Fi.CI.IGT: failure for i915/pmu: Move execlist stats initialization to execlist specific setup (rev2)

2023-09-15 Thread Umesh Nerlige Ramappa

On Fri, Sep 15, 2023 at 09:02:05AM -0700, Umesh Nerlige Ramappa wrote:

On Thu, Sep 14, 2023 at 04:18:34AM +0000, Patchwork wrote:

 Patch Details

Series:  i915/pmu: Move execlist stats initialization to execlist specific setup
   (rev2)
URL: [1]https://patchwork.freedesktop.org/series/123616/
State:   failure
Details: 
[2]https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_123616v2/index.html

   CI Bug Log - changes from CI_DRM_13627_full -> Patchwork_123616v2_full

Summary

 FAILURE

 Serious unknown changes coming with Patchwork_123616v2_full absolutely
 need to be
 verified manually.

 If you think the reported changes have nothing to do with the changes
 introduced in Patchwork_123616v2_full, please notify your bug team
 (lgci.bug.fil...@intel.com) to allow them
 to document this new failure mode, which will reduce false positives in
 CI.

Participating hosts (9 -> 10)

 Additional (1): shard-tglu0

Possible new issues

 Here are the unknown changes that may have been introduced in
 Patchwork_123616v2_full:

IGT changes

  Possible regressions

   * igt@perf_pmu@rc6-all-gts:

* shard-mtlp: NOTRUN -> [3]ABORT


This is an existing bug and not related to this patch:

https://patchwork.freedesktop.org/series/123616/#rev2


Bug link: https://gitlab.freedesktop.org/drm/intel/issues/9335


Thanks,
Umesh


  Suppressed

 The following results come from untrusted machines, tests, or statuses.
 They do not affect the overall result.

   * {igt@kms_feature_discovery@display-4x}:

* shard-mtlp: NOTRUN -> [4]SKIP

New tests

 New tests have been introduced between CI_DRM_13627_full and
 Patchwork_123616v2_full:

New IGT tests (4)

   * igt@kms_atomic_transition@plane-all-transition-nonblocking@pipe-a-hdmi-a-4:

* Statuses : 1 pass(s)
* Exec time: [0.0] s

   * igt@kms_atomic_transition@plane-all-transition-nonblocking@pipe-b-hdmi-a-4:

* Statuses : 1 pass(s)
* Exec time: [0.0] s

   * igt@kms_cursor_crc@cursor-onscreen-128x128@pipe-a-hdmi-a-4:

* Statuses : 1 pass(s)
* Exec time: [0.0] s

   * igt@kms_cursor_crc@cursor-onscreen-128x128@pipe-d-hdmi-a-4:

* Statuses : 1 pass(s)
* Exec time: [0.0] s

Known issues


...


Re: [Intel-gfx] ✗ Fi.CI.IGT: failure for i915/pmu: Move execlist stats initialization to execlist specific setup (rev2)

2023-09-15 Thread Umesh Nerlige Ramappa

On Thu, Sep 14, 2023 at 04:18:34AM +0000, Patchwork wrote:

  Patch Details

Series:  i915/pmu: Move execlist stats initialization to execlist specific setup
(rev2)
URL: [1]https://patchwork.freedesktop.org/series/123616/
State:   failure
Details: 
[2]https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_123616v2/index.html

CI Bug Log - changes from CI_DRM_13627_full -> Patchwork_123616v2_full

Summary

  FAILURE

  Serious unknown changes coming with Patchwork_123616v2_full absolutely
  need to be
  verified manually.

  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_123616v2_full, please notify your bug team
  (lgci.bug.fil...@intel.com) to allow them
  to document this new failure mode, which will reduce false positives in
  CI.

Participating hosts (9 -> 10)

  Additional (1): shard-tglu0

Possible new issues

  Here are the unknown changes that may have been introduced in
  Patchwork_123616v2_full:

 IGT changes

   Possible regressions

* igt@perf_pmu@rc6-all-gts:

 * shard-mtlp: NOTRUN -> [3]ABORT


This is an existing bug and not related to this patch:

https://patchwork.freedesktop.org/series/123616/#rev2

Thanks,
Umesh


   Suppressed

  The following results come from untrusted machines, tests, or statuses.
  They do not affect the overall result.

* {igt@kms_feature_discovery@display-4x}:

 * shard-mtlp: NOTRUN -> [4]SKIP

New tests

  New tests have been introduced between CI_DRM_13627_full and
  Patchwork_123616v2_full:

 New IGT tests (4)

* 
igt@kms_atomic_transition@plane-all-transition-nonblocking@pipe-a-hdmi-a-4:

 * Statuses : 1 pass(s)
 * Exec time: [0.0] s

* 
igt@kms_atomic_transition@plane-all-transition-nonblocking@pipe-b-hdmi-a-4:

 * Statuses : 1 pass(s)
 * Exec time: [0.0] s

* igt@kms_cursor_crc@cursor-onscreen-128x128@pipe-a-hdmi-a-4:

 * Statuses : 1 pass(s)
 * Exec time: [0.0] s

* igt@kms_cursor_crc@cursor-onscreen-128x128@pipe-d-hdmi-a-4:

 * Statuses : 1 pass(s)
 * Exec time: [0.0] s

Known issues


...


[Intel-gfx] [PATCH] i915/guc: Get runtime pm in busyness worker only if already active

2023-09-14 Thread Umesh Nerlige Ramappa
Ideally the busyness worker should take a gt pm wakeref because the
worker only needs to be active while gt is awake. However, the gt_park
path cancels the worker synchronously and this complicates the flow if
the worker is also running at the same time. The cancel waits for the
worker and when the worker releases the wakeref, that would call gt_park
and would lead to a deadlock.

The resolution is to take the global pm wakeref if runtime pm is already
active. If not, we don't need to update the busyness stats as the stats
would already be updated when the gt was parked.

Note:
- We do not requeue the worker if we cannot take a reference to runtime
  pm since intel_guc_busyness_unpark would requeue the worker in the
  resume path.

- If the gt was parked longer than time taken for GT timestamp to roll
  over, we ignore those rollovers since we don't care about tracking the
  exact GT time. We only care about roll overs when the gt is active and
  running workloads.

- There is a window of time between gt_park and runtime suspend, where
  the worker may run. This is acceptable since the worker will not find
  any new data to update busyness.

v2: (Daniele)
- Edit commit message and code comment
- Use runtime pm in the worker
- Put runtime pm after enabling the worker
- Use Link tag and add Fixes tag

v3: (Daniele)
- Reword commit and comments and add details

Link: https://gitlab.freedesktop.org/drm/intel/-/issues/7077
Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to 
pmu")
Signed-off-by: Umesh Nerlige Ramappa 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 38 +--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index cabdc645fcdd..ae3495a9c814 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1432,6 +1432,36 @@ static void guc_timestamp_ping(struct work_struct *wrk)
unsigned long index;
int srcu, ret;
 
+   /*
+* Ideally the busyness worker should take a gt pm wakeref because the
+* worker only needs to be active while gt is awake. However, the
+* gt_park path cancels the worker synchronously and this complicates
+* the flow if the worker is also running at the same time. The cancel
+* waits for the worker and when the worker releases the wakeref, that
+* would call gt_park and would lead to a deadlock.
+*
+* The resolution is to take the global pm wakeref if runtime pm is
+* already active. If not, we don't need to update the busyness stats as
+* the stats would already be updated when the gt was parked.
+*
+* Note:
+* - We do not requeue the worker if we cannot take a reference to 
runtime
+*   pm since intel_guc_busyness_unpark would requeue the worker in the
+*   resume path.
+*
+* - If the gt was parked longer than time taken for GT timestamp to 
roll
+*   over, we ignore those rollovers since we don't care about tracking
+*   the exact GT time. We only care about roll overs when the gt is
+*   active and running workloads.
+*
+* - There is a window of time between gt_park and runtime suspend,
+*   where the worker may run. This is acceptable since the worker will
+*   not find any new data to update busyness.
+*/
+   wakeref = intel_runtime_pm_get_if_active(&gt->i915->runtime_pm);
+   if (!wakeref)
+   return;
+
/*
 * Synchronize with gt reset to make sure the worker does not
 * corrupt the engine/guc stats. NB: can't actually block waiting
@@ -1440,10 +1470,9 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 */
ret = intel_gt_reset_trylock(gt, &srcu);
if (ret)
-   return;
+   goto err_trylock;
 
-   with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
-   __update_guc_busyness_stats(guc);
+   __update_guc_busyness_stats(guc);
 
/* adjust context stats for overflow */
xa_for_each(&guc->context_lookup, index, ce)
@@ -1452,6 +1481,9 @@ static void guc_timestamp_ping(struct work_struct *wrk)
intel_gt_reset_unlock(gt, srcu);
 
guc_enable_busyness_worker(guc);
+
+err_trylock:
+   intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);
 }
 
 static int guc_action_enable_usage_stats(struct intel_guc *guc)
-- 
2.38.1



Re: [Intel-gfx] [PATCH 3/3] drm/i915/perf: Initialize gen12 OA buffer unconditionally

2023-09-12 Thread Umesh Nerlige Ramappa

On Fri, Sep 08, 2023 at 06:24:16PM -0700, Dixit, Ashutosh wrote:

On Fri, 08 Sep 2023 18:16:26 -0700, Ashutosh Dixit wrote:




Hi Umesh,


From: Umesh Nerlige Ramappa 

Correct values for OAR counters are still dependent on enabling the
GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE in OAG_OACONTROL. Enabling this
bit means OAG unit will write reports to the OAG buffer, so
initialize the OAG buffer unconditionally for all use cases.

BSpec: 46822

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_perf.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 1347e4ec9dd5a..30cf37d6e79be 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -3032,12 +3032,12 @@ static void gen12_oa_enable(struct i915_perf_stream 
*stream)
u32 val;

/*
-* If we don't want OA reports from the OA buffer, then we don't even
-* need to program the OAG unit.
+* BSpec: 46822
+* Correct values for OAR counters are still dependent on enabling the
+* GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE in OAG_OACONTROL. Enabling this
+* bit means OAG unit will write reports to the OAG buffer, so
+* initialize the OAG buffer correctly.
 */
-   if (!(stream->sample_flags & SAMPLE_OA_REPORT))
-   return;
-
gen12_init_oa_buffer(stream);

regs = __oa_regs(stream);


Looks like this should be needed, I can R-b it.

However, gen12_test_mi_rpc IGT says:

/* OA unit configuration:
 * DRM_I915_PERF_PROP_SAMPLE_OA is no longer required for Gen12
 * because the OAR unit increments counters only for the
 * relevant context. No other parameters are needed since we do
 * not rely on the OA buffer anymore to normalize the counter
 * values.
 */


That's wrong. When TGL support was added, this was misunderstood and I 
removed the OAR-OAG dependency. Ideally we should require the user to always 
pass SAMPLE_OA, but doing that now would break the uABI.


For the OAR case, let's just enable OAG unconditionally so that the 
OAR counters tick correctly. While we do that, we should disable all 
events that trigger a report into the OA buffer. In addition, I would 
also allocate the smallest OA buffer size for this case, so that memory 
impact is low.


Needs a Fixes tag with the commit that enabled OA for TGL.

Regards,
Umesh



So gen12_test_mi_rpc doesn't set DRM_I915_PERF_PROP_SAMPLE_OA and also
seems to be passing in CI (don't see it but there seem to be no open
bugs). Thoughts?

Thanks.
--
Ashutosh


Re: [Intel-gfx] [PATCH 2/3] drm/i915/perf: Remove gtt_offset from stream->oa_buffer.head/.tail

2023-09-12 Thread Umesh Nerlige Ramappa
spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);

-   /* NB: oa_buffer.head/tail include the gtt_offset which we don't want
-* while indexing relative to oa_buf_base.
-*/
-   head -= gtt_offset;
-   tail -= gtt_offset;
-
/* An out of bounds or misaligned head or tail pointer implies a driver
 * bug since we validate + align the tail pointers we read from the
 * hardware and we are in full control of the head pointer which should
@@ -,13 +1089,8 @@ static int gen7_append_oa_reports(struct 
i915_perf_stream *stream,
if (start_offset != *offset) {
spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);

-   /* We removed the gtt_offset for the copy loop above, indexing
-* relative to oa_buf_base so put back here...
-*/
-   head += gtt_offset;
-
intel_uncore_write(uncore, GEN7_OASTATUS2,
-  (head & GEN7_OASTATUS2_HEAD_MASK) |
+  ((head + gtt_offset) & 
GEN7_OASTATUS2_HEAD_MASK) |
   GEN7_OASTATUS2_MEM_SELECT_GGTT);
stream->oa_buffer.head = head;

@@ -1705,7 +1678,7 @@ static void gen7_init_oa_buffer(struct i915_perf_stream 
*stream)
 */
intel_uncore_write(uncore, GEN7_OASTATUS2, /* head */
   gtt_offset | GEN7_OASTATUS2_MEM_SELECT_GGTT);
-   stream->oa_buffer.head = gtt_offset;
+   stream->oa_buffer.head = 0;

intel_uncore_write(uncore, GEN7_OABUFFER, gtt_offset);

@@ -1713,7 +1686,7 @@ static void gen7_init_oa_buffer(struct i915_perf_stream 
*stream)
   gtt_offset | OABUFFER_SIZE_16M);

/* Mark that we need updated tail pointers to read from... */
-   stream->oa_buffer.tail = gtt_offset;
+   stream->oa_buffer.tail = 0;

spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);

@@ -1747,7 +1720,7 @@ static void gen8_init_oa_buffer(struct i915_perf_stream 
*stream)

intel_uncore_write(uncore, GEN8_OASTATUS, 0);
intel_uncore_write(uncore, GEN8_OAHEADPTR, gtt_offset);
-   stream->oa_buffer.head = gtt_offset;
+   stream->oa_buffer.head = 0;

intel_uncore_write(uncore, GEN8_OABUFFER_UDW, 0);

@@ -1764,7 +1737,7 @@ static void gen8_init_oa_buffer(struct i915_perf_stream 
*stream)
intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & 
GEN8_OATAILPTR_MASK);

/* Mark that we need updated tail pointers to read from... */
-   stream->oa_buffer.tail = gtt_offset;
+   stream->oa_buffer.tail = 0;

/*
 * Reset state used to recognise context switches, affecting which
@@ -1801,7 +1774,7 @@ static void gen12_init_oa_buffer(struct i915_perf_stream 
*stream)
intel_uncore_write(uncore, __oa_regs(stream)->oa_status, 0);
intel_uncore_write(uncore, __oa_regs(stream)->oa_head_ptr,
   gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
-   stream->oa_buffer.head = gtt_offset;
+   stream->oa_buffer.head = 0;

/*
 * PRM says:
@@ -1817,7 +1790,7 @@ static void gen12_init_oa_buffer(struct i915_perf_stream 
*stream)
   gtt_offset & GEN12_OAG_OATAILPTR_MASK);

/* Mark that we need updated tail pointers to read from... */
-   stream->oa_buffer.tail = gtt_offset;
+   stream->oa_buffer.tail = 0;



Looks correct.

Reviewed-by: Umesh Nerlige Ramappa 

Thanks,
Umesh

/*
 * Reset state used to recognise context switches, affecting which
--
2.41.0



Re: [Intel-gfx] [PATCH 1/3] drm/i915/perf: Subtract gtt_offset from hw_tail

2023-09-12 Thread Umesh Nerlige Ramappa

On Fri, Sep 08, 2023 at 06:16:24PM -0700, Ashutosh Dixit wrote:

The code in oa_buffer_check_unlocked() is correct only if the OA buffer is
16 MB aligned (which seems to be the case today in i915). However when the
16 MB alignment is dropped, when we "Subtract partial amount off the tail",
the "& (OA_BUFFER_SIZE - 1)" operation in OA_TAKEN() will result in an
incorrect hw_tail value.

Therefore hw_tail must be brought to the same base as head and read_tail
prior to OA_TAKEN by subtracting gtt_offset from hw_tail.

Signed-off-by: Ashutosh Dixit 
---
drivers/gpu/drm/i915/i915_perf.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 018f42fff4cc0..ec0fc2934045a 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -565,6 +565,7 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
partial_report_size %= report_size;

/* Subtract partial amount off the tail */
+   hw_tail -= gtt_offset;
hw_tail = OA_TAKEN(hw_tail, partial_report_size);


I see partial_report_size is a value in the 0 - report_size range and it 
may not have the gtt_offset added to it, so I guess the OA_TAKEN may 
result in a bad value, but I am not able to visualize what the specific 
issue is. Can you please provide an example with numbers?


Also, slightly confused about the need for this patch. Are we dropping 
the 16 MB alignment for some reason?  If not, I suggest we can add this 
patch later with any series that drops it.


Thanks,
Umesh



/* NB: The head we observe here might effectively be a little
--
2.41.0



[Intel-gfx] [PATCH] i915/pmu: Move execlist stats initialization to execlist specific setup

2023-09-12 Thread Umesh Nerlige Ramappa
engine->stats is a union of execlist and guc stat objects. When execlist
specific fields are initialized, the initial state of guc stats is
affected. This results in bad busyness values when using GuC mode. Move
the execlist initialization from common code to execlist specific code.

Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to 
pmu")
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c| 1 -
 drivers/gpu/drm/i915/gt/intel_execlists_submission.c | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index dfb69fc977a0..84a75c95f3f7 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -558,7 +558,6 @@ static int intel_engine_setup(struct intel_gt *gt, enum 
intel_engine_id id,
DRIVER_CAPS(i915)->has_logical_contexts = true;
 
ewma__engine_latency_init(&engine->latency);
-   seqcount_init(&engine->stats.execlists.lock);
 
ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 4d05321dc5b5..e8f42ec6b1b4 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3548,6 +3548,8 @@ int intel_execlists_submission_setup(struct 
intel_engine_cs *engine)
logical_ring_default_vfuncs(engine);
logical_ring_default_irqs(engine);
 
+   seqcount_init(&engine->stats.execlists.lock);
+
if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE)
rcs_submission_override(engine);
 
-- 
2.38.1



[Intel-gfx] [PATCH] i915/guc: Run busyness worker only if gt is awake

2023-09-11 Thread Umesh Nerlige Ramappa
The worker is canceled in the __gt_park path, but we still see it
running sometimes during suspend.

Only update stats if gt is awake. If not, intel_guc_busyness_park would
have already updated the stats. Note that we do not requeue the worker
if gt is not awake since intel_guc_busyness_unpark would do that at some
point.

If the gt was parked longer than time taken for GT timestamp to roll
over, we ignore those rollovers since we don't care about tracking the
exact GT time. We only care about roll overs when the gt is active and
running workloads.

v2 (Daniele)
- Edit commit message and code comment
- Use runtime pm in the worker
- Put runtime pm after enabling the worker
- Use Link tag and add Fixes tag

Link: https://gitlab.freedesktop.org/drm/intel/-/issues/7077
Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to 
pmu")
Signed-off-by: Umesh Nerlige Ramappa 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 26 ---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index e250bedf90fb..d37b29a0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1461,6 +1461,24 @@ static void guc_timestamp_ping(struct work_struct *wrk)
unsigned long index;
int srcu, ret;
 
+   /*
+* The worker is canceled in the __gt_park path, but we still see it
+* running sometimes during suspend.
+*
+* Only update stats if gt is awake. If not, intel_guc_busyness_park
+* would have already updated the stats. Note that we do not requeue the
+* worker in this case since intel_guc_busyness_unpark would do that at
+* some point.
+*
+* If the gt was parked longer than time taken for GT timestamp to roll
+* over, we ignore those rollovers since we don't care about tracking
+* the exact GT time. We only care about roll overs when the gt is
+* active and running workloads.
+*/
+   wakeref = intel_runtime_pm_get_if_active(&gt->i915->runtime_pm);
+   if (!wakeref)
+   return;
+
/*
 * Synchronize with gt reset to make sure the worker does not
 * corrupt the engine/guc stats. NB: can't actually block waiting
@@ -1469,10 +1487,9 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 */
ret = intel_gt_reset_trylock(gt, &srcu);
if (ret)
-   return;
+   goto err_trylock;
 
-   with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
-   __update_guc_busyness_stats(guc);
+   __update_guc_busyness_stats(guc);
 
/* adjust context stats for overflow */
xa_for_each(&guc->context_lookup, index, ce)
@@ -1481,6 +1498,9 @@ static void guc_timestamp_ping(struct work_struct *wrk)
intel_gt_reset_unlock(gt, srcu);
 
guc_enable_busyness_worker(guc);
+
+err_trylock:
+   intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);
 }
 
 static int guc_action_enable_usage_stats(struct intel_guc *guc)
-- 
2.38.1



Re: [Intel-gfx] [PATCH] i915/guc: Run busyness worker only if gt is awake

2023-09-11 Thread Umesh Nerlige Ramappa

On Mon, Sep 11, 2023 at 08:44:39AM -0700, Daniele Ceraolo Spurio wrote:

  On 9/8/2023 10:16 PM, Umesh Nerlige Ramappa wrote:

The worker is canceled in the __gt_park path, but we still see it
running sometimes during suspend. This is likely because some code is
getting a gt wakeref in the __gt_park path.

  This possible root-cause doesn't seem plausible to me, because a gt
  wakeref would cause an unpark, so taking it within the park would probably
  cause a deadlock. Is it not more likely that the worker re-queued itself?


Will drop the likely part. The worker running during suspend is the 
issue, so keeping that part.




Only update stats if gt is awake. If not, intel_guc_busyness_park would
have already updated the stats. Note that we do not requeue the worker
if gt is not awake since intel_guc_busyness_unpark would do that at some
point.

If the gt was parked longer than time taken for GT timestamp to roll
over, we ignore those rollovers since we don't care about tracking the
exact GT time. We only care about roll overs when the gt is active and
running workloads.

Closes: [1]https://gitlab.freedesktop.org/drm/intel/-/issues/7077

  This needs a fixes tag. Also, I'm not 100% sure but I believe we prefer
  "Link" to "Closes".


I thought Link was mostly for the patchworks link. I can change this to 
Link.


Any idea if there is a document/link that explains which tag to use for 
what? I have been confused by this before.




Signed-off-by: Umesh Nerlige Ramappa [2]
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 27 ---
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index e250bedf90fb..df31d6047ce9 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1457,10 +1457,27 @@ static void guc_timestamp_ping(struct work_struct *wrk)
struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
struct intel_gt *gt = guc_to_gt(guc);
struct intel_context *ce;
-   intel_wakeref_t wakeref;
unsigned long index;
int srcu, ret;

+   /*
+* The worker is canceled in the __gt_park path, but we still see it
+* running sometimes during suspend. This is likely because some code
+* is getting a gt wakeref in the __gt_park path.

  Same comment from before about this explanation. I would just remove this
  part from the comment.

+*
+* Only update stats if gt is awake. If not, intel_guc_busyness_park
+* would have already updated the stats. Note that we do not requeue the
+* worker in this case since intel_guc_busyness_unpark would do that at
+* some point.
+*
+* If the gt was parked longer than time taken for GT timestamp to roll
+* over, we ignore those rollovers since we don't care about tracking
+* the exact GT time. We only care about roll overs when the gt is
+* active and running workloads.
+*/
+   if (!intel_gt_pm_get_if_awake(gt))
+   return;
+

  Do we need to drop the _sync from the busyness stats worker parking if we
  take the gt_pm wakeref in here, instead of an rpm one? because if the
  gt_pm_put below causes a park and the park waits on this worker to
  complete then we'll deadlock.


Hmm, my bad, that's not what I intended. It should be 
intel_runtime_pm_get_if_active(). I will change that.




/*
 * Synchronize with gt reset to make sure the worker does not
 * corrupt the engine/guc stats. NB: can't actually block waiting
@@ -1468,17 +1485,19 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 * this worker thread if started. So waiting would deadlock.
 */
ret = intel_gt_reset_trylock(gt, &srcu);
-   if (ret)
+   if (ret) {
+   intel_gt_pm_put(gt);
return;
+   }

-   with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
-   __update_guc_busyness_stats(guc);
+   __update_guc_busyness_stats(guc);

/* adjust context stats for overflow */
xa_for_each(&guc->context_lookup, index, ce)
guc_context_update_stats(ce);

intel_gt_reset_unlock(gt, srcu);
+   intel_gt_pm_put(gt);

  I think this needs to go after the queuing, because it could cause a park
  and if it does we don't want to re-queue the worker immediately after,
  while if we queue it before then the park will cancel it.
  Non-blocking style comment: with gt_pm_put the last thing in function, you
  can also transform that early return in a "goto put;" and have a single
  place for the gt_put.


Will change, although I am not sure if the runtime pm put may also cause 
a gt park. Assuming it can, I will make these changes.


Thanks
Ume

[Intel-gfx] [PATCH] i915/guc: Run busyness worker only if gt is awake

2023-09-08 Thread Umesh Nerlige Ramappa
The worker is canceled in the __gt_park path, but we still see it
running sometimes during suspend. This is likely because some code is
getting a gt wakeref in the __gt_park path.

Only update stats if gt is awake. If not, intel_guc_busyness_park would
have already updated the stats. Note that we do not requeue the worker
if gt is not awake since intel_guc_busyness_unpark would do that at some
point.

If the gt was parked longer than time taken for GT timestamp to roll
over, we ignore those rollovers since we don't care about tracking the
exact GT time. We only care about roll overs when the gt is active and
running workloads.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7077
Signed-off-by: Umesh Nerlige Ramappa 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 27 ---
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index e250bedf90fb..df31d6047ce9 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1457,10 +1457,27 @@ static void guc_timestamp_ping(struct work_struct *wrk)
struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
struct intel_gt *gt = guc_to_gt(guc);
struct intel_context *ce;
-   intel_wakeref_t wakeref;
unsigned long index;
int srcu, ret;
 
+   /*
+* The worker is canceled in the __gt_park path, but we still see it
+* running sometimes during suspend. This is likely because some code
+* is getting a gt wakeref in the __gt_park path.
+*
+* Only update stats if gt is awake. If not, intel_guc_busyness_park
+* would have already updated the stats. Note that we do not requeue the
+* worker in this case since intel_guc_busyness_unpark would do that at
+* some point.
+*
+* If the gt was parked longer than time taken for GT timestamp to roll
+* over, we ignore those rollovers since we don't care about tracking
+* the exact GT time. We only care about roll overs when the gt is
+* active and running workloads.
+*/
+   if (!intel_gt_pm_get_if_awake(gt))
+   return;
+
/*
 * Synchronize with gt reset to make sure the worker does not
 * corrupt the engine/guc stats. NB: can't actually block waiting
@@ -1468,17 +1485,19 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 * this worker thread if started. So waiting would deadlock.
 */
ret = intel_gt_reset_trylock(gt, &srcu);
-   if (ret)
+   if (ret) {
+   intel_gt_pm_put(gt);
return;
+   }
 
-   with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
-   __update_guc_busyness_stats(guc);
+   __update_guc_busyness_stats(guc);
 
/* adjust context stats for overflow */
xa_for_each(&guc->context_lookup, index, ce)
guc_context_update_stats(ce);
 
intel_gt_reset_unlock(gt, srcu);
+   intel_gt_pm_put(gt);
 
guc_enable_busyness_worker(guc);
 }
-- 
2.38.1



[Intel-gfx] [PATCH] drm/i915/perf: Determine context valid in OA reports

2023-08-02 Thread Umesh Nerlige Ramappa
When supporting OA for TGL, it was seen that the context valid bit in
the report ID was not defined, however revisiting the spec seems to have
this bit defined. The bit is used to determine if a context is valid on
a context switch and is essential to determine active and idle periods
for a context. Re-enable the context valid bit for gen12 platforms.

BSpec: 52196 (description of report_id)

v2: Include BSpec reference (Ashutosh)

Fixes: 00a7f0d7155c ("drm/i915/tgl: Add perf support on TGL")
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_perf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 04bc1f4a1115..59e1e21df271 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -482,8 +482,7 @@ static void oa_report_id_clear(struct i915_perf_stream 
*stream, u32 *report)
 static bool oa_report_ctx_invalid(struct i915_perf_stream *stream, void 
*report)
 {
return !(oa_report_id(stream, report) &
-  stream->perf->gen8_valid_ctx_bit) &&
-  GRAPHICS_VER(stream->perf->i915) <= 11;
+  stream->perf->gen8_valid_ctx_bit);
 }
 
 static u64 oa_timestamp(struct i915_perf_stream *stream, void *report)
@@ -5106,6 +5105,7 @@ static void i915_perf_init_info(struct drm_i915_private 
*i915)
perf->gen8_valid_ctx_bit = BIT(16);
break;
case 12:
+   perf->gen8_valid_ctx_bit = BIT(16);
/*
 * Calculate offset at runtime in oa_pin_context for gen12 and
 * cache the value in perf->ctx_oactxctrl_offset.
-- 
2.36.1



[Intel-gfx] [PATCH] drm/i915/perf: Consider OA buffer boundary when zeroing out reports

2023-06-16 Thread Umesh Nerlige Ramappa
For reports that are not powers of 2, reports at the end of the OA
buffer may get split across the buffer boundary. When zeroing out such
reports, take the split into consideration.

v2: Use OA_BUFFER_SIZE (Ashutosh)

Fixes: 09a36015d9a0 ("drm/i915/perf: Clear out entire reports after reading if 
not power of 2 size")
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index b5491a382bfd..66ab6e1d5c7b 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -867,8 +867,17 @@ static int gen8_append_oa_reports(struct i915_perf_stream 
*stream,
oa_report_id_clear(stream, report32);
oa_timestamp_clear(stream, report32);
} else {
+   u8 *oa_buf_end = stream->oa_buffer.vaddr +
+OA_BUFFER_SIZE;
+   u32 part = oa_buf_end - (u8 *)report32;
+
/* Zero out the entire report */
-   memset(report32, 0, report_size);
+   if (report_size <= part) {
+   memset(report32, 0, report_size);
+   } else {
+   memset(report32, 0, part);
+   memset(oa_buf_base, 0, report_size - part);
+   }
}
}
 
-- 
2.36.1



[Intel-gfx] [PATCH] drm/i915/perf: Consider OA buffer boundary when zeroing out reports

2023-06-15 Thread Umesh Nerlige Ramappa
For reports that are not powers of 2, reports at the end of the OA
buffer may get split across the buffer boundary. When zeroing out such
reports, take the split into consideration.

Fixes: 09a36015d9a0 ("drm/i915/perf: Clear out entire reports after reading if 
not power of 2 size")
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index b5491a382bfd..9a8e329c5b5e 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -867,8 +867,17 @@ static int gen8_append_oa_reports(struct i915_perf_stream 
*stream,
oa_report_id_clear(stream, report32);
oa_timestamp_clear(stream, report32);
} else {
+   u8 *oa_buf_end = stream->oa_buffer.vaddr +
+stream->oa_buffer.vma->size;
+   u32 part = (u32)((void *)oa_buf_end - (void *)report32);
+
/* Zero out the entire report */
-   memset(report32, 0, report_size);
+   if (report_size <= part) {
+   memset(report32, 0, report_size);
+   } else {
+   memset(report32, 0, part);
+   memset(oa_buf_base, 0, report_size - part);
+   }
}
}
 
-- 
2.36.1



[Intel-gfx] [PATCH] drm/i915/perf: Determine context valid in OA reports

2023-06-15 Thread Umesh Nerlige Ramappa
When supporting OA for TGL, it was seen that the context valid bit in
the report ID was not defined, however revisiting the spec seems to have
this bit defined. The bit is used to determine if a context is valid on
a context switch and is essential to determine active and idle periods
for a context. Re-enable the context valid bit for gen12 platforms.

Fixes: 00a7f0d7155c ("drm/i915/tgl: Add perf support on TGL")
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 0a111b281578..b5491a382bfd 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -482,8 +482,7 @@ static void oa_report_id_clear(struct i915_perf_stream 
*stream, u32 *report)
 static bool oa_report_ctx_invalid(struct i915_perf_stream *stream, void 
*report)
 {
return !(oa_report_id(stream, report) &
-  stream->perf->gen8_valid_ctx_bit) &&
-  GRAPHICS_VER(stream->perf->i915) <= 11;
+  stream->perf->gen8_valid_ctx_bit);
 }
 
 static u64 oa_timestamp(struct i915_perf_stream *stream, void *report)
@@ -5096,6 +5095,7 @@ static void i915_perf_init_info(struct drm_i915_private 
*i915)
perf->gen8_valid_ctx_bit = BIT(16);
break;
case 12:
+   perf->gen8_valid_ctx_bit = BIT(16);
/*
 * Calculate offset at runtime in oa_pin_context for gen12 and
 * cache the value in perf->ctx_oactxctrl_offset.
-- 
2.36.1



Re: [Intel-gfx] ✗ Fi.CI.IGT: failure for Avoid reading OA reports before they land (rev2)

2023-06-07 Thread Umesh Nerlige Ramappa

On Wed, Jun 07, 2023 at 05:40:28PM +0000, Patchwork wrote:

  Patch Details

Series:  Avoid reading OA reports before they land (rev2)
URL: [1]https://patchwork.freedesktop.org/series/118886/
State:   failure
Details: 
[2]https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118886v2/index.html

CI Bug Log - changes from CI_DRM_13238_full -> Patchwork_118886v2_full

Summary

  FAILURE

  Serious unknown changes coming with Patchwork_118886v2_full absolutely
  need to be
  verified manually.

  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_118886v2_full, please notify your bug team to
  allow them
  to document this new failure mode, which will reduce false positives in
  CI.

Participating hosts (7 -> 7)

  No changes in participating hosts

Possible new issues

  Here are the unknown changes that may have been introduced in
  Patchwork_118886v2_full:

 IGT changes

   Possible regressions

* igt@kms_vblank@pipe-b-accuracy-idle:

 * shard-glk: [3]PASS -> [4]FAIL


Unrelated to this patch since no OA use cases in the above test path.  


Umesh



Known issues



Re: [Intel-gfx] ✗ Fi.CI.BAT: failure for Avoid reading OA reports before they land

2023-06-07 Thread Umesh Nerlige Ramappa

On Mon, Jun 05, 2023 at 11:44:21PM +0000, Patchwork wrote:

  Patch Details

Series:  Avoid reading OA reports before they land
URL: [1]https://patchwork.freedesktop.org/series/118886/
State:   failure
Details: 
[2]https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118886v1/index.html

 CI Bug Log - changes from CI_DRM_13232 -> Patchwork_118886v1

Summary

  FAILURE

  Serious unknown changes coming with Patchwork_118886v1 absolutely need to
  be
  verified manually.

  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_118886v1, please notify your bug team to allow
  them
  to document this new failure mode, which will reduce false positives in
  CI.

  External URL:
  https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118886v1/index.html

Participating hosts (37 -> 37)

  Additional (1): bat-rpls-2
  Missing (1): fi-snb-2520m

Possible new issues

  Here are the unknown changes that may have been introduced in
  Patchwork_118886v1:

 IGT changes

   Possible regressions

* igt@i915_selftest@live@gt_timelines:

 * fi-apl-guc: [3]PASS -> [4]DMESG-WARN +2 similar issues


<3> [309.685038] i915 0000:00:02.0: [drm] *ERROR* Failed to probe lspcon

This warning is not related to OA or any use case from this patch.



   Warnings

* igt@kms_psr@sprite_plane_onoff:

 * bat-rplp-1: [5]SKIP ([6]i915#1072) -> [7]ABORT


+ John

These are not related to OA, but a known lockdep issue.

<4>[  229.036305] ==
<4>[  229.036320] WARNING: possible circular locking dependency detected
<4>[  229.036334] 6.4.0-rc5-Patchwork_118886v1-g450d228e3840+ #1 Not tainted
<4>[  229.036348] --
<4>[  229.036362] kworker/0:0H/8 is trying to acquire lock:
<4>[  229.036374] 888117b74f48 (&gt->reset.backoff_srcu){}-{0:0}, at: 
_intel_gt_reset_lock+0x0/0x330 [i915]
<4>[  229.036503] but task is already holding lock:
<4>[  229.036521] c90d3e60 
((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: 
process_one_work+0x1cc/0x510
<4>[  229.036548] which lock already depends on the new lock.

<4>[  229.036574] the existing dependency chain (in reverse order) is:
<4>[  229.036598] -> #3 
((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
<4>[  229.036624]lock_acquire+0xd8/0x2d0
<4>[  229.036636]__flush_work+0x74/0x530
<4>[  229.036646]__cancel_work_timer+0x14f/0x1f0
<4>[  229.036658]intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
<4>[  229.036799]intel_uc_reset_prepare+0x9c/0x120 [i915]
<4>[  229.036938]reset_prepare+0x21/0x60 [i915]
<4>[  229.037054]intel_gt_reset+0x1dd/0x470 [i915]
<4>[  229.037172]intel_gt_reset_global+0xfb/0x170 [i915]
<4>[  229.037285]intel_gt_handle_error+0x368/0x420 [i915]
<4>[  229.037401]intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
<4>[  229.037509]i915_wedged_set+0x29/0x40 [i915]
<4>[  229.037600]simple_attr_write_xsigned.constprop.0+0xb4/0x110
<4>[  229.037616]full_proxy_write+0x52/0x80
<4>[  229.037627]vfs_write+0xc5/0x4f0
<4>[  229.037637]ksys_write+0x64/0xe0
<4>[  229.037646]do_syscall_64+0x3c/0x90
<4>[  229.037658]entry_SYSCALL_64_after_hwframe+0x72/0xdc
<4>[  229.037672] -> #2 (&gt->reset.mutex){+.+.}-{3:3}:
<4>[  229.037694]lock_acquire+0xd8/0x2d0
<4>[  229.037704]i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
<4>[  229.037835]intel_gt_init_reset+0x65/0x80 [i915]
<4>[  229.037948]intel_gt_common_init_early+0xe1/0x170 [i915]
<4>[  229.038055]intel_root_gt_init_early+0x48/0x60 [i915]
<4>[  229.038158]i915_driver_probe+0x243/0xcd0 [i915]
<4>[  229.038247]i915_pci_probe+0xdc/0x210 [i915]
<4>[  229.038335]pci_device_probe+0x95/0x120
<4>[  229.038347]really_probe+0x164/0x3c0
<4>[  229.038358]__driver_probe_device+0x73/0x160
<4>[  229.038371]driver_probe_device+0x19/0xa0
<4>[  229.038384]__driver_attach+0xb6/0x180
<4>[  229.038395]bus_for_each_dev+0x77/0xd0
<4>[  229.038405]bus_add_driver+0x114/0x210
<4>[  229.038415]driver_register+0x5b/0x110
<4>[  229.038425]0xa00fd033
<4>[  229.038439]do_one_initcall+0x57/0x270
<4>[  229.038450]do_init_module+0x5f/0x220
<4>[  229.038461]load_module+0x1ca4/0x1f00
<4>[  229.038472]__do_sys_finit_module+0xb4/0x130
<4>[  229.038484]do_syscall_64+0x3c/0x90
<4>[  229.038495]entry_SYSCALL_64_after_hwframe+0x72/0xdc
<4>[  229.038508] -> #1 (fs_reclaim){+.+.}-{0:0}:
<4>[  229.038528]lock_acquire+0xd8/0x2d0
<4>[  229.038538]fs_reclaim_acquire+0xac/0xe0
<4>[  229.038550]__kmem_cache_alloc_node+0x30/0x1b0
<4>[  229.038563]kmalloc_trace+0x24/0xb0
<4>[  229.039296]kernfs_fop_open+0xc0/0x3d0
<4>[  229.040028]do_d

[Intel-gfx] [PATCH v4 0/2] Avoid reading OA reports before they land

2023-06-05 Thread Umesh Nerlige Ramappa
Fix OA issue seen on DG2 where parts of OA reports are zeroed out or
have stale values. This was due to the fact that rewind logic was not
being run when the tail pointer was aged. The series drops the complex
aging/aged logic and just checks the reports for validity.

rev1 - https://patchwork.freedesktop.org/series/118054/
v2: Drop aging logic completely
v3: Remove unnecessary renames and squash patches
v4: Indentation fixes

Signed-off-by: Umesh Nerlige Ramappa 

Umesh Nerlige Ramappa (2):
  i915/perf: Drop the aging_tail logic in perf OA
  i915/perf: Do not add ggtt offset to hw_tail

 drivers/gpu/drm/i915/i915_perf.c   | 92 ++
 drivers/gpu/drm/i915/i915_perf_types.h | 12 
 2 files changed, 36 insertions(+), 68 deletions(-)

-- 
2.36.1



[Intel-gfx] [PATCH v4 1/2] i915/perf: Drop the aging_tail logic in perf OA

2023-06-05 Thread Umesh Nerlige Ramappa
On DG2, capturing OA reports while running heavy render workloads
sometimes results in invalid OA reports where 64-byte chunks inside
reports have stale values. Under memory pressure, high OA sampling rates
(13.3 us) and heavy render workload, occasionally, the OA HW TAIL
pointer does not progress as fast as the sampling rate. When these
glitches occur, the TAIL pointer takes approx. 200us to progress.  While
this is expected behavior from the HW perspective, invalid reports are
not expected.

In oa_buffer_check_unlocked(), when we execute the if condition, we are
updating the oa_buffer.tail to the aging tail and then setting pollin
based on this tail value, however, we do not have a chance to rewind and
validate the reports prior to setting pollin. The validation happens
in a subsequent call to oa_buffer_check_unlocked(). If a read occurs
before this validation, then we end up reading reports up until this
oa_buffer.tail value which includes invalid reports. Though found on
DG2, this affects all platforms.

The aging tail logic is no longer necessary since we are explicitly
checking for landed reports.

Start by dropping the aging tail logic.

v2:
- Drop extra blank line
- Add reason to drop aging logic (Ashutosh)
- Add bug links (Ashutosh)
- rename aged_tail to read_tail
- Squash patches 3 and 1

v3: (Ashutosh)
- Remove extra spaces
- Remove gtt_offset from the pollin calculation
- s/Bug:/Link/ in commit message (checkpatch)

Link: https://gitlab.freedesktop.org/drm/intel/-/issues/7484
Link: https://gitlab.freedesktop.org/drm/intel/-/issues/7757
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_perf.c   | 95 +++---
 drivers/gpu/drm/i915/i915_perf_types.h | 12 
 2 files changed, 38 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 58284156428d..a8d43bf1f6d5 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -531,8 +531,7 @@ static void oa_context_id_squash(struct i915_perf_stream 
*stream, u32 *report)
  * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
  *
  * Besides returning true when there is data available to read() this function
- * also updates the tail, aging_tail and aging_timestamp in the oa_buffer
- * object.
+ * also updates the tail in the oa_buffer object.
  *
  * Note: It's safe to read OA config state here unlocked, assuming that this is
  * only called while the stream is enabled, while the global OA configuration
@@ -544,10 +543,10 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
 {
u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
int report_size = stream->oa_buffer.format->size;
+   u32 head, tail, read_tail;
unsigned long flags;
bool pollin;
u32 hw_tail;
-   u64 now;
u32 partial_report_size;
 
/* We have to consider the (unlikely) possibility that read() errors
@@ -568,62 +567,47 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
/* Subtract partial amount off the tail */
hw_tail = gtt_offset + OA_TAKEN(hw_tail, partial_report_size);
 
-   now = ktime_get_mono_fast_ns();
-
-   if (hw_tail == stream->oa_buffer.aging_tail &&
-   (now - stream->oa_buffer.aging_timestamp) > OA_TAIL_MARGIN_NSEC) {
-   /* If the HW tail hasn't move since the last check and the HW
-* tail has been aging for long enough, declare it the new
-* tail.
-*/
-   stream->oa_buffer.tail = stream->oa_buffer.aging_tail;
-   } else {
-   u32 head, tail, aged_tail;
-
-   /* NB: The head we observe here might effectively be a little
-* out of date. If a read() is in progress, the head could be
-* anywhere between this head and stream->oa_buffer.tail.
-*/
-   head = stream->oa_buffer.head - gtt_offset;
-   aged_tail = stream->oa_buffer.tail - gtt_offset;
-
-   hw_tail -= gtt_offset;
-   tail = hw_tail;
-
-   /* Walk the stream backward until we find a report with report
-* id and timestmap not at 0. Since the circular buffer pointers
-* progress by increments of 64 bytes and that reports can be up
-* to 256 bytes long, we can't tell whether a report has fully
-* landed in memory before the report id and timestamp of the
-* following report have effectively landed.
-*
-* This is assuming that the writes of the OA unit land in
-* memory in the order they were written to.
-* If not : (╯°□°)╯︵ ┻━┻
-*/
-   while (OA_TAKEN(tail, aged_tail) >= r

[Intel-gfx] [PATCH v4 2/2] i915/perf: Do not add ggtt offset to hw_tail

2023-06-05 Thread Umesh Nerlige Ramappa
ggtt offset for hw_tail is not required for the calculations, so drop
it.

Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_perf.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index a8d43bf1f6d5..0a111b281578 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -565,7 +565,7 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
partial_report_size %= report_size;
 
/* Subtract partial amount off the tail */
-   hw_tail = gtt_offset + OA_TAKEN(hw_tail, partial_report_size);
+   hw_tail = OA_TAKEN(hw_tail, partial_report_size);
 
/* NB: The head we observe here might effectively be a little
 * out of date. If a read() is in progress, the head could be
@@ -574,7 +574,6 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
head = stream->oa_buffer.head - gtt_offset;
read_tail = stream->oa_buffer.tail - gtt_offset;
 
-   hw_tail -= gtt_offset;
tail = hw_tail;
 
/* Walk the stream backward until we find a report with report
-- 
2.36.1



[Intel-gfx] [PATCH v3 0/2] Avoid reading OA reports before they land

2023-06-02 Thread Umesh Nerlige Ramappa
Fix OA issue seen on DG2 where parts of OA reports are zeroed out or
have stale values. This was due to the fact that rewind logic was not
being run when the tail pointer was aged. The series drops the complex
aging/aged logic and just checks the reports for validity.

rev1 - https://patchwork.freedesktop.org/series/118054/

Signed-off-by: Umesh Nerlige Ramappa 

Umesh Nerlige Ramappa (2):
  i915/perf: Drop the aging_tail logic in perf OA
  i915/perf: Do not add ggtt offset to hw_tail

 drivers/gpu/drm/i915/i915_perf.c   | 76 ++
 drivers/gpu/drm/i915/i915_perf_types.h | 12 
 2 files changed, 28 insertions(+), 60 deletions(-)

-- 
2.36.1



[Intel-gfx] [PATCH v3 1/2] i915/perf: Drop the aging_tail logic in perf OA

2023-06-02 Thread Umesh Nerlige Ramappa
On DG2, capturing OA reports while running heavy render workloads
sometimes results in invalid OA reports where 64-byte chunks inside
reports have stale values. Under memory pressure, high OA sampling rates
(13.3 us) and heavy render workload, occasionally, the OA HW TAIL
pointer does not progress as fast as the sampling rate. When these
glitches occur, the TAIL pointer takes approx. 200us to progress.  While
this is expected behavior from the HW perspective, invalid reports are
not expected.

In oa_buffer_check_unlocked(), when we execute the if condition, we are
updating the oa_buffer.tail to the aging tail and then setting pollin
based on this tail value, however, we do not have a chance to rewind and
validate the reports prior to setting pollin. The validation happens
in a subsequent call to oa_buffer_check_unlocked(). If a read occurs
before this validation, then we end up reading reports up until this
oa_buffer.tail value which includes invalid reports. Though found on
DG2, this affects all platforms.

The aging tail logic is no longer necessary since we are explicitly
checking for landed reports.

Start by dropping the aging tail logic.

v2:
- Drop extra blank line
- Add reason to drop aging logic (Ashutosh)
- Add bug links (Ashutosh)
- rename aged_tail to read_tail
- Squash patches 3 and 1

Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/7484
Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/7757
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c   | 75 ++
 drivers/gpu/drm/i915/i915_perf_types.h | 12 -
 2 files changed, 28 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 58284156428d..9cb3d395046e 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -531,8 +531,7 @@ static void oa_context_id_squash(struct i915_perf_stream 
*stream, u32 *report)
  * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
  *
  * Besides returning true when there is data available to read() this function
- * also updates the tail, aging_tail and aging_timestamp in the oa_buffer
- * object.
+ * also updates the tail in the oa_buffer object.
  *
  * Note: It's safe to read OA config state here unlocked, assuming that this is
  * only called while the stream is enabled, while the global OA configuration
@@ -544,10 +543,10 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
 {
u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
int report_size = stream->oa_buffer.format->size;
+   u32 head, tail, read_tail;
unsigned long flags;
bool pollin;
u32 hw_tail;
-   u64 now;
u32 partial_report_size;
 
/* We have to consider the (unlikely) possibility that read() errors
@@ -568,27 +567,15 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
/* Subtract partial amount off the tail */
hw_tail = gtt_offset + OA_TAKEN(hw_tail, partial_report_size);
 
-   now = ktime_get_mono_fast_ns();
-
-   if (hw_tail == stream->oa_buffer.aging_tail &&
-   (now - stream->oa_buffer.aging_timestamp) > OA_TAIL_MARGIN_NSEC) {
-   /* If the HW tail hasn't move since the last check and the HW
-* tail has been aging for long enough, declare it the new
-* tail.
-*/
-   stream->oa_buffer.tail = stream->oa_buffer.aging_tail;
-   } else {
-   u32 head, tail, aged_tail;
-
-   /* NB: The head we observe here might effectively be a little
-* out of date. If a read() is in progress, the head could be
-* anywhere between this head and stream->oa_buffer.tail.
-*/
-   head = stream->oa_buffer.head - gtt_offset;
-   aged_tail = stream->oa_buffer.tail - gtt_offset;
+   /* NB: The head we observe here might effectively be a little
+* out of date. If a read() is in progress, the head could be
+* anywhere between this head and stream->oa_buffer.tail.
+*/
+   head = stream->oa_buffer.head - gtt_offset;
+   read_tail = stream->oa_buffer.tail - gtt_offset;
 
-   hw_tail -= gtt_offset;
-   tail = hw_tail;
+   hw_tail -= gtt_offset;
+   tail = hw_tail;
 
/* Walk the stream backward until we find a report with report
 * id and timestmap not at 0. Since the circular buffer pointers
@@ -596,31 +583,28 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
 * to 256 bytes long, we can't tell whether a report has fully
 * landed in memory before the report id and timestamp of the
 * following report have effectively landed.
-*
-

[Intel-gfx] [PATCH v3 2/2] i915/perf: Do not add ggtt offset to hw_tail

2023-06-02 Thread Umesh Nerlige Ramappa
ggtt offset for hw_tail is not required for the calculations, so drop
it.

Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_perf.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 9cb3d395046e..0a1f40d21163 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -565,7 +565,7 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
partial_report_size %= report_size;
 
/* Subtract partial amount off the tail */
-   hw_tail = gtt_offset + OA_TAKEN(hw_tail, partial_report_size);
+   hw_tail = OA_TAKEN(hw_tail, partial_report_size);
 
/* NB: The head we observe here might effectively be a little
 * out of date. If a read() is in progress, the head could be
@@ -574,7 +574,6 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
head = stream->oa_buffer.head - gtt_offset;
read_tail = stream->oa_buffer.tail - gtt_offset;
 
-   hw_tail -= gtt_offset;
tail = hw_tail;
 
/* Walk the stream backward until we find a report with report
-- 
2.36.1



Re: [Intel-gfx] [PATCH] drm/i915: sync I915_PMU_MAX_GTS to I915_MAX_GT

2023-06-01 Thread Umesh Nerlige Ramappa

On Thu, Jun 01, 2023 at 11:22:18AM -0700, Dixit, Ashutosh wrote:

On Wed, 31 May 2023 14:35:47 -0700, Matt Atwood wrote:




Hi Matt,


Set I915_PMU_MAX_GTS to the value of I915_MAX_GT; there's no reason for these
values to be different.

Cc: Tvrtko Ursulin 
Cc: Umesh Nerlige Ramappa 
Cc: Ashutosh Dixit 


I don't believe the mailer actually Cc'd us. I just saw this and am Cc'ing
the people who authored/reviewed the previous series now.


Signed-off-by: Matt Atwood 
---
 drivers/gpu/drm/i915/i915_pmu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index 33d80fbaab8b..aa929d8c224a 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -38,7 +38,7 @@ enum {
__I915_NUM_PMU_SAMPLERS
 };

-#define I915_PMU_MAX_GTS 2
+#define I915_PMU_MAX_GTS 4


This was discussed during the previous review and it was decided to keep
the two values (I915_PMU_MAX_GTS and I915_MAX_GT) different. There are
currently no platforms and there will be no i915 supported platforms with
MAX_GT 4. So I prefer to leave the values as they currently are. Unless
Umesh or Tvrtko agrees to this patch.


I would leave it as 2 since we specifically changed it to 2 (was 4 
earlier) during review of the PMU multi tile support patches.


Thanks,
Umesh



Thanks.
--
Ashutosh



 /*
  * How many different events we track in the global PMU mask.
--
2.40.0



Re: [Intel-gfx] [PATCH 3/3] i915/perf: Drop the aged_tail from rewind logic

2023-06-01 Thread Umesh Nerlige Ramappa

On Wed, May 31, 2023 at 09:13:02PM -0700, Dixit, Ashutosh wrote:

On Wed, 31 May 2023 16:56:34 -0700, Umesh Nerlige Ramappa wrote:




Hi Umesh,


Instead of aged_tail use an iterator that starts from the hw_tail and
goes backward until the oa_buffer.tail looking for valid reports.


Hmm I don't think this description is correct. All this patch is doing is
the following:

a. s/aged_tail/tail/
b. s/tail/iter/

So basically I don't think we need this patch. All we want to do here is
change the variable name aged_tail to something else (to completely remove
the concept of aging from the OA code) but other changes such as name
change to iter etc. are unnecessary.

So I would just keep the patch simple and change the name aged_tail to
advertized_tail or exported_tail or read_tail, because basically
stream->oa_buffer.tail is the tail which the writer updates (or advertizes
or exports) for the reader.

So we only should rename aged_tail here, the other changes are not needed.

We could even squash this change into Patch 1 or Patch 2, since it is
really a trivial variable rename.


The whole point was just readability. head/tail point to what the user 
consumes. hw_tail points to the actual hw register value and iter is 
just loop iterator.


Since the intent of the series is to just get rid of aging/aged logic, I 
can just s/aged_tail/read_tail/ and squash it with 1 since it belongs 
more to 1 than 2, although I still like my current patch (maybe 
with additional description in the commit message to clarify that the 
patch is just renames for readability).


Will post next rev with the simple rename and squash.

Thanks,
Umesh



Thanks.
--
Ashutosh




Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index beb1269422ca..39f5ab1911c8 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -543,7 +543,7 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
 {
u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
int report_size = stream->oa_buffer.format->size;
-   u32 head, tail, aged_tail;
+   u32 head, tail, iter;
unsigned long flags;
bool pollin;
u32 hw_tail;
@@ -567,15 +567,14 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
/* Subtract partial amount off the tail */
hw_tail = OA_TAKEN(hw_tail, partial_report_size);

-
/* NB: The head we observe here might effectively be a little
 * out of date. If a read() is in progress, the head could be
 * anywhere between this head and stream->oa_buffer.tail.
 */
head = stream->oa_buffer.head - gtt_offset;
-   aged_tail = stream->oa_buffer.tail - gtt_offset;
+   tail = stream->oa_buffer.tail - gtt_offset;

-   tail = hw_tail;
+   iter = hw_tail;

/* Walk the stream backward until we find a report with report
 * id and timestmap not at 0. Since the circular buffer pointers
@@ -588,23 +587,23 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
 * memory in the order they were written to.
 * If not : (╯°□°)╯︵ ┻━┻
 */
-   while (OA_TAKEN(tail, aged_tail) >= report_size) {
-   void *report = stream->oa_buffer.vaddr + tail;
+   while (OA_TAKEN(iter, tail) >= report_size) {
+   void *report = stream->oa_buffer.vaddr + iter;

if (oa_report_id(stream, report) ||
oa_timestamp(stream, report))
break;

-   tail = (tail - report_size) & (OA_BUFFER_SIZE - 1);
+   iter = (iter - report_size) & (OA_BUFFER_SIZE - 1);
}

-   if (OA_TAKEN(hw_tail, tail) > report_size &&
+   if (OA_TAKEN(hw_tail, iter) > report_size &&
__ratelimit(&stream->perf->tail_pointer_race))
drm_notice(&stream->uncore->i915->drm,
   "unlanded report(s) head=0x%x tail=0x%x 
hw_tail=0x%x\n",
 head, tail, hw_tail);

-   stream->oa_buffer.tail = gtt_offset + tail;
+   stream->oa_buffer.tail = gtt_offset + iter;

pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
  stream->oa_buffer.head - gtt_offset) >= report_size;
--
2.36.1



[Intel-gfx] [PATCH 1/3] i915/perf: Drop the aging_tail logic in perf OA

2023-05-31 Thread Umesh Nerlige Ramappa
On DG2, capturing OA reports while running heavy render workloads
sometimes results in invalid OA reports where 64-byte chunks inside
reports have stale values. Under memory pressure, high OA sampling rates
(13.3 us) and heavy render workload, occasionally, the OA HW TAIL
pointer does not progress as fast as the sampling rate. When these
glitches occur, the TAIL pointer takes approx. 200us to progress.  While
this is expected behavior from the HW perspective, invalid reports are
not expected.

In oa_buffer_check_unlocked(), when we execute the if condition, we are
updating the oa_buffer.tail to the aging tail and then setting pollin
based on this tail value, however, we do not have a chance to rewind and
validate the reports prior to setting pollin. The validation happens
in a subsequent call to oa_buffer_check_unlocked(). If a read occurs
before this validation, then we end up reading reports up until this
oa_buffer.tail value which includes invalid reports. Though found on
DG2, this affects all platforms.

Start by dropping the aging tail logic.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c   | 74 ++
 drivers/gpu/drm/i915/i915_perf_types.h | 12 -
 2 files changed, 28 insertions(+), 58 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 58284156428d..29124dcba8e2 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -531,8 +531,7 @@ static void oa_context_id_squash(struct i915_perf_stream 
*stream, u32 *report)
  * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
  *
  * Besides returning true when there is data available to read() this function
- * also updates the tail, aging_tail and aging_timestamp in the oa_buffer
- * object.
+ * also updates the tail in the oa_buffer object.
  *
  * Note: It's safe to read OA config state here unlocked, assuming that this is
  * only called while the stream is enabled, while the global OA configuration
@@ -544,10 +543,10 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
 {
u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
int report_size = stream->oa_buffer.format->size;
+   u32 head, tail, aged_tail;
unsigned long flags;
bool pollin;
u32 hw_tail;
-   u64 now;
u32 partial_report_size;
 
/* We have to consider the (unlikely) possibility that read() errors
@@ -568,27 +567,16 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
/* Subtract partial amount off the tail */
hw_tail = gtt_offset + OA_TAKEN(hw_tail, partial_report_size);
 
-   now = ktime_get_mono_fast_ns();
 
-   if (hw_tail == stream->oa_buffer.aging_tail &&
-   (now - stream->oa_buffer.aging_timestamp) > OA_TAIL_MARGIN_NSEC) {
-   /* If the HW tail hasn't move since the last check and the HW
-* tail has been aging for long enough, declare it the new
-* tail.
-*/
-   stream->oa_buffer.tail = stream->oa_buffer.aging_tail;
-   } else {
-   u32 head, tail, aged_tail;
-
-   /* NB: The head we observe here might effectively be a little
-* out of date. If a read() is in progress, the head could be
-* anywhere between this head and stream->oa_buffer.tail.
-*/
-   head = stream->oa_buffer.head - gtt_offset;
-   aged_tail = stream->oa_buffer.tail - gtt_offset;
+   /* NB: The head we observe here might effectively be a little
+* out of date. If a read() is in progress, the head could be
+* anywhere between this head and stream->oa_buffer.tail.
+*/
+   head = stream->oa_buffer.head - gtt_offset;
+   aged_tail = stream->oa_buffer.tail - gtt_offset;
 
-   hw_tail -= gtt_offset;
-   tail = hw_tail;
+   hw_tail -= gtt_offset;
+   tail = hw_tail;
 
/* Walk the stream backward until we find a report with report
 * id and timestmap not at 0. Since the circular buffer pointers
@@ -596,31 +584,28 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
 * to 256 bytes long, we can't tell whether a report has fully
 * landed in memory before the report id and timestamp of the
 * following report have effectively landed.
-*
-* This is assuming that the writes of the OA unit land in
-* memory in the order they were written to.
-* If not : (╯°□°)╯︵ ┻━┻
-*/
-   while (OA_TAKEN(tail, aged_tail) >= report_size) {
-   void *report = stream->oa_buffer.vaddr + tail;
+*
+* This is assuming that the

[Intel-gfx] [PATCH 3/3] i915/perf: Drop the aged_tail from rewind logic

2023-05-31 Thread Umesh Nerlige Ramappa
Instead of aged_tail use an iterator that starts from the hw_tail and
goes backward until the oa_buffer.tail looking for valid reports.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index beb1269422ca..39f5ab1911c8 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -543,7 +543,7 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
 {
u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
int report_size = stream->oa_buffer.format->size;
-   u32 head, tail, aged_tail;
+   u32 head, tail, iter;
unsigned long flags;
bool pollin;
u32 hw_tail;
@@ -567,15 +567,14 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
/* Subtract partial amount off the tail */
hw_tail = OA_TAKEN(hw_tail, partial_report_size);
 
-
/* NB: The head we observe here might effectively be a little
 * out of date. If a read() is in progress, the head could be
 * anywhere between this head and stream->oa_buffer.tail.
 */
head = stream->oa_buffer.head - gtt_offset;
-   aged_tail = stream->oa_buffer.tail - gtt_offset;
+   tail = stream->oa_buffer.tail - gtt_offset;
 
-   tail = hw_tail;
+   iter = hw_tail;
 
/* Walk the stream backward until we find a report with report
 * id and timestmap not at 0. Since the circular buffer pointers
@@ -588,23 +587,23 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
 * memory in the order they were written to.
 * If not : (╯°□°)╯︵ ┻━┻
 */
-   while (OA_TAKEN(tail, aged_tail) >= report_size) {
-   void *report = stream->oa_buffer.vaddr + tail;
+   while (OA_TAKEN(iter, tail) >= report_size) {
+   void *report = stream->oa_buffer.vaddr + iter;
 
if (oa_report_id(stream, report) ||
oa_timestamp(stream, report))
break;
 
-   tail = (tail - report_size) & (OA_BUFFER_SIZE - 1);
+   iter = (iter - report_size) & (OA_BUFFER_SIZE - 1);
}
 
-   if (OA_TAKEN(hw_tail, tail) > report_size &&
+   if (OA_TAKEN(hw_tail, iter) > report_size &&
__ratelimit(&stream->perf->tail_pointer_race))
drm_notice(&stream->uncore->i915->drm,
   "unlanded report(s) head=0x%x tail=0x%x 
hw_tail=0x%x\n",
 head, tail, hw_tail);
 
-   stream->oa_buffer.tail = gtt_offset + tail;
+   stream->oa_buffer.tail = gtt_offset + iter;
 
pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
  stream->oa_buffer.head - gtt_offset) >= report_size;
-- 
2.36.1



[Intel-gfx] [PATCH 2/3] i915/perf: Do not add ggtt offset to hw_tail

2023-05-31 Thread Umesh Nerlige Ramappa
ggtt offset for hw_tail is not required for the calculations, so drop
it.

Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 29124dcba8e2..beb1269422ca 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -565,7 +565,7 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
partial_report_size %= report_size;
 
/* Subtract partial amount off the tail */
-   hw_tail = gtt_offset + OA_TAKEN(hw_tail, partial_report_size);
+   hw_tail = OA_TAKEN(hw_tail, partial_report_size);
 
 
/* NB: The head we observe here might effectively be a little
@@ -575,7 +575,6 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
head = stream->oa_buffer.head - gtt_offset;
aged_tail = stream->oa_buffer.tail - gtt_offset;
 
-   hw_tail -= gtt_offset;
tail = hw_tail;
 
/* Walk the stream backward until we find a report with report
-- 
2.36.1



[Intel-gfx] [PATCH 0/3] Avoid reading OA reports before they land

2023-05-31 Thread Umesh Nerlige Ramappa
Fix OA issue seen on DG2 where parts of OA reports are zeroed out or
have stale values. This was due to the fact that rewind logic was not
being run when the tail pointer was aged. The series drops the complex
aging/aged logic and just checks the reports for validity.

rev1 - https://patchwork.freedesktop.org/series/118054/

Signed-off-by: Umesh Nerlige Ramappa 

Umesh Nerlige Ramappa (3):
  i915/perf: Drop the aging_tail logic in perf OA
  i915/perf: Do not add ggtt offset to hw_tail
  i915/perf: Drop the aged_tail from rewind logic

 drivers/gpu/drm/i915/i915_perf.c   | 76 ++
 drivers/gpu/drm/i915/i915_perf_types.h | 12 
 2 files changed, 28 insertions(+), 60 deletions(-)

-- 
2.36.1



Re: [Intel-gfx] [PATCH i-g-t] intel_gpu_top: Fix frequency and rc6 counters

2023-05-24 Thread Umesh Nerlige Ramappa

On Tue, May 23, 2023 at 04:24:07PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Need to reset aggregated counters before adding to them otherwise numbers
will grow endlessly.

Signed-off-by: Tvrtko Ursulin 
Fixes: 3dadeff69d4a ("intel_gpu_top: Switch pmu_counter to use aggregated 
values")
Cc: Umesh Nerlige Ramappa 
Cc: Ashutosh Dixit 
---
tools/intel_gpu_top.c | 4 
1 file changed, 4 insertions(+)

diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 4e49367a70c7..a89f13d46f11 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -710,6 +710,10 @@ static void pmu_sample(struct engines *engines)
engines->ts.prev = engines->ts.cur;
engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);

+   engines->freq_req.val.cur = engines->freq_req.val.prev = 0;
+   engines->freq_act.val.cur = engines->freq_act.val.prev = 0;
+   engines->rc6.val.cur = engines->rc6.val.prev = 0;
+


lgtm,

Reviewed-by: Umesh Nerlige Ramappa 

Umesh

for (i = 0; i < engines->num_gts; i++) {
update_sample(&engines->freq_req_gt[i], val);
engines->freq_req.val.cur += engines->freq_req_gt[i].val.cur;
--
2.39.2



Re: [Intel-gfx] [PATCH] i915/perf: Avoid reading OA reports before they land

2023-05-23 Thread Umesh Nerlige Ramappa

On Tue, May 23, 2023 at 11:20:54AM -0700, Dixit, Ashutosh wrote:

On Mon, 22 May 2023 15:08:42 -0700, Umesh Nerlige Ramappa wrote:




Hi Umesh,


On Mon, May 22, 2023 at 01:20:12PM -0700, Dixit, Ashutosh wrote:
> On Fri, 19 May 2023 15:56:42 -0700, Umesh Nerlige Ramappa wrote:
>>
>> On DG2, capturing OA reports while running heavy render workloads
>> sometimes results in invalid OA reports where 64-byte chunks inside
>> reports have stale values. Under memory pressure, high OA sampling rates
>> (13.3 us) and heavy render workload, occassionally, the OA HW TAIL
>> pointer does not progress as fast as the sampling rate. When these
>> glitches occur, the TAIL pointer takes approx. 200us to progress.  While
>> this is expected behavior from the HW perspective, invalid reports are
>> not expected.
>>
>> In oa_buffer_check_unlocked(), when we execute the if condition, we are
>> updating the oa_buffer.tail to the aging tail and then setting pollin
>> based on this tail value, however, we do not have a chance to rewind and
>> validate the reports prior to setting pollin. The validation happens
>> in a subsequent call to oa_buffer_check_unlocked(). If a read occurs
>> before this validation, then we end up reading reports up until this
>> oa_buffer.tail value which includes invalid reports. Though found on
>> DG2, this affects all platforms.
>>
>> Set the pollin only in the else condition in oa_buffer_check_unlocked.
>>
>> Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/7484
>> Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/7757
>> Signed-off-by: Umesh Nerlige Ramappa 
>> ---
>>  drivers/gpu/drm/i915/i915_perf.c | 8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c 
b/drivers/gpu/drm/i915/i915_perf.c
>> index 19d5652300ee..61536e3c4ac9 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -545,7 +545,7 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
>>u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
>>int report_size = stream->oa_buffer.format->size;
>>unsigned long flags;
>> -  bool pollin;
>> +  bool pollin = false;
>>u32 hw_tail;
>>u64 now;
>>u32 partial_report_size;
>> @@ -620,10 +620,10 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
>>stream->oa_buffer.tail = gtt_offset + tail;
>>stream->oa_buffer.aging_tail = gtt_offset + hw_tail;
>>stream->oa_buffer.aging_timestamp = now;
>> -  }
>>
>> -  pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
>> -stream->oa_buffer.head - gtt_offset) >= report_size;
>> +  pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
>> +stream->oa_buffer.head - gtt_offset) >= 
report_size;
>> +  }
>
> The issue has been correctly identified above. But seems that the real
> cause for the issue is not that pollin statement above is misplaced but
> that updating the tail via aging is unreliable (at least with the present
> timeout as you mention above). Also, it is not clear why we have tail aging
> at all, since it seems we can detect when reports land (by checking
> report_id and timestamp). So rather than move the pollin into the else, we
> should just eliminate the if () part. And if we are eliminating the if ()
> we can just eliminate the concept of tail aging from the code (and
> comments) and rely solely on explicit detection of reports landing.


I missed this yesterday but the above patch is basically incorrect. We need
to return pollin true when we have a "non-zero distance between head and
tail", i.e. when there is data to be read. And we have violated this for
the if () part with this patch (because we are unconditionally returning
false from the if () even when there is data to be read). So there are only
two ways to solve this:


Yikes, didn't see that. Ideally if the tail did progress the first time 
we entered this function, then let's say pollin is true since we find 
some valid reports. If the tail hasn't moved when the function is 
entered second time, then we return false (which is wrong) since there 
may still be data to be read.




a. Increase OA_TAIL_MARGIN_NSEC (the aging time)
b. Eliminate tail aging (i.e. eliminate the if ())

We cannot move the pollin statement into the else.

The preferred way is b. since it makes the overall code consistent
again. And it seems easy enough to do.


I thought s

Re: [Intel-gfx] [PATCH] drm/i915/perf: Clear out entire reports after reading if not power of 2 size

2023-05-23 Thread Umesh Nerlige Ramappa

On Mon, May 22, 2023 at 02:50:51PM -0700, Dixit, Ashutosh wrote:

On Mon, 22 May 2023 14:34:18 -0700, Umesh Nerlige Ramappa wrote:


On Mon, May 22, 2023 at 01:17:49PM -0700, Ashutosh Dixit wrote:
> Clearing out report id and timestamp as means to detect unlanded reports
> only works if report size is power of 2. That is, only when report size is
> a sub-multiple of the OA buffer size can we be certain that reports will
> land at the same place each time in the OA buffer (after rewind). If report
> size is not a power of 2, we need to zero out the entire report to be able
> to detect unlanded reports reliably.
>
> Cc: Umesh Nerlige Ramappa 
> Signed-off-by: Ashutosh Dixit 
> ---
> drivers/gpu/drm/i915/i915_perf.c | 17 +++--
> 1 file changed, 11 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_perf.c 
b/drivers/gpu/drm/i915/i915_perf.c
> index 19d5652300eeb..58284156428dc 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -877,12 +877,17 @@ static int gen8_append_oa_reports(struct 
i915_perf_stream *stream,
>stream->oa_buffer.last_ctx_id = ctx_id;
>}
>
> -  /*
> -   * Clear out the report id and timestamp as a means to detect 
unlanded
> -   * reports.
> -   */
> -  oa_report_id_clear(stream, report32);
> -  oa_timestamp_clear(stream, report32);
> +  if (is_power_of_2(report_size)) {
> +  /*
> +   * Clear out the report id and timestamp as a means
> +   * to detect unlanded reports.
> +   */
> +  oa_report_id_clear(stream, report32);
> +  oa_timestamp_clear(stream, report32);
> +  } else {
> +  /* Zero out the entire report */
> +  memset(report32, 0, report_size);

Indeed, this was a bug. For a minute, I started wondering if this is the
issue I am running into with the other patch posted for DG2, but then I see
the issue within the first fill of the OA buffer where chunks of the
reports are zeroed out, so this is a new issue.


Yes I saw this while reviewing your patch. And also I thought your issue
was happening on DG2 with power of 2 report size, only on MTL OAM we
introduce non power of 2 report size.


lgtm,

Reviewed-by: Umesh Nerlige Ramappa 


Maybe this should include Fixes: tag pointing to the patch that 
introduced the OAM non-power-of-2 format.


Umesh



Thanks.
--
Ashutosh



> +  }
>}
>
>if (start_offset != *offset) {
> --
> 2.38.0
>


Re: [Intel-gfx] [PATCH] i915/perf: Avoid reading OA reports before they land

2023-05-22 Thread Umesh Nerlige Ramappa

On Mon, May 22, 2023 at 01:20:12PM -0700, Dixit, Ashutosh wrote:

On Fri, 19 May 2023 15:56:42 -0700, Umesh Nerlige Ramappa wrote:




Hi Umesh,


On DG2, capturing OA reports while running heavy render workloads
sometimes results in invalid OA reports where 64-byte chunks inside
reports have stale values. Under memory pressure, high OA sampling rates
(13.3 us) and heavy render workload, occasionally, the OA HW TAIL
pointer does not progress as fast as the sampling rate. When these
glitches occur, the TAIL pointer takes approx. 200us to progress.  While
this is expected behavior from the HW perspective, invalid reports are
not expected.

In oa_buffer_check_unlocked(), when we execute the if condition, we are
updating the oa_buffer.tail to the aging tail and then setting pollin
based on this tail value, however, we do not have a chance to rewind and
validate the reports prior to setting pollin. The validation happens
in a subsequent call to oa_buffer_check_unlocked(). If a read occurs
before this validation, then we end up reading reports up until this
oa_buffer.tail value which includes invalid reports. Though found on
DG2, this affects all platforms.

Set the pollin only in the else condition in oa_buffer_check_unlocked.

Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/7484
Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/7757
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 19d5652300ee..61536e3c4ac9 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -545,7 +545,7 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
int report_size = stream->oa_buffer.format->size;
unsigned long flags;
-   bool pollin;
+   bool pollin = false;
u32 hw_tail;
u64 now;
u32 partial_report_size;
@@ -620,10 +620,10 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
stream->oa_buffer.tail = gtt_offset + tail;
stream->oa_buffer.aging_tail = gtt_offset + hw_tail;
stream->oa_buffer.aging_timestamp = now;
-   }

-   pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
- stream->oa_buffer.head - gtt_offset) >= report_size;
+   pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
+ stream->oa_buffer.head - gtt_offset) >= 
report_size;
+   }


The issue has been correctly identified above. But seems that the real
cause for the issue is not that pollin statement above is misplaced but
that updating the tail via aging is unreliable (at least with the present
timeout as you mention above). Also, it is not clear why we have tail aging
at all, since it seems we can detect when reports land (by checking
report_id and timestamp). So rather than move the pollin into the else, we
should just eliminate the if () part. And if we are eliminating the if ()
we can just eliminate the concept of tail aging from the code (and
comments) and rely solely on explicit detection of reports landing.


I thought so too, it would be much simpler code. Looks like Lionel 
agrees with removing this code as well. 


I do have a couple concerns though.

- In the blocking case, i915_perf_read() path waits on a queue with the
condition being oa_buffer_check_unlocked(). If sampling rate is high, 
oa_buffer_check_unlocked will almost always return true. If we remove 
the if block, we may run the rewind logic too often to detect reports 
that landed. The aging logic is just giving a 100us buffer to avoid 
repeated checks here if tail hasn't moved. (although tbh, 100 us is very 
small).


- The other concern - by dropping all this aging logic, are we changing 
  underlying behavior?


- Is there a significant ROI on current patch vs. dropping all the aging 
  logic?




Separately, there seems to be another related bug in the code, I have sent
a patch for that here:

https://patchwork.freedesktop.org/series/118151/


That's a valid new issue and different from this one, but related to the 
rewind logic. lgtm.


Thanks,
Umesh


Thanks.
--
Ashutosh


Re: [Intel-gfx] [PATCH] drm/i915/perf: Clear out entire reports after reading if not power of 2 size

2023-05-22 Thread Umesh Nerlige Ramappa

On Mon, May 22, 2023 at 01:17:49PM -0700, Ashutosh Dixit wrote:

Clearing out report id and timestamp as means to detect unlanded reports
only works if report size is power of 2. That is, only when report size is
a sub-multiple of the OA buffer size can we be certain that reports will
land at the same place each time in the OA buffer (after rewind). If report
size is not a power of 2, we need to zero out the entire report to be able
to detect unlanded reports reliably.

Cc: Umesh Nerlige Ramappa 
Signed-off-by: Ashutosh Dixit 
---
drivers/gpu/drm/i915/i915_perf.c | 17 +++--
1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 19d5652300eeb..58284156428dc 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -877,12 +877,17 @@ static int gen8_append_oa_reports(struct i915_perf_stream 
*stream,
stream->oa_buffer.last_ctx_id = ctx_id;
}

-   /*
-* Clear out the report id and timestamp as a means to detect 
unlanded
-* reports.
-*/
-   oa_report_id_clear(stream, report32);
-   oa_timestamp_clear(stream, report32);
+   if (is_power_of_2(report_size)) {
+   /*
+* Clear out the report id and timestamp as a means
+* to detect unlanded reports.
+*/
+   oa_report_id_clear(stream, report32);
+   oa_timestamp_clear(stream, report32);
+   } else {
+   /* Zero out the entire report */
+   memset(report32, 0, report_size);


Indeed, this was a bug. For a minute, I started wondering if this is the 
issue I am running into with the other patch posted for DG2, but then I 
see the issue within the first fill of the OA buffer where chunks of the 
reports are zeroed out, so this is a new issue.


lgtm,

Reviewed-by: Umesh Nerlige Ramappa 

Thanks,
Umesh



+   }
}

if (start_offset != *offset) {
--
2.38.0



[Intel-gfx] [PATCH] i915/perf: Avoid reading OA reports before they land

2023-05-19 Thread Umesh Nerlige Ramappa
On DG2, capturing OA reports while running heavy render workloads
sometimes results in invalid OA reports where 64-byte chunks inside
reports have stale values. Under memory pressure, high OA sampling rates
(13.3 us) and heavy render workload, occasionally, the OA HW TAIL
pointer does not progress as fast as the sampling rate. When these
glitches occur, the TAIL pointer takes approx. 200us to progress.  While
this is expected behavior from the HW perspective, invalid reports are
not expected.

In oa_buffer_check_unlocked(), when we execute the if condition, we are
updating the oa_buffer.tail to the aging tail and then setting pollin
based on this tail value, however, we do not have a chance to rewind and
validate the reports prior to setting pollin. The validation happens
in a subsequent call to oa_buffer_check_unlocked(). If a read occurs
before this validation, then we end up reading reports up until this
oa_buffer.tail value which includes invalid reports. Though found on
DG2, this affects all platforms.

Set the pollin only in the else condition in oa_buffer_check_unlocked.

Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/7484
Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/7757
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_perf.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 19d5652300ee..61536e3c4ac9 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -545,7 +545,7 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
int report_size = stream->oa_buffer.format->size;
unsigned long flags;
-   bool pollin;
+   bool pollin = false;
u32 hw_tail;
u64 now;
u32 partial_report_size;
@@ -620,10 +620,10 @@ static bool oa_buffer_check_unlocked(struct 
i915_perf_stream *stream)
stream->oa_buffer.tail = gtt_offset + tail;
stream->oa_buffer.aging_tail = gtt_offset + hw_tail;
stream->oa_buffer.aging_timestamp = now;
-   }
 
-   pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
- stream->oa_buffer.head - gtt_offset) >= report_size;
+   pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
+ stream->oa_buffer.head - gtt_offset) >= 
report_size;
+   }
 
spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
 
-- 
2.38.1



[Intel-gfx] [PATCH v7 5/7] drm/i915/pmu: Add reference counting to the sampling timer

2023-05-19 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

We do not want to have timers per tile and waste CPU cycles and energy via
multiple wake-up sources, for a relatively un-important task of PMU
sampling, so keeping a single timer works well. But we also do not want
the first GT which goes idle to turn off the timer.

Add some reference counting, via a mask of unparked GTs, to solve this.

v2: Drop the check for unparked in i915_sample (Ashutosh)
v3: Revert v2 (Tvrtko)

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 12 ++--
 drivers/gpu/drm/i915/i915_pmu.h |  4 
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 890693fdaf9e..ecb57a94143e 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -262,7 +262,9 @@ void i915_pmu_gt_parked(struct intel_gt *gt)
 * Signal sampling timer to stop if only engine events are enabled and
 * GPU went idle.
 */
-   pmu->timer_enabled = pmu_needs_timer(pmu, false);
+   pmu->unparked &= ~BIT(gt->info.id);
+   if (pmu->unparked == 0)
+   pmu->timer_enabled = pmu_needs_timer(pmu, false);
 
spin_unlock_irq(&pmu->lock);
 }
@@ -279,7 +281,10 @@ void i915_pmu_gt_unparked(struct intel_gt *gt)
/*
 * Re-enable sampling timer when GPU goes active.
 */
-   __i915_pmu_maybe_start_timer(pmu);
+   if (pmu->unparked == 0)
+   __i915_pmu_maybe_start_timer(pmu);
+
+   pmu->unparked |= BIT(gt->info.id);
 
spin_unlock_irq(&pmu->lock);
 }
@@ -449,6 +454,9 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
 */
 
for_each_gt(gt, i915, i) {
+   if (!(pmu->unparked & BIT(i)))
+   continue;
+
engines_sample(gt, period_ns);
 
if (i == 0) /* FIXME */
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index a686fd7ccedf..3a811266ac6a 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -76,6 +76,10 @@ struct i915_pmu {
 * @lock: Lock protecting enable mask and ref count handling.
 */
spinlock_t lock;
+   /**
+* @unparked: GT unparked mask.
+*/
+   unsigned int unparked;
/**
 * @timer: Timer for internal i915 PMU sampling.
 */
-- 
2.36.1



[Intel-gfx] [PATCH v7 4/7] drm/i915/pmu: Transform PMU parking code to be GT based

2023-05-19 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Trivial prep work for full multi-tile enablement later.

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Vinay Belgaumkar 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |  4 ++--
 drivers/gpu/drm/i915/i915_pmu.c   | 16 
 drivers/gpu/drm/i915/i915_pmu.h   |  9 +
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index e02cb90723ae..c2e69bafd02b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -87,7 +87,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 
intel_rc6_unpark(&gt->rc6);
intel_rps_unpark(&gt->rps);
-   i915_pmu_gt_unparked(i915);
+   i915_pmu_gt_unparked(gt);
intel_guc_busyness_unpark(gt);
 
intel_gt_unpark_requests(gt);
@@ -109,7 +109,7 @@ static int __gt_park(struct intel_wakeref *wf)
 
intel_guc_busyness_park(gt);
i915_vma_parked(gt);
-   i915_pmu_gt_parked(i915);
+   i915_pmu_gt_parked(gt);
intel_rps_park(&gt->rps);
intel_rc6_park(&gt->rc6);
 
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 6d594f67f365..890693fdaf9e 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -228,11 +228,11 @@ static void init_rc6(struct i915_pmu *pmu)
}
 }
 
-static void park_rc6(struct drm_i915_private *i915)
+static void park_rc6(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
-   pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(to_gt(i915));
+   pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(gt);
pmu->sleep_last = ktime_get_raw();
 }
 
@@ -247,16 +247,16 @@ static void __i915_pmu_maybe_start_timer(struct i915_pmu 
*pmu)
}
 }
 
-void i915_pmu_gt_parked(struct drm_i915_private *i915)
+void i915_pmu_gt_parked(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
if (!pmu->base.event_init)
return;
 
spin_lock_irq(&pmu->lock);
 
-   park_rc6(i915);
+   park_rc6(gt);
 
/*
 * Signal sampling timer to stop if only engine events are enabled and
@@ -267,9 +267,9 @@ void i915_pmu_gt_parked(struct drm_i915_private *i915)
spin_unlock_irq(&pmu->lock);
 }
 
-void i915_pmu_gt_unparked(struct drm_i915_private *i915)
+void i915_pmu_gt_unparked(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
if (!pmu->base.event_init)
return;
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index c30f43319a78..a686fd7ccedf 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -13,6 +13,7 @@
 #include 
 
 struct drm_i915_private;
+struct intel_gt;
 
 /*
  * Non-engine events that we need to track enabled-disabled transition and
@@ -151,15 +152,15 @@ int i915_pmu_init(void);
 void i915_pmu_exit(void);
 void i915_pmu_register(struct drm_i915_private *i915);
 void i915_pmu_unregister(struct drm_i915_private *i915);
-void i915_pmu_gt_parked(struct drm_i915_private *i915);
-void i915_pmu_gt_unparked(struct drm_i915_private *i915);
+void i915_pmu_gt_parked(struct intel_gt *gt);
+void i915_pmu_gt_unparked(struct intel_gt *gt);
 #else
 static inline int i915_pmu_init(void) { return 0; }
 static inline void i915_pmu_exit(void) {}
 static inline void i915_pmu_register(struct drm_i915_private *i915) {}
 static inline void i915_pmu_unregister(struct drm_i915_private *i915) {}
-static inline void i915_pmu_gt_parked(struct drm_i915_private *i915) {}
-static inline void i915_pmu_gt_unparked(struct drm_i915_private *i915) {}
+static inline void i915_pmu_gt_parked(struct intel_gt *gt) {}
+static inline void i915_pmu_gt_unparked(struct intel_gt *gt) {}
 #endif
 
 #endif
-- 
2.36.1



[Intel-gfx] [PATCH v7 7/7] drm/i915/pmu: Export counters from all tiles

2023-05-19 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Start exporting frequency and RC6 counters from all tiles.

Existing counters keep their names and config values, and new ones use the
namespace added in the previous patch, with "-gtN" appended to their
names.

The interrupts counter is the odd one out. Because it is a global device
counter (not only GT), we choose not to add per-tile versions for now.

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Aravind Iddamsetty 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 82 ++---
 1 file changed, 55 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 5cfc322e69b4..a814583e19fd 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -940,11 +940,20 @@ static const struct attribute_group 
i915_pmu_cpumask_attr_group = {
.attrs = i915_cpumask_attrs,
 };
 
-#define __event(__config, __name, __unit) \
+#define __event(__counter, __name, __unit) \
 { \
-   .config = (__config), \
+   .counter = (__counter), \
.name = (__name), \
.unit = (__unit), \
+   .global = false, \
+}
+
+#define __global_event(__counter, __name, __unit) \
+{ \
+   .counter = (__counter), \
+   .name = (__name), \
+   .unit = (__unit), \
+   .global = true, \
 }
 
 #define __engine_event(__sample, __name) \
@@ -983,15 +992,16 @@ create_event_attributes(struct i915_pmu *pmu)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
static const struct {
-   u64 config;
+   unsigned int counter;
const char *name;
const char *unit;
+   bool global;
} events[] = {
-   __event(I915_PMU_ACTUAL_FREQUENCY, "actual-frequency", "M"),
-   __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "M"),
-   __event(I915_PMU_INTERRUPTS, "interrupts", NULL),
-   __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"),
-   __event(I915_PMU_SOFTWARE_GT_AWAKE_TIME, "software-gt-awake-time", "ns"),
+   __event(0, "actual-frequency", "M"),
+   __event(1, "requested-frequency", "M"),
+   __global_event(2, "interrupts", NULL),
+   __event(3, "rc6-residency", "ns"),
+   __event(4, "software-gt-awake-time", "ns"),
};
static const struct {
enum drm_i915_pmu_engine_sample sample;
@@ -1006,12 +1016,17 @@ create_event_attributes(struct i915_pmu *pmu)
struct i915_ext_attribute *i915_attr = NULL, *i915_iter;
struct attribute **attr = NULL, **attr_iter;
struct intel_engine_cs *engine;
-   unsigned int i;
+   struct intel_gt *gt;
+   unsigned int i, j;
 
/* Count how many counters we will be exposing. */
-   for (i = 0; i < ARRAY_SIZE(events); i++) {
-   if (!config_status(i915, events[i].config))
-   count++;
+   for_each_gt(gt, i915, j) {
+   for (i = 0; i < ARRAY_SIZE(events); i++) {
+   u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+
+   if (!config_status(i915, config))
+   count++;
+   }
}
 
for_each_uabi_engine(engine, i915) {
@@ -1041,26 +1056,39 @@ create_event_attributes(struct i915_pmu *pmu)
attr_iter = attr;
 
/* Initialize supported non-engine counters. */
-   for (i = 0; i < ARRAY_SIZE(events); i++) {
-   char *str;
-
-   if (config_status(i915, events[i].config))
-   continue;
-
-   str = kstrdup(events[i].name, GFP_KERNEL);
-   if (!str)
-   goto err;
+   for_each_gt(gt, i915, j) {
+   for (i = 0; i < ARRAY_SIZE(events); i++) {
+   u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+   char *str;
 
-   *attr_iter++ = &i915_iter->attr.attr;
-   i915_iter = add_i915_attr(i915_iter, str, events[i].config);
+   if (config_status(i915, config))
+   continue;
 
-   if (events[i].unit) {
-   str = kasprintf(GFP_KERNEL, "%s.unit", events[i].name);
+   if (events[i].global || !HAS_EXTRA_GT_LIST(i915))
+   str = kstrdup(events[i].name, GFP_KERNEL);
+   else
+   str = kasprintf(GFP_KERNEL, "%s-gt%u",
+   events[

[Intel-gfx] [PATCH v7 0/7] Add MTL PMU support for multi-gt

2023-05-19 Thread Umesh Nerlige Ramappa
With MTL, frequency and rc6 counters are specific to a gt. Export these
counters via gt-specific events to the user space.

v2: Remove aggregation support from kernel
v3: Review comments (Ashutosh, Tvrtko)
v4:
- Include R-b for 6/6
- Add Test-with
- Fix versioning info in cover letter
v5:
- Include "drm/i915/pmu: Change bitmask of enabled events to u32"
v6: s/u64/u32 (Ashutosh)
v7: CI rerun with updated IGT

Signed-off-by: Umesh Nerlige Ramappa 
Test-with: 20230519154650.3751855-1-umesh.nerlige.rama...@intel.com

Tvrtko Ursulin (7):
  drm/i915/pmu: Change bitmask of enabled events to u32
  drm/i915/pmu: Support PMU for all engines
  drm/i915/pmu: Skip sampling engines with no enabled counters
  drm/i915/pmu: Transform PMU parking code to be GT based
  drm/i915/pmu: Add reference counting to the sampling timer
  drm/i915/pmu: Prepare for multi-tile non-engine counters
  drm/i915/pmu: Export counters from all tiles

 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   4 +-
 drivers/gpu/drm/i915/i915_pmu.c   | 290 ++
 drivers/gpu/drm/i915/i915_pmu.h   |  22 +-
 include/uapi/drm/i915_drm.h   |  17 +-
 4 files changed, 238 insertions(+), 95 deletions(-)

-- 
2.36.1



[Intel-gfx] [PATCH v7 6/7] drm/i915/pmu: Prepare for multi-tile non-engine counters

2023-05-19 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Reserve some bits in the counter config namespace which will carry the
tile id and prepare the code to handle this.

No per tile counters have been added yet.

v2:
- Fix checkpatch issues
- Use 4 bits for gt id in non-engine counters. Drop FIXME.
- Set MAX GTs to 4. Drop FIXME.

v3: (Ashutosh, Tvrtko)
- Drop BUG_ON that would never fire
- Make enable u64
- Pull in some code from next patch

v4: Set I915_PMU_MAX_GTS to 2 (Tvrtko)

v5: s/u64/u32 where needed (Ashutosh)

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 146 +++-
 drivers/gpu/drm/i915/i915_pmu.h |   9 +-
 include/uapi/drm/i915_drm.h |  17 +++-
 3 files changed, 127 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index ecb57a94143e..5cfc322e69b4 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -56,11 +56,21 @@ static bool is_engine_config(const u64 config)
return config < __I915_PMU_OTHER(0);
 }
 
+static unsigned int config_gt_id(const u64 config)
+{
+   return config >> __I915_PMU_GT_SHIFT;
+}
+
+static u64 config_counter(const u64 config)
+{
+   return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
+}
+
 static unsigned int other_bit(const u64 config)
 {
unsigned int val;
 
-   switch (config) {
+   switch (config_counter(config)) {
case I915_PMU_ACTUAL_FREQUENCY:
val =  __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
break;
@@ -78,7 +88,9 @@ static unsigned int other_bit(const u64 config)
return -1;
}
 
-   return I915_ENGINE_SAMPLE_COUNT + val;
+   return I915_ENGINE_SAMPLE_COUNT +
+  config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
+  val;
 }
 
 static unsigned int config_bit(const u64 config)
@@ -115,6 +127,18 @@ static unsigned int event_bit(struct perf_event *event)
return config_bit(event->attr.config);
 }
 
+static u32 frequency_enabled_mask(void)
+{
+   unsigned int i;
+   u32 mask = 0;
+
+   for (i = 0; i < I915_PMU_MAX_GTS; i++)
+   mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
+   config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
+
+   return mask;
+}
+
 static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
@@ -131,9 +155,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool 
gpu_active)
 * Mask out all the ones which do not need the timer, or in
 * other words keep all the ones that could need the timer.
 */
-   enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
- config_mask(I915_PMU_REQUESTED_FREQUENCY) |
- ENGINE_SAMPLE_MASK;
+   enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;
 
/*
 * When the GPU is idle per-engine counters do not need to be
@@ -175,9 +197,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
 }
 
+static unsigned int
+__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
+
+   GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
+
+   return idx;
+}
+
+static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
+}
+
+static void
+store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
+}
+
+static void
+add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, u32 mul)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, mul);
+}
+
 static u64 get_rc6(struct intel_gt *gt)
 {
struct drm_i915_private *i915 = gt->i915;
+   const unsigned int gt_id = gt->info.id;
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
bool awake = false;
@@ -192,7 +242,7 @@ static u64 get_rc6(struct intel_gt *gt)
spin_lock_irqsave(&pmu->lock, flags);
 
if (awake) {
-   pmu->sample[__I915_SAMPLE_RC6].cur = val;
+   store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
} else {
/*
 * We think we are runtime suspended.
@@ -201,14 +251,14 @@ static u64 get_rc6(struct intel_gt *gt)
 * on top of the last known real value, as the approximated RC6
 * counter value.
 */
-   val = ktime_since_raw(pmu->sleep_last);
-   val += pmu->sample[__I915_SAMPLE_RC6].cur;
+   val = ktime_since_raw(pmu->slee

[Intel-gfx] [PATCH v7 1/7] drm/i915/pmu: Change bitmask of enabled events to u32

2023-05-19 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Having it as u64 was a confusing (but harmless) mistake.

Also add some asserts to make sure the internal field does not overflow
in the future.

v2: Fix WARN_ON firing for INTERRUPT event (Umesh)

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 7ece883a7d95..96543dce2db1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -50,7 +50,7 @@ static u8 engine_event_instance(struct perf_event *event)
return (event->attr.config >> I915_PMU_SAMPLE_BITS) & 0xff;
 }
 
-static bool is_engine_config(u64 config)
+static bool is_engine_config(const u64 config)
 {
return config < __I915_PMU_OTHER(0);
 }
@@ -88,9 +88,20 @@ static unsigned int config_bit(const u64 config)
return other_bit(config);
 }
 
-static u64 config_mask(u64 config)
+static u32 config_mask(const u64 config)
 {
-   return BIT_ULL(config_bit(config));
+   unsigned int bit = config_bit(config);
+
+   if (__builtin_constant_p(config))
+   BUILD_BUG_ON(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);
+   else
+   WARN_ON_ONCE(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);
+
+   return BIT(config_bit(config));
 }
 
 static bool is_engine_event(struct perf_event *event)
@@ -633,11 +644,10 @@ static void i915_pmu_enable(struct perf_event *event)
 {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
-   unsigned int bit;
 
-   bit = event_bit(event);
if (bit == -1)
goto update;
 
@@ -651,7 +661,7 @@ static void i915_pmu_enable(struct perf_event *event)
GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
GEM_BUG_ON(pmu->enable_count[bit] == ~0);
 
-   pmu->enable |= BIT_ULL(bit);
+   pmu->enable |= BIT(bit);
pmu->enable_count[bit]++;
 
/*
@@ -698,7 +708,7 @@ static void i915_pmu_disable(struct perf_event *event)
 {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
-   unsigned int bit = event_bit(event);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
 
@@ -734,7 +744,7 @@ static void i915_pmu_disable(struct perf_event *event)
 * bitmask when the last listener on an event goes away.
 */
if (--pmu->enable_count[bit] == 0) {
-   pmu->enable &= ~BIT_ULL(bit);
+   pmu->enable &= ~BIT(bit);
pmu->timer_enabled &= pmu_needs_timer(pmu, true);
}
 
-- 
2.36.1



[Intel-gfx] [PATCH v7 3/7] drm/i915/pmu: Skip sampling engines with no enabled counters

2023-05-19 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

As we have more and more engines, do not waste time sampling the ones that
no one is monitoring.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 9edf87ee5d10..6d594f67f365 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -350,6 +350,9 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
return;
 
for_each_engine(engine, gt, id) {
+   if (!engine->pmu.enable)
+   continue;
+
if (!intel_engine_pm_get_if_awake(engine))
continue;
 
-- 
2.36.1



[Intel-gfx] [PATCH v7 2/7] drm/i915/pmu: Support PMU for all engines

2023-05-19 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Given how the metrics are already exported, we also need to run sampling
over engines from all GTs.

Problem of GT frequencies is left for later.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 96543dce2db1..9edf87ee5d10 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -10,6 +10,7 @@
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_regs.h"
 #include "gt/intel_engine_user.h"
+#include "gt/intel_gt.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_regs.h"
 #include "gt/intel_rc6.h"
@@ -425,8 +426,9 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
struct drm_i915_private *i915 =
container_of(hrtimer, struct drm_i915_private, pmu.timer);
struct i915_pmu *pmu = &i915->pmu;
-   struct intel_gt *gt = to_gt(i915);
unsigned int period_ns;
+   struct intel_gt *gt;
+   unsigned int i;
ktime_t now;
 
if (!READ_ONCE(pmu->timer_enabled))
@@ -442,8 +444,13 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
 * grabbing the forcewake. However the potential error from timer call-
 * back delay greatly dominates this so we keep it simple.
 */
-   engines_sample(gt, period_ns);
-   frequency_sample(gt, period_ns);
+
+   for_each_gt(gt, i915, i) {
+   engines_sample(gt, period_ns);
+
+   if (i == 0) /* FIXME */
+   frequency_sample(gt, period_ns);
+   }
 
hrtimer_forward(hrtimer, now, ns_to_ktime(PERIOD));
 
-- 
2.36.1



[Intel-gfx] [PATCH v6 5/7] drm/i915/pmu: Add reference counting to the sampling timer

2023-05-17 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

We do not want to have timers per tile and waste CPU cycles and energy via
multiple wake-up sources, for a relatively un-important task of PMU
sampling, so keeping a single timer works well. But we also do not want
the first GT which goes idle to turn off the timer.

Add some reference counting, via a mask of unparked GTs, to solve this.

v2: Drop the check for unparked in i915_sample (Ashutosh)
v3: Revert v2 (Tvrtko)

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 12 ++--
 drivers/gpu/drm/i915/i915_pmu.h |  4 
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 890693fdaf9e..ecb57a94143e 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -262,7 +262,9 @@ void i915_pmu_gt_parked(struct intel_gt *gt)
 * Signal sampling timer to stop if only engine events are enabled and
 * GPU went idle.
 */
-   pmu->timer_enabled = pmu_needs_timer(pmu, false);
+   pmu->unparked &= ~BIT(gt->info.id);
+   if (pmu->unparked == 0)
+   pmu->timer_enabled = pmu_needs_timer(pmu, false);
 
spin_unlock_irq(&pmu->lock);
 }
@@ -279,7 +281,10 @@ void i915_pmu_gt_unparked(struct intel_gt *gt)
/*
 * Re-enable sampling timer when GPU goes active.
 */
-   __i915_pmu_maybe_start_timer(pmu);
+   if (pmu->unparked == 0)
+   __i915_pmu_maybe_start_timer(pmu);
+
+   pmu->unparked |= BIT(gt->info.id);
 
spin_unlock_irq(&pmu->lock);
 }
@@ -449,6 +454,9 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
 */
 
for_each_gt(gt, i915, i) {
+   if (!(pmu->unparked & BIT(i)))
+   continue;
+
engines_sample(gt, period_ns);
 
if (i == 0) /* FIXME */
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index a686fd7ccedf..3a811266ac6a 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -76,6 +76,10 @@ struct i915_pmu {
 * @lock: Lock protecting enable mask and ref count handling.
 */
spinlock_t lock;
+   /**
+* @unparked: GT unparked mask.
+*/
+   unsigned int unparked;
/**
 * @timer: Timer for internal i915 PMU sampling.
 */
-- 
2.36.1



[Intel-gfx] [PATCH v6 4/7] drm/i915/pmu: Transform PMU parking code to be GT based

2023-05-17 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Trivial prep work for full multi-tile enablement later.

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Vinay Belgaumkar 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |  4 ++--
 drivers/gpu/drm/i915/i915_pmu.c   | 16 
 drivers/gpu/drm/i915/i915_pmu.h   |  9 +
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index e02cb90723ae..c2e69bafd02b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -87,7 +87,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 
intel_rc6_unpark(&gt->rc6);
intel_rps_unpark(&gt->rps);
-   i915_pmu_gt_unparked(i915);
+   i915_pmu_gt_unparked(gt);
intel_guc_busyness_unpark(gt);
 
intel_gt_unpark_requests(gt);
@@ -109,7 +109,7 @@ static int __gt_park(struct intel_wakeref *wf)
 
intel_guc_busyness_park(gt);
i915_vma_parked(gt);
-   i915_pmu_gt_parked(i915);
+   i915_pmu_gt_parked(gt);
intel_rps_park(&gt->rps);
intel_rc6_park(&gt->rc6);
 
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 6d594f67f365..890693fdaf9e 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -228,11 +228,11 @@ static void init_rc6(struct i915_pmu *pmu)
}
 }
 
-static void park_rc6(struct drm_i915_private *i915)
+static void park_rc6(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
-   pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(to_gt(i915));
+   pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(gt);
pmu->sleep_last = ktime_get_raw();
 }
 
@@ -247,16 +247,16 @@ static void __i915_pmu_maybe_start_timer(struct i915_pmu 
*pmu)
}
 }
 
-void i915_pmu_gt_parked(struct drm_i915_private *i915)
+void i915_pmu_gt_parked(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
if (!pmu->base.event_init)
return;
 
spin_lock_irq(&pmu->lock);
 
-   park_rc6(i915);
+   park_rc6(gt);
 
/*
 * Signal sampling timer to stop if only engine events are enabled and
@@ -267,9 +267,9 @@ void i915_pmu_gt_parked(struct drm_i915_private *i915)
spin_unlock_irq(&pmu->lock);
 }
 
-void i915_pmu_gt_unparked(struct drm_i915_private *i915)
+void i915_pmu_gt_unparked(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
if (!pmu->base.event_init)
return;
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index c30f43319a78..a686fd7ccedf 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -13,6 +13,7 @@
 #include 
 
 struct drm_i915_private;
+struct intel_gt;
 
 /*
  * Non-engine events that we need to track enabled-disabled transition and
@@ -151,15 +152,15 @@ int i915_pmu_init(void);
 void i915_pmu_exit(void);
 void i915_pmu_register(struct drm_i915_private *i915);
 void i915_pmu_unregister(struct drm_i915_private *i915);
-void i915_pmu_gt_parked(struct drm_i915_private *i915);
-void i915_pmu_gt_unparked(struct drm_i915_private *i915);
+void i915_pmu_gt_parked(struct intel_gt *gt);
+void i915_pmu_gt_unparked(struct intel_gt *gt);
 #else
 static inline int i915_pmu_init(void) { return 0; }
 static inline void i915_pmu_exit(void) {}
 static inline void i915_pmu_register(struct drm_i915_private *i915) {}
 static inline void i915_pmu_unregister(struct drm_i915_private *i915) {}
-static inline void i915_pmu_gt_parked(struct drm_i915_private *i915) {}
-static inline void i915_pmu_gt_unparked(struct drm_i915_private *i915) {}
+static inline void i915_pmu_gt_parked(struct intel_gt *gt) {}
+static inline void i915_pmu_gt_unparked(struct intel_gt *gt) {}
 #endif
 
 #endif
-- 
2.36.1



[Intel-gfx] [PATCH v6 6/7] drm/i915/pmu: Prepare for multi-tile non-engine counters

2023-05-17 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Reserve some bits in the counter config namespace which will carry the
tile id and prepare the code to handle this.

No per tile counters have been added yet.

v2:
- Fix checkpatch issues
- Use 4 bits for gt id in non-engine counters. Drop FIXME.
- Set MAX GTs to 4. Drop FIXME.

v3: (Ashutosh, Tvrtko)
- Drop BUG_ON that would never fire
- Make enable u64
- Pull in some code from next patch

v4: Set I915_PMU_MAX_GTS to 2 (Tvrtko)

v5: s/u64/u32 where needed (Ashutosh)

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 146 +++-
 drivers/gpu/drm/i915/i915_pmu.h |   9 +-
 include/uapi/drm/i915_drm.h |  17 +++-
 3 files changed, 127 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index ecb57a94143e..5cfc322e69b4 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -56,11 +56,21 @@ static bool is_engine_config(const u64 config)
return config < __I915_PMU_OTHER(0);
 }
 
+static unsigned int config_gt_id(const u64 config)
+{
+   return config >> __I915_PMU_GT_SHIFT;
+}
+
+static u64 config_counter(const u64 config)
+{
+   return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
+}
+
 static unsigned int other_bit(const u64 config)
 {
unsigned int val;
 
-   switch (config) {
+   switch (config_counter(config)) {
case I915_PMU_ACTUAL_FREQUENCY:
val =  __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
break;
@@ -78,7 +88,9 @@ static unsigned int other_bit(const u64 config)
return -1;
}
 
-   return I915_ENGINE_SAMPLE_COUNT + val;
+   return I915_ENGINE_SAMPLE_COUNT +
+  config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
+  val;
 }
 
 static unsigned int config_bit(const u64 config)
@@ -115,6 +127,18 @@ static unsigned int event_bit(struct perf_event *event)
return config_bit(event->attr.config);
 }
 
+static u32 frequency_enabled_mask(void)
+{
+   unsigned int i;
+   u32 mask = 0;
+
+   for (i = 0; i < I915_PMU_MAX_GTS; i++)
+   mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
+   config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
+
+   return mask;
+}
+
 static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
@@ -131,9 +155,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool 
gpu_active)
 * Mask out all the ones which do not need the timer, or in
 * other words keep all the ones that could need the timer.
 */
-   enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
- config_mask(I915_PMU_REQUESTED_FREQUENCY) |
- ENGINE_SAMPLE_MASK;
+   enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;
 
/*
 * When the GPU is idle per-engine counters do not need to be
@@ -175,9 +197,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
 }
 
+static unsigned int
+__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
+
+   GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
+
+   return idx;
+}
+
+static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
+}
+
+static void
+store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
+}
+
+static void
+add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, u32 mul)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, mul);
+}
+
 static u64 get_rc6(struct intel_gt *gt)
 {
struct drm_i915_private *i915 = gt->i915;
+   const unsigned int gt_id = gt->info.id;
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
bool awake = false;
@@ -192,7 +242,7 @@ static u64 get_rc6(struct intel_gt *gt)
spin_lock_irqsave(&pmu->lock, flags);
 
if (awake) {
-   pmu->sample[__I915_SAMPLE_RC6].cur = val;
+   store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
} else {
/*
 * We think we are runtime suspended.
@@ -201,14 +251,14 @@ static u64 get_rc6(struct intel_gt *gt)
 * on top of the last known real value, as the approximated RC6
 * counter value.
 */
-   val = ktime_since_raw(pmu->sleep_last);
-   val += pmu->sample[__I915_SAMPLE_RC6].cur;
+   val = ktime_since_raw(pmu->sleep_last[gt_id]);
+ 

[Intel-gfx] [PATCH v6 3/7] drm/i915/pmu: Skip sampling engines with no enabled counters

2023-05-17 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

As we have more and more engines, do not waste time sampling the ones that
no one is monitoring.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 9edf87ee5d10..6d594f67f365 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -350,6 +350,9 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
return;
 
for_each_engine(engine, gt, id) {
+   if (!engine->pmu.enable)
+   continue;
+
if (!intel_engine_pm_get_if_awake(engine))
continue;
 
-- 
2.36.1



[Intel-gfx] [PATCH v6 0/7] Add MTL PMU support for multi-gt

2023-05-17 Thread Umesh Nerlige Ramappa
With MTL, frequency and rc6 counters are specific to a gt. Export these
counters via gt-specific events to the user space.

v2: Remove aggregation support from kernel
v3: Review comments (Ashutosh, Tvrtko)
v4:
- Include R-b for 6/6
- Add Test-with
- Fix versioning info in cover letter
v5:
- Include "drm/i915/pmu: Change bitmask of enabled events to u32"

Signed-off-by: Umesh Nerlige Ramappa 
Test-with: 20230513022234.2832233-1-umesh.nerlige.rama...@intel.com

Tvrtko Ursulin (7):
  drm/i915/pmu: Change bitmask of enabled events to u32
  drm/i915/pmu: Support PMU for all engines
  drm/i915/pmu: Skip sampling engines with no enabled counters
  drm/i915/pmu: Transform PMU parking code to be GT based
  drm/i915/pmu: Add reference counting to the sampling timer
  drm/i915/pmu: Prepare for multi-tile non-engine counters
  drm/i915/pmu: Export counters from all tiles

 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   4 +-
 drivers/gpu/drm/i915/i915_pmu.c   | 290 ++
 drivers/gpu/drm/i915/i915_pmu.h   |  22 +-
 include/uapi/drm/i915_drm.h   |  17 +-
 4 files changed, 238 insertions(+), 95 deletions(-)

-- 
2.36.1



[Intel-gfx] [PATCH v6 1/7] drm/i915/pmu: Change bitmask of enabled events to u32

2023-05-17 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Having it as u64 was a confusing (but harmless) mistake.

Also add some asserts to make sure the internal field does not overflow
in the future.

v2: Fix WARN_ON firing for INTERRUPT event (Umesh)

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 7ece883a7d95..96543dce2db1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -50,7 +50,7 @@ static u8 engine_event_instance(struct perf_event *event)
return (event->attr.config >> I915_PMU_SAMPLE_BITS) & 0xff;
 }
 
-static bool is_engine_config(u64 config)
+static bool is_engine_config(const u64 config)
 {
return config < __I915_PMU_OTHER(0);
 }
@@ -88,9 +88,20 @@ static unsigned int config_bit(const u64 config)
return other_bit(config);
 }
 
-static u64 config_mask(u64 config)
+static u32 config_mask(const u64 config)
 {
-   return BIT_ULL(config_bit(config));
+   unsigned int bit = config_bit(config);
+
+   if (__builtin_constant_p(config))
+   BUILD_BUG_ON(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);
+   else
+   WARN_ON_ONCE(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);
+
+   return BIT(config_bit(config));
 }
 
 static bool is_engine_event(struct perf_event *event)
@@ -633,11 +644,10 @@ static void i915_pmu_enable(struct perf_event *event)
 {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
-   unsigned int bit;
 
-   bit = event_bit(event);
if (bit == -1)
goto update;
 
@@ -651,7 +661,7 @@ static void i915_pmu_enable(struct perf_event *event)
GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
GEM_BUG_ON(pmu->enable_count[bit] == ~0);
 
-   pmu->enable |= BIT_ULL(bit);
+   pmu->enable |= BIT(bit);
pmu->enable_count[bit]++;
 
/*
@@ -698,7 +708,7 @@ static void i915_pmu_disable(struct perf_event *event)
 {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
-   unsigned int bit = event_bit(event);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
 
@@ -734,7 +744,7 @@ static void i915_pmu_disable(struct perf_event *event)
 * bitmask when the last listener on an event goes away.
 */
if (--pmu->enable_count[bit] == 0) {
-   pmu->enable &= ~BIT_ULL(bit);
+   pmu->enable &= ~BIT(bit);
pmu->timer_enabled &= pmu_needs_timer(pmu, true);
}
 
-- 
2.36.1



[Intel-gfx] [PATCH v6 2/7] drm/i915/pmu: Support PMU for all engines

2023-05-17 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Given how the metrics are already exported, we also need to run sampling
over engines from all GTs.

Problem of GT frequencies is left for later.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 96543dce2db1..9edf87ee5d10 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -10,6 +10,7 @@
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_regs.h"
 #include "gt/intel_engine_user.h"
+#include "gt/intel_gt.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_regs.h"
 #include "gt/intel_rc6.h"
@@ -425,8 +426,9 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
struct drm_i915_private *i915 =
container_of(hrtimer, struct drm_i915_private, pmu.timer);
struct i915_pmu *pmu = &i915->pmu;
-   struct intel_gt *gt = to_gt(i915);
unsigned int period_ns;
+   struct intel_gt *gt;
+   unsigned int i;
ktime_t now;
 
if (!READ_ONCE(pmu->timer_enabled))
@@ -442,8 +444,13 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
 * grabbing the forcewake. However the potential error from timer call-
 * back delay greatly dominates this so we keep it simple.
 */
-   engines_sample(gt, period_ns);
-   frequency_sample(gt, period_ns);
+
+   for_each_gt(gt, i915, i) {
+   engines_sample(gt, period_ns);
+
+   if (i == 0) /* FIXME */
+   frequency_sample(gt, period_ns);
+   }
 
hrtimer_forward(hrtimer, now, ns_to_ktime(PERIOD));
 
-- 
2.36.1



[Intel-gfx] [PATCH v6 7/7] drm/i915/pmu: Export counters from all tiles

2023-05-17 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Start exporting frequency and RC6 counters from all tiles.

Existing counters keep their names and config values and new ones use the
namespace added in the previous patch, with the "-gtN" added to their
names.

The interrupts counter is the odd one out. Because it is a global device
counter (not only GT), we choose not to add per-tile versions for now.

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Aravind Iddamsetty 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 82 ++---
 1 file changed, 55 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 5cfc322e69b4..a814583e19fd 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -940,11 +940,20 @@ static const struct attribute_group 
i915_pmu_cpumask_attr_group = {
.attrs = i915_cpumask_attrs,
 };
 
-#define __event(__config, __name, __unit) \
+#define __event(__counter, __name, __unit) \
 { \
-   .config = (__config), \
+   .counter = (__counter), \
.name = (__name), \
.unit = (__unit), \
+   .global = false, \
+}
+
+#define __global_event(__counter, __name, __unit) \
+{ \
+   .counter = (__counter), \
+   .name = (__name), \
+   .unit = (__unit), \
+   .global = true, \
 }
 
 #define __engine_event(__sample, __name) \
@@ -983,15 +992,16 @@ create_event_attributes(struct i915_pmu *pmu)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
static const struct {
-   u64 config;
+   unsigned int counter;
const char *name;
const char *unit;
+   bool global;
} events[] = {
-   __event(I915_PMU_ACTUAL_FREQUENCY, "actual-frequency", "M"),
-   __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", 
"M"),
-   __event(I915_PMU_INTERRUPTS, "interrupts", NULL),
-   __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"),
-   __event(I915_PMU_SOFTWARE_GT_AWAKE_TIME, 
"software-gt-awake-time", "ns"),
+   __event(0, "actual-frequency", "M"),
+   __event(1, "requested-frequency", "M"),
+   __global_event(2, "interrupts", NULL),
+   __event(3, "rc6-residency", "ns"),
+   __event(4, "software-gt-awake-time", "ns"),
};
static const struct {
enum drm_i915_pmu_engine_sample sample;
@@ -1006,12 +1016,17 @@ create_event_attributes(struct i915_pmu *pmu)
struct i915_ext_attribute *i915_attr = NULL, *i915_iter;
struct attribute **attr = NULL, **attr_iter;
struct intel_engine_cs *engine;
-   unsigned int i;
+   struct intel_gt *gt;
+   unsigned int i, j;
 
/* Count how many counters we will be exposing. */
-   for (i = 0; i < ARRAY_SIZE(events); i++) {
-   if (!config_status(i915, events[i].config))
-   count++;
+   for_each_gt(gt, i915, j) {
+   for (i = 0; i < ARRAY_SIZE(events); i++) {
+   u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+
+   if (!config_status(i915, config))
+   count++;
+   }
}
 
for_each_uabi_engine(engine, i915) {
@@ -1041,26 +1056,39 @@ create_event_attributes(struct i915_pmu *pmu)
attr_iter = attr;
 
/* Initialize supported non-engine counters. */
-   for (i = 0; i < ARRAY_SIZE(events); i++) {
-   char *str;
-
-   if (config_status(i915, events[i].config))
-   continue;
-
-   str = kstrdup(events[i].name, GFP_KERNEL);
-   if (!str)
-   goto err;
+   for_each_gt(gt, i915, j) {
+   for (i = 0; i < ARRAY_SIZE(events); i++) {
+   u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+   char *str;
 
-   *attr_iter++ = &i915_iter->attr.attr;
-   i915_iter = add_i915_attr(i915_iter, str, events[i].config);
+   if (config_status(i915, config))
+   continue;
 
-   if (events[i].unit) {
-   str = kasprintf(GFP_KERNEL, "%s.unit", events[i].name);
+   if (events[i].global || !HAS_EXTRA_GT_LIST(i915))
+   str = kstrdup(events[i].name, GFP_KERNEL);
+   else
+   str = kasprintf(GFP_KERNEL, "%s-gt%u",
+   events[

Re: [Intel-gfx] [PATCH v5 1/7] drm/i915/pmu: Change bitmask of enabled events to u32

2023-05-17 Thread Umesh Nerlige Ramappa

On Wed, May 17, 2023 at 09:25:03AM -0700, Dixit, Ashutosh wrote:

On Wed, 17 May 2023 01:26:15 -0700, Tvrtko Ursulin wrote:



On 17/05/2023 07:55, Umesh Nerlige Ramappa wrote:
> On Tue, May 16, 2023 at 05:25:50PM -0700, Dixit, Ashutosh wrote:
>> On Tue, 16 May 2023 16:35:28 -0700, Umesh Nerlige Ramappa wrote:
>>>
>>
>> Hi Umesh/Tvrtko,
>>
>> Mostly repeating comments/questions made on the previous patch below.

First of all thanks for improving this, my v1 obviously wasn't good enough.

>>
>>> From: Tvrtko Ursulin 
>>>
>>> Having it as u64 was a confusing (but harmless) mistake.
>>>
>>> Also add some asserts to make sure the internal field does not overflow
>>> in the future.
>>>
>>> v2: Fix WARN_ON firing for INTERRUPT event (Umesh)
>>>
>>> Signed-off-by: Tvrtko Ursulin 
>>> Signed-off-by: Umesh Nerlige Ramappa 
>>> Cc: Ashutosh Dixit 
>>> ---
>>>  drivers/gpu/drm/i915/i915_pmu.c | 26 ++
>>>  1 file changed, 18 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_pmu.c
>>> b/drivers/gpu/drm/i915/i915_pmu.c
>>> index 7ece883a7d95..96543dce2db1 100644
>>> --- a/drivers/gpu/drm/i915/i915_pmu.c
>>> +++ b/drivers/gpu/drm/i915/i915_pmu.c
>>> @@ -50,7 +50,7 @@ static u8 engine_event_instance(struct perf_event
>>> *event)
>>> return (event->attr.config >> I915_PMU_SAMPLE_BITS) & 0xff;
>>>  }
>>>
>>> -static bool is_engine_config(u64 config)
>>> +static bool is_engine_config(const u64 config)
>>>  {
>>> return config < __I915_PMU_OTHER(0);
>>>  }
>>> @@ -88,9 +88,20 @@ static unsigned int config_bit(const u64 config)
>>>     return other_bit(config);
>>>  }
>>>
>>> -static u64 config_mask(u64 config)
>>> +static u32 config_mask(const u64 config)
>>>  {
>>> -    return BIT_ULL(config_bit(config));
>>> +    unsigned int bit = config_bit(config);
>>
>> Given that config_bit() can return -1 (I understand it is avoided in
>> moving
>> the code to config_mask from config_bit), maybe the code below should
>> also
>> have that check?
>
> config_mask is only called to check frequency related events in the code,
> so I don't see it returning -1 here.

Yeah that should be fine since -1 would make the below asserts fire
anyway. (If it would get called from a different path in the future.)

>>
>> int bit = config_bit(config);
>>
>> if (bit != -1)
>> {
>>     ...
>> }
>>
>> Though as mentioned below the 'if (__builtin_constant_p())' would have to
>> go. Maybe the code could even have stayed in config_bit with the check.
>>
>>> +
>>> +    if (__builtin_constant_p(config))
>>> +    BUILD_BUG_ON(bit >
>>> + BITS_PER_TYPE(typeof_member(struct i915_pmu,
>>> + enable)) - 1);
>>
>> Given that config comes from the event (it is event->attr.config), can
>> this
>> ever be a builtin constant?
>
> Not sure about earlier code where these checks were inside config_bit(),
> but with changes I made, I don't see this being a builtin
> constant. However, nothing prevents a caller from just passing a
> builtin_constant to this in future.

Are you sure? I would have thought it would always be a compile time
constant now that the check is in config_mask. Aahhh.. with the multi-tile
changes maybe it can't unroll the loops and calculate the masks at compile
time. Maybe it is a bit too much and we should drop the
__builtin_constant_p branch? Probably..


Ah yes, with the code move to config_mask, they really all are compile time
constants (provided compiler can unroll the loops) so at least that is the
justification for leaving the __builtin_constant_p in. So I'd probably just
leave it as is (though it is a bit too much).


But I guess it is safe to use GEM_WARN_ON_ONCE instead of WARN_ON_ONCE
since there are no external callers (nothing coming from event) now. That
way at least production builds don't have to have the check.


Hmm, there's a GEM_WARN_ON but no GEM_WARN_ON_ONCE. So leave that as is too
I guess.

So I'm ok with the code staying as is. Enough bike-shed on this already.


Leaving it as is. @Ashutosh, okay to use your R-b without any changes to 
this patch?


Thanks,
Umesh



Thanks.
--
Ashutosh




Regards,

Tvrtko

>
> Thanks,
> Umesh
>
>>
>>> +    else
>>> +    WARN_ON_ONCE(bit >
>>> +

Re: [Intel-gfx] [PATCH v5 6/7] drm/i915/pmu: Prepare for multi-tile non-engine counters

2023-05-16 Thread Umesh Nerlige Ramappa

On Tue, May 16, 2023 at 05:39:02PM -0700, Dixit, Ashutosh wrote:

On Tue, 16 May 2023 16:35:33 -0700, Umesh Nerlige Ramappa wrote:




Hi Umesh,


+static u64 frequency_enabled_mask(void)


u32


+{
+   unsigned int i;
+   u64 mask = 0;


u32


+
+   for (i = 0; i < I915_PMU_MAX_GTS; i++)
+   mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
+   config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
+
+   return mask;
+}
+
 static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
-   u32 enable;
+   u64 enable;


u32



/*
 * Only some counters need the sampling timer.
@@ -131,9 +155,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool 
gpu_active)
 * Mask out all the ones which do not need the timer, or in
 * other words keep all the ones that could need the timer.
 */
-   enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
- config_mask(I915_PMU_REQUESTED_FREQUENCY) |
- ENGINE_SAMPLE_MASK;
+   enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;

/*
 * When the GPU is idle per-engine counters do not need to be


/snip/


diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index 3a811266ac6a..f88de9ae1ebb 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -38,13 +38,16 @@ enum {
__I915_NUM_PMU_SAMPLERS
 };

+#define I915_PMU_MAX_GTS 2
+
 /*
  * How many different events we track in the global PMU mask.
  *
  * It is also used to know to needed number of event reference counters.
  */
 #define I915_PMU_MASK_BITS \
-   (I915_ENGINE_SAMPLE_COUNT + __I915_PMU_TRACKED_EVENT_COUNT)
+   (I915_ENGINE_SAMPLE_COUNT + \
+I915_PMU_MAX_GTS * __I915_PMU_TRACKED_EVENT_COUNT)

 #define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA + 1)

@@ -95,7 +98,7 @@ struct i915_pmu {
 *
 * Low bits are engine samplers and other events continue from there.
 */
-   u32 enable;
+   u64 enable;


u32


Hmm, I missed that. Will fix.

Thanks,
Umesh


Thanks.
--
Ashutosh


Re: [Intel-gfx] [PATCH v5 1/7] drm/i915/pmu: Change bitmask of enabled events to u32

2023-05-16 Thread Umesh Nerlige Ramappa

On Tue, May 16, 2023 at 05:25:50PM -0700, Dixit, Ashutosh wrote:

On Tue, 16 May 2023 16:35:28 -0700, Umesh Nerlige Ramappa wrote:




Hi Umesh/Tvrtko,

Mostly repeating comments/questions made on the previous patch below.


From: Tvrtko Ursulin 

Having it as u64 was a confusing (but harmless) mistake.

Also add some asserts to make sure the internal field does not overflow
in the future.

v2: Fix WARN_ON firing for INTERRUPT event (Umesh)

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
Cc: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 7ece883a7d95..96543dce2db1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -50,7 +50,7 @@ static u8 engine_event_instance(struct perf_event *event)
return (event->attr.config >> I915_PMU_SAMPLE_BITS) & 0xff;
 }

-static bool is_engine_config(u64 config)
+static bool is_engine_config(const u64 config)
 {
return config < __I915_PMU_OTHER(0);
 }
@@ -88,9 +88,20 @@ static unsigned int config_bit(const u64 config)
return other_bit(config);
 }

-static u64 config_mask(u64 config)
+static u32 config_mask(const u64 config)
 {
-   return BIT_ULL(config_bit(config));
+   unsigned int bit = config_bit(config);


Given that config_bit() can return -1 (I understand it is avoided in moving
the code to config_mask from config_bit), maybe the code below should also
have that check?


config_mask is only called to check frequency related events in the 
code, so I don't see it returning -1 here.




int bit = config_bit(config);

if (bit != -1)
{
...
}

Though as mentioned below the 'if (__builtin_constant_p())' would have to
go. Maybe the code could even have stayed in config_bit with the check.


+
+   if (__builtin_constant_p(config))
+   BUILD_BUG_ON(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);


Given that config comes from the event (it is event->attr.config), can this
ever be a builtin constant?


Not sure about earlier code where these checks were inside config_bit(), 
but with changes I made, I don't see this being a builtin constant.  
However, nothing prevents a caller from just passing a builtin_constant 
to this in future.


Thanks,
Umesh




+   else
+   WARN_ON_ONCE(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);


There is really an even stricter limit on what the bit can be, which is the
total number of possible events but anyway this is good enough.

After addressing the above, this patch is:

Reviewed-by: Ashutosh Dixit 


+
+   return BIT(config_bit(config));
 }

 static bool is_engine_event(struct perf_event *event)
@@ -633,11 +644,10 @@ static void i915_pmu_enable(struct perf_event *event)
 {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
-   unsigned int bit;

-   bit = event_bit(event);
if (bit == -1)
goto update;

@@ -651,7 +661,7 @@ static void i915_pmu_enable(struct perf_event *event)
GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
GEM_BUG_ON(pmu->enable_count[bit] == ~0);

-   pmu->enable |= BIT_ULL(bit);
+   pmu->enable |= BIT(bit);
pmu->enable_count[bit]++;

/*
@@ -698,7 +708,7 @@ static void i915_pmu_disable(struct perf_event *event)
 {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
-   unsigned int bit = event_bit(event);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;

@@ -734,7 +744,7 @@ static void i915_pmu_disable(struct perf_event *event)
 * bitmask when the last listener on an event goes away.
 */
if (--pmu->enable_count[bit] == 0) {
-   pmu->enable &= ~BIT_ULL(bit);
+   pmu->enable &= ~BIT(bit);
pmu->timer_enabled &= pmu_needs_timer(pmu, true);
}

--
2.36.1



Re: [Intel-gfx] [PATCH] drm/i915/pmu: Change bitmask of enabled events to u32

2023-05-16 Thread Umesh Nerlige Ramappa

On Tue, May 16, 2023 at 03:13:01PM -0700, Umesh Nerlige Ramappa wrote:

On Tue, May 16, 2023 at 10:24:45AM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Having it as u64 was a confusing (but harmless) mistake.

Also add some asserts to make sure the internal field does not overflow
in the future.

Signed-off-by: Tvrtko Ursulin 
Cc: Ashutosh Dixit 
Cc: Umesh Nerlige Ramappa 
---
I am not entirely sure the __builtin_constant_p->BUILD_BUG_ON branch will
work with all compilers. Lets see...

Compile tested only.
---
drivers/gpu/drm/i915/i915_pmu.c | 32 ++--
1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 7ece883a7d95..8736b3418f88 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -50,7 +50,7 @@ static u8 engine_event_instance(struct perf_event *event)
return (event->attr.config >> I915_PMU_SAMPLE_BITS) & 0xff;
}

-static bool is_engine_config(u64 config)
+static bool is_engine_config(const u64 config)
{
return config < __I915_PMU_OTHER(0);
}
@@ -82,15 +82,28 @@ static unsigned int other_bit(const u64 config)

static unsigned int config_bit(const u64 config)
{
+   unsigned int bit;
+
if (is_engine_config(config))
-   return engine_config_sample(config);
+   bit = engine_config_sample(config);
else
-   return other_bit(config);
+   bit = other_bit(config);
+
+   if (__builtin_constant_p(config))
+   BUILD_BUG_ON(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);
+   else
+   WARN_ON_ONCE(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);


The else is firing for the INTERRUPT event because event_bit() also 
calls config_bit(). It would be best to move this check to 
config_mask() and leave this function as is.


I posted the modified version here - 
https://patchwork.freedesktop.org/patch/537361/?series=117843&rev=1 as 
part of the MTL PMU series so that it Tests out with IGT patches.


Thanks,
Umesh



Thanks,
Umesh


+
+   return bit;
}

-static u64 config_mask(u64 config)
+static u32 config_mask(const u64 config)
{
-   return BIT_ULL(config_bit(config));
+   return BIT(config_bit(config));
}

static bool is_engine_event(struct perf_event *event)
@@ -633,11 +646,10 @@ static void i915_pmu_enable(struct perf_event *event)
{
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
-   unsigned int bit;

-   bit = event_bit(event);
if (bit == -1)
goto update;

@@ -651,7 +663,7 @@ static void i915_pmu_enable(struct perf_event *event)
GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
GEM_BUG_ON(pmu->enable_count[bit] == ~0);

-   pmu->enable |= BIT_ULL(bit);
+   pmu->enable |= BIT(bit);
pmu->enable_count[bit]++;

/*
@@ -698,7 +710,7 @@ static void i915_pmu_disable(struct perf_event *event)
{
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
-   unsigned int bit = event_bit(event);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;

@@ -734,7 +746,7 @@ static void i915_pmu_disable(struct perf_event *event)
 * bitmask when the last listener on an event goes away.
 */
if (--pmu->enable_count[bit] == 0) {
-   pmu->enable &= ~BIT_ULL(bit);
+   pmu->enable &= ~BIT(bit);
pmu->timer_enabled &= pmu_needs_timer(pmu, true);
}

--
2.39.2



[Intel-gfx] [PATCH v5 4/7] drm/i915/pmu: Transform PMU parking code to be GT based

2023-05-16 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Trivial prep work for full multi-tile enablement later.

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Vinay Belgaumkar 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |  4 ++--
 drivers/gpu/drm/i915/i915_pmu.c   | 16 
 drivers/gpu/drm/i915/i915_pmu.h   |  9 +
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index e02cb90723ae..c2e69bafd02b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -87,7 +87,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 
intel_rc6_unpark(&gt->rc6);
intel_rps_unpark(&gt->rps);
-   i915_pmu_gt_unparked(i915);
+   i915_pmu_gt_unparked(gt);
intel_guc_busyness_unpark(gt);
 
intel_gt_unpark_requests(gt);
@@ -109,7 +109,7 @@ static int __gt_park(struct intel_wakeref *wf)
 
intel_guc_busyness_park(gt);
i915_vma_parked(gt);
-   i915_pmu_gt_parked(i915);
+   i915_pmu_gt_parked(gt);
intel_rps_park(&gt->rps);
intel_rc6_park(&gt->rc6);
 
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 6d594f67f365..890693fdaf9e 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -228,11 +228,11 @@ static void init_rc6(struct i915_pmu *pmu)
}
 }
 
-static void park_rc6(struct drm_i915_private *i915)
+static void park_rc6(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
-   pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(to_gt(i915));
+   pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(gt);
pmu->sleep_last = ktime_get_raw();
 }
 
@@ -247,16 +247,16 @@ static void __i915_pmu_maybe_start_timer(struct i915_pmu 
*pmu)
}
 }
 
-void i915_pmu_gt_parked(struct drm_i915_private *i915)
+void i915_pmu_gt_parked(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
if (!pmu->base.event_init)
return;
 
spin_lock_irq(&pmu->lock);
 
-   park_rc6(i915);
+   park_rc6(gt);
 
/*
 * Signal sampling timer to stop if only engine events are enabled and
@@ -267,9 +267,9 @@ void i915_pmu_gt_parked(struct drm_i915_private *i915)
spin_unlock_irq(&pmu->lock);
 }
 
-void i915_pmu_gt_unparked(struct drm_i915_private *i915)
+void i915_pmu_gt_unparked(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
if (!pmu->base.event_init)
return;
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index c30f43319a78..a686fd7ccedf 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -13,6 +13,7 @@
 #include 
 
 struct drm_i915_private;
+struct intel_gt;
 
 /*
  * Non-engine events that we need to track enabled-disabled transition and
@@ -151,15 +152,15 @@ int i915_pmu_init(void);
 void i915_pmu_exit(void);
 void i915_pmu_register(struct drm_i915_private *i915);
 void i915_pmu_unregister(struct drm_i915_private *i915);
-void i915_pmu_gt_parked(struct drm_i915_private *i915);
-void i915_pmu_gt_unparked(struct drm_i915_private *i915);
+void i915_pmu_gt_parked(struct intel_gt *gt);
+void i915_pmu_gt_unparked(struct intel_gt *gt);
 #else
 static inline int i915_pmu_init(void) { return 0; }
 static inline void i915_pmu_exit(void) {}
 static inline void i915_pmu_register(struct drm_i915_private *i915) {}
 static inline void i915_pmu_unregister(struct drm_i915_private *i915) {}
-static inline void i915_pmu_gt_parked(struct drm_i915_private *i915) {}
-static inline void i915_pmu_gt_unparked(struct drm_i915_private *i915) {}
+static inline void i915_pmu_gt_parked(struct intel_gt *gt) {}
+static inline void i915_pmu_gt_unparked(struct intel_gt *gt) {}
 #endif
 
 #endif
-- 
2.36.1



[Intel-gfx] [PATCH v5 5/7] drm/i915/pmu: Add reference counting to the sampling timer

2023-05-16 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

We do not want to have timers per tile and waste CPU cycles and energy via
multiple wake-up sources, for a relatively unimportant task of PMU
sampling, so keeping a single timer works well. But we also do not want
the first GT which goes idle to turn off the timer.

Add some reference counting, via a mask of unparked GTs, to solve this.

v2: Drop the check for unparked in i915_sample (Ashutosh)
v3: Revert v2 (Tvrtko)

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 12 ++--
 drivers/gpu/drm/i915/i915_pmu.h |  4 
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 890693fdaf9e..ecb57a94143e 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -262,7 +262,9 @@ void i915_pmu_gt_parked(struct intel_gt *gt)
 * Signal sampling timer to stop if only engine events are enabled and
 * GPU went idle.
 */
-   pmu->timer_enabled = pmu_needs_timer(pmu, false);
+   pmu->unparked &= ~BIT(gt->info.id);
+   if (pmu->unparked == 0)
+   pmu->timer_enabled = pmu_needs_timer(pmu, false);
 
spin_unlock_irq(&pmu->lock);
 }
@@ -279,7 +281,10 @@ void i915_pmu_gt_unparked(struct intel_gt *gt)
/*
 * Re-enable sampling timer when GPU goes active.
 */
-   __i915_pmu_maybe_start_timer(pmu);
+   if (pmu->unparked == 0)
+   __i915_pmu_maybe_start_timer(pmu);
+
+   pmu->unparked |= BIT(gt->info.id);
 
spin_unlock_irq(&pmu->lock);
 }
@@ -449,6 +454,9 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
 */
 
for_each_gt(gt, i915, i) {
+   if (!(pmu->unparked & BIT(i)))
+   continue;
+
engines_sample(gt, period_ns);
 
if (i == 0) /* FIXME */
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index a686fd7ccedf..3a811266ac6a 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -76,6 +76,10 @@ struct i915_pmu {
 * @lock: Lock protecting enable mask and ref count handling.
 */
spinlock_t lock;
+   /**
+* @unparked: GT unparked mask.
+*/
+   unsigned int unparked;
/**
 * @timer: Timer for internal i915 PMU sampling.
 */
-- 
2.36.1



[Intel-gfx] [PATCH v5 6/7] drm/i915/pmu: Prepare for multi-tile non-engine counters

2023-05-16 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Reserve some bits in the counter config namespace which will carry the
tile id and prepare the code to handle this.

No per tile counters have been added yet.

v2:
- Fix checkpatch issues
- Use 4 bits for gt id in non-engine counters. Drop FIXME.
- Set MAX GTs to 4. Drop FIXME.

v3: (Ashutosh, Tvrtko)
- Drop BUG_ON that would never fire
- Make enable u64
- Pull in some code from next patch

v4: Set I915_PMU_MAX_GTS to 2 (Tvrtko)

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 148 +++-
 drivers/gpu/drm/i915/i915_pmu.h |  11 ++-
 include/uapi/drm/i915_drm.h |  17 +++-
 3 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index ecb57a94143e..dc1ca3a15ff6 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -56,11 +56,21 @@ static bool is_engine_config(const u64 config)
return config < __I915_PMU_OTHER(0);
 }
 
+static unsigned int config_gt_id(const u64 config)
+{
+   return config >> __I915_PMU_GT_SHIFT;
+}
+
+static u64 config_counter(const u64 config)
+{
+   return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
+}
+
 static unsigned int other_bit(const u64 config)
 {
unsigned int val;
 
-   switch (config) {
+   switch (config_counter(config)) {
case I915_PMU_ACTUAL_FREQUENCY:
val =  __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
break;
@@ -78,7 +88,9 @@ static unsigned int other_bit(const u64 config)
return -1;
}
 
-   return I915_ENGINE_SAMPLE_COUNT + val;
+   return I915_ENGINE_SAMPLE_COUNT +
+  config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
+  val;
 }
 
 static unsigned int config_bit(const u64 config)
@@ -115,10 +127,22 @@ static unsigned int event_bit(struct perf_event *event)
return config_bit(event->attr.config);
 }
 
+static u64 frequency_enabled_mask(void)
+{
+   unsigned int i;
+   u64 mask = 0;
+
+   for (i = 0; i < I915_PMU_MAX_GTS; i++)
+   mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
+   config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
+
+   return mask;
+}
+
 static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
-   u32 enable;
+   u64 enable;
 
/*
 * Only some counters need the sampling timer.
@@ -131,9 +155,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool 
gpu_active)
 * Mask out all the ones which do not need the timer, or in
 * other words keep all the ones that could need the timer.
 */
-   enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
- config_mask(I915_PMU_REQUESTED_FREQUENCY) |
- ENGINE_SAMPLE_MASK;
+   enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;
 
/*
 * When the GPU is idle per-engine counters do not need to be
@@ -175,9 +197,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
 }
 
+static unsigned int
+__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
+
+   GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
+
+   return idx;
+}
+
+static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
+}
+
+static void
+store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
+}
+
+static void
+add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, 
u32 mul)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, 
mul);
+}
+
 static u64 get_rc6(struct intel_gt *gt)
 {
struct drm_i915_private *i915 = gt->i915;
+   const unsigned int gt_id = gt->info.id;
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
bool awake = false;
@@ -192,7 +242,7 @@ static u64 get_rc6(struct intel_gt *gt)
spin_lock_irqsave(&pmu->lock, flags);
 
if (awake) {
-   pmu->sample[__I915_SAMPLE_RC6].cur = val;
+   store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
} else {
/*
 * We think we are runtime suspended.
@@ -201,14 +251,14 @@ static u64 get_rc6(struct intel_gt *gt)
 * on top of the last known real value, as the approximated RC6
 * counter value.
 */
-   val = ktime_since_raw(pmu->sleep_last);
-   val += pmu->sample[__I915

[Intel-gfx] [PATCH v5 2/7] drm/i915/pmu: Support PMU for all engines

2023-05-16 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Given how the metrics are already exported, we also need to run sampling
over engines from all GTs.

Problem of GT frequencies is left for later.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 96543dce2db1..9edf87ee5d10 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -10,6 +10,7 @@
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_regs.h"
 #include "gt/intel_engine_user.h"
+#include "gt/intel_gt.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_regs.h"
 #include "gt/intel_rc6.h"
@@ -425,8 +426,9 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
struct drm_i915_private *i915 =
container_of(hrtimer, struct drm_i915_private, pmu.timer);
struct i915_pmu *pmu = &i915->pmu;
-   struct intel_gt *gt = to_gt(i915);
unsigned int period_ns;
+   struct intel_gt *gt;
+   unsigned int i;
ktime_t now;
 
if (!READ_ONCE(pmu->timer_enabled))
@@ -442,8 +444,13 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
 * grabbing the forcewake. However the potential error from timer call-
 * back delay greatly dominates this so we keep it simple.
 */
-   engines_sample(gt, period_ns);
-   frequency_sample(gt, period_ns);
+
+   for_each_gt(gt, i915, i) {
+   engines_sample(gt, period_ns);
+
+   if (i == 0) /* FIXME */
+   frequency_sample(gt, period_ns);
+   }
 
hrtimer_forward(hrtimer, now, ns_to_ktime(PERIOD));
 
-- 
2.36.1



[Intel-gfx] [PATCH v5 7/7] drm/i915/pmu: Export counters from all tiles

2023-05-16 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Start exporting frequency and RC6 counters from all tiles.

Existing counters keep their names and config values, and new ones use the
namespace added in the previous patch, with "-gtN" appended to their
names.

The interrupts counter is the odd one out. Because it is a global device
counter (not only GT), we choose not to add per-tile versions for now.

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Aravind Iddamsetty 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 82 ++---
 1 file changed, 55 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index dc1ca3a15ff6..dbb24c0c6093 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -940,11 +940,20 @@ static const struct attribute_group 
i915_pmu_cpumask_attr_group = {
.attrs = i915_cpumask_attrs,
 };
 
-#define __event(__config, __name, __unit) \
+#define __event(__counter, __name, __unit) \
 { \
-   .config = (__config), \
+   .counter = (__counter), \
.name = (__name), \
.unit = (__unit), \
+   .global = false, \
+}
+
+#define __global_event(__counter, __name, __unit) \
+{ \
+   .counter = (__counter), \
+   .name = (__name), \
+   .unit = (__unit), \
+   .global = true, \
 }
 
 #define __engine_event(__sample, __name) \
@@ -983,15 +992,16 @@ create_event_attributes(struct i915_pmu *pmu)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
static const struct {
-   u64 config;
+   unsigned int counter;
const char *name;
const char *unit;
+   bool global;
} events[] = {
-   __event(I915_PMU_ACTUAL_FREQUENCY, "actual-frequency", "M"),
-   __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", 
"M"),
-   __event(I915_PMU_INTERRUPTS, "interrupts", NULL),
-   __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"),
-   __event(I915_PMU_SOFTWARE_GT_AWAKE_TIME, 
"software-gt-awake-time", "ns"),
+   __event(0, "actual-frequency", "M"),
+   __event(1, "requested-frequency", "M"),
+   __global_event(2, "interrupts", NULL),
+   __event(3, "rc6-residency", "ns"),
+   __event(4, "software-gt-awake-time", "ns"),
};
static const struct {
enum drm_i915_pmu_engine_sample sample;
@@ -1006,12 +1016,17 @@ create_event_attributes(struct i915_pmu *pmu)
struct i915_ext_attribute *i915_attr = NULL, *i915_iter;
struct attribute **attr = NULL, **attr_iter;
struct intel_engine_cs *engine;
-   unsigned int i;
+   struct intel_gt *gt;
+   unsigned int i, j;
 
/* Count how many counters we will be exposing. */
-   for (i = 0; i < ARRAY_SIZE(events); i++) {
-   if (!config_status(i915, events[i].config))
-   count++;
+   for_each_gt(gt, i915, j) {
+   for (i = 0; i < ARRAY_SIZE(events); i++) {
+   u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+
+   if (!config_status(i915, config))
+   count++;
+   }
}
 
for_each_uabi_engine(engine, i915) {
@@ -1041,26 +1056,39 @@ create_event_attributes(struct i915_pmu *pmu)
attr_iter = attr;
 
/* Initialize supported non-engine counters. */
-   for (i = 0; i < ARRAY_SIZE(events); i++) {
-   char *str;
-
-   if (config_status(i915, events[i].config))
-   continue;
-
-   str = kstrdup(events[i].name, GFP_KERNEL);
-   if (!str)
-   goto err;
+   for_each_gt(gt, i915, j) {
+   for (i = 0; i < ARRAY_SIZE(events); i++) {
+   u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+   char *str;
 
-   *attr_iter++ = &i915_iter->attr.attr;
-   i915_iter = add_i915_attr(i915_iter, str, events[i].config);
+   if (config_status(i915, config))
+   continue;
 
-   if (events[i].unit) {
-   str = kasprintf(GFP_KERNEL, "%s.unit", events[i].name);
+   if (events[i].global || !HAS_EXTRA_GT_LIST(i915))
+   str = kstrdup(events[i].name, GFP_KERNEL);
+   else
+   str = kasprintf(GFP_KERNEL, "%s-gt%u",
+   events[

[Intel-gfx] [PATCH v5 3/7] drm/i915/pmu: Skip sampling engines with no enabled counters

2023-05-16 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

As we have more and more engines, do not waste time sampling the ones
no one is monitoring.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 9edf87ee5d10..6d594f67f365 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -350,6 +350,9 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
return;
 
for_each_engine(engine, gt, id) {
+   if (!engine->pmu.enable)
+   continue;
+
if (!intel_engine_pm_get_if_awake(engine))
continue;
 
-- 
2.36.1



[Intel-gfx] [PATCH v5 1/7] drm/i915/pmu: Change bitmask of enabled events to u32

2023-05-16 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Having it as u64 was a confusing (but harmless) mistake.

Also add some asserts to make sure the internal field does not overflow
in the future.

v2: Fix WARN_ON firing for INTERRUPT event (Umesh)

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
Cc: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 7ece883a7d95..96543dce2db1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -50,7 +50,7 @@ static u8 engine_event_instance(struct perf_event *event)
return (event->attr.config >> I915_PMU_SAMPLE_BITS) & 0xff;
 }
 
-static bool is_engine_config(u64 config)
+static bool is_engine_config(const u64 config)
 {
return config < __I915_PMU_OTHER(0);
 }
@@ -88,9 +88,20 @@ static unsigned int config_bit(const u64 config)
return other_bit(config);
 }
 
-static u64 config_mask(u64 config)
+static u32 config_mask(const u64 config)
 {
-   return BIT_ULL(config_bit(config));
+   unsigned int bit = config_bit(config);
+
+   if (__builtin_constant_p(config))
+   BUILD_BUG_ON(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);
+   else
+   WARN_ON_ONCE(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);
+
+   return BIT(config_bit(config));
 }
 
 static bool is_engine_event(struct perf_event *event)
@@ -633,11 +644,10 @@ static void i915_pmu_enable(struct perf_event *event)
 {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
-   unsigned int bit;
 
-   bit = event_bit(event);
if (bit == -1)
goto update;
 
@@ -651,7 +661,7 @@ static void i915_pmu_enable(struct perf_event *event)
GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
GEM_BUG_ON(pmu->enable_count[bit] == ~0);
 
-   pmu->enable |= BIT_ULL(bit);
+   pmu->enable |= BIT(bit);
pmu->enable_count[bit]++;
 
/*
@@ -698,7 +708,7 @@ static void i915_pmu_disable(struct perf_event *event)
 {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
-   unsigned int bit = event_bit(event);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
 
@@ -734,7 +744,7 @@ static void i915_pmu_disable(struct perf_event *event)
 * bitmask when the last listener on an event goes away.
 */
if (--pmu->enable_count[bit] == 0) {
-   pmu->enable &= ~BIT_ULL(bit);
+   pmu->enable &= ~BIT(bit);
pmu->timer_enabled &= pmu_needs_timer(pmu, true);
}
 
-- 
2.36.1



[Intel-gfx] [PATCH v5 0/7] Add MTL PMU support for multi-gt

2023-05-16 Thread Umesh Nerlige Ramappa
With MTL, frequency and rc6 counters are specific to a gt. Export these
counters via gt-specific events to the user space.

v2: Remove aggregation support from kernel
v3: Review comments (Ashutosh, Tvrtko)
v4:
- Include R-b for 6/6
- Add Test-with
- Fix versioning info in cover letter
v5:
- Include "drm/i915/pmu: Change bitmask of enabled events to u32"

Signed-off-by: Umesh Nerlige Ramappa 
Test-with: 20230513022234.2832233-1-umesh.nerlige.rama...@intel.com

Tvrtko Ursulin (7):
  drm/i915/pmu: Change bitmask of enabled events to u32
  drm/i915/pmu: Support PMU for all engines
  drm/i915/pmu: Skip sampling engines with no enabled counters
  drm/i915/pmu: Transform PMU parking code to be GT based
  drm/i915/pmu: Add reference counting to the sampling timer
  drm/i915/pmu: Prepare for multi-tile non-engine counters
  drm/i915/pmu: Export counters from all tiles

 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   4 +-
 drivers/gpu/drm/i915/i915_pmu.c   | 292 ++
 drivers/gpu/drm/i915/i915_pmu.h   |  24 ++-
 include/uapi/drm/i915_drm.h   |  17 +-
 4 files changed, 240 insertions(+), 97 deletions(-)

-- 
2.36.1



Re: [Intel-gfx] [PATCH] drm/i915/pmu: Change bitmask of enabled events to u32

2023-05-16 Thread Umesh Nerlige Ramappa

On Tue, May 16, 2023 at 10:24:45AM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Having it as u64 was a confusing (but harmless) mistake.

Also add some asserts to make sure the internal field does not overflow
in the future.

Signed-off-by: Tvrtko Ursulin 
Cc: Ashutosh Dixit 
Cc: Umesh Nerlige Ramappa 
---
I am not entirely sure the __builtin_constant_p->BUILD_BUG_ON branch will
work with all compilers. Let's see...

Compile tested only.
---
drivers/gpu/drm/i915/i915_pmu.c | 32 ++--
1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 7ece883a7d95..8736b3418f88 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -50,7 +50,7 @@ static u8 engine_event_instance(struct perf_event *event)
return (event->attr.config >> I915_PMU_SAMPLE_BITS) & 0xff;
}

-static bool is_engine_config(u64 config)
+static bool is_engine_config(const u64 config)
{
return config < __I915_PMU_OTHER(0);
}
@@ -82,15 +82,28 @@ static unsigned int other_bit(const u64 config)

static unsigned int config_bit(const u64 config)
{
+   unsigned int bit;
+
if (is_engine_config(config))
-   return engine_config_sample(config);
+   bit = engine_config_sample(config);
else
-   return other_bit(config);
+   bit = other_bit(config);
+
+   if (__builtin_constant_p(config))
+   BUILD_BUG_ON(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);
+   else
+   WARN_ON_ONCE(bit >
+BITS_PER_TYPE(typeof_member(struct i915_pmu,
+enable)) - 1);


The else is firing for the INTERRUPT event because event_bit() also 
calls config_bit(). It would be best to move this check to config_mask() 
and leave this function as is.


Thanks,
Umesh 


+
+   return bit;
}

-static u64 config_mask(u64 config)
+static u32 config_mask(const u64 config)
{
-   return BIT_ULL(config_bit(config));
+   return BIT(config_bit(config));
}

static bool is_engine_event(struct perf_event *event)
@@ -633,11 +646,10 @@ static void i915_pmu_enable(struct perf_event *event)
{
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
-   unsigned int bit;

-   bit = event_bit(event);
if (bit == -1)
goto update;

@@ -651,7 +663,7 @@ static void i915_pmu_enable(struct perf_event *event)
GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
GEM_BUG_ON(pmu->enable_count[bit] == ~0);

-   pmu->enable |= BIT_ULL(bit);
+   pmu->enable |= BIT(bit);
pmu->enable_count[bit]++;

/*
@@ -698,7 +710,7 @@ static void i915_pmu_disable(struct perf_event *event)
{
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
-   unsigned int bit = event_bit(event);
+   const unsigned int bit = event_bit(event);
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;

@@ -734,7 +746,7 @@ static void i915_pmu_disable(struct perf_event *event)
 * bitmask when the last listener on an event goes away.
 */
if (--pmu->enable_count[bit] == 0) {
-   pmu->enable &= ~BIT_ULL(bit);
+   pmu->enable &= ~BIT(bit);
pmu->timer_enabled &= pmu_needs_timer(pmu, true);
}

--
2.39.2



Re: [Intel-gfx] [PATCH 5/6] drm/i915/pmu: Prepare for multi-tile non-engine counters

2023-05-15 Thread Umesh Nerlige Ramappa

On Mon, May 15, 2023 at 11:12:33AM +0100, Tvrtko Ursulin wrote:


On 15/05/2023 07:44, Umesh Nerlige Ramappa wrote:

From: Tvrtko Ursulin 

Reserve some bits in the counter config namespace which will carry the
tile id and prepare the code to handle this.

No per tile counters have been added yet.

v2:
- Fix checkpatch issues
- Use 4 bits for gt id in non-engine counters. Drop FIXME.
- Set MAX GTs to 4. Drop FIXME.

v3: (Ashutosh, Tvrtko)
- Drop BUG_ON that would never fire
- Make enable u64
- Pull in some code from next patch

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 148 +++-
 drivers/gpu/drm/i915/i915_pmu.h |  11 ++-
 include/uapi/drm/i915_drm.h |  17 +++-
 3 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 725b01b00775..b3dd9e51c5cc 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -56,11 +56,21 @@ static bool is_engine_config(u64 config)
return config < __I915_PMU_OTHER(0);
 }
+static unsigned int config_gt_id(const u64 config)
+{
+   return config >> __I915_PMU_GT_SHIFT;
+}
+
+static u64 config_counter(const u64 config)
+{
+   return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
+}
+
 static unsigned int other_bit(const u64 config)
 {
unsigned int val;
-   switch (config) {
+   switch (config_counter(config)) {
case I915_PMU_ACTUAL_FREQUENCY:
val =  __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
break;
@@ -78,7 +88,9 @@ static unsigned int other_bit(const u64 config)
return -1;
}
-   return I915_ENGINE_SAMPLE_COUNT + val;
+   return I915_ENGINE_SAMPLE_COUNT +
+  config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
+  val;
 }
 static unsigned int config_bit(const u64 config)
@@ -104,10 +116,22 @@ static unsigned int event_bit(struct perf_event *event)
return config_bit(event->attr.config);
 }
+static u64 frequency_enabled_mask(void)
+{
+   unsigned int i;
+   u64 mask = 0;
+
+   for (i = 0; i < I915_PMU_MAX_GTS; i++)
+   mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
+   config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
+
+   return mask;
+}
+
 static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
-   u32 enable;
+   u64 enable;
/*
 * Only some counters need the sampling timer.
@@ -120,9 +144,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool 
gpu_active)
 * Mask out all the ones which do not need the timer, or in
 * other words keep all the ones that could need the timer.
 */
-   enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
- config_mask(I915_PMU_REQUESTED_FREQUENCY) |
- ENGINE_SAMPLE_MASK;
+   enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;
/*
 * When the GPU is idle per-engine counters do not need to be
@@ -164,9 +186,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
 }
+static unsigned int
+__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
+
+   GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
+
+   return idx;
+}
+
+static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
+}
+
+static void
+store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
+}
+
+static void
+add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, 
u32 mul)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, 
mul);
+}
+
 static u64 get_rc6(struct intel_gt *gt)
 {
struct drm_i915_private *i915 = gt->i915;
+   const unsigned int gt_id = gt->info.id;
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
bool awake = false;
@@ -181,7 +231,7 @@ static u64 get_rc6(struct intel_gt *gt)
spin_lock_irqsave(&pmu->lock, flags);
if (awake) {
-   pmu->sample[__I915_SAMPLE_RC6].cur = val;
+   store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
} else {
/*
 * We think we are runtime suspended.
@@ -190,14 +240,14 @@ static u64 get_rc6(struct intel_gt *gt)
 * on top of the last known real value, as the approximated RC6
 * counter value.
 */
-   val = ktime_since_raw(pmu-

[Intel-gfx] [PATCH v4 0/6] Add MTL PMU support for multi-gt

2023-05-14 Thread Umesh Nerlige Ramappa
With MTL, frequency and rc6 counters are specific to a gt. Export these
counters via gt-specific events to the user space.

v2: Remove aggregation support from kernel
v3: Review comments (Ashutosh, Tvrtko)
v4:
- Include R-b for 6/6
- Add Test-with
- Fix versioning info in cover letter

Signed-off-by: Umesh Nerlige Ramappa 
Test-with: 20230513022234.2832233-1-umesh.nerlige.rama...@intel.com

Tvrtko Ursulin (6):
  drm/i915/pmu: Support PMU for all engines
  drm/i915/pmu: Skip sampling engines with no enabled counters
  drm/i915/pmu: Transform PMU parking code to be GT based
  drm/i915/pmu: Add reference counting to the sampling timer
  drm/i915/pmu: Prepare for multi-tile non-engine counters
  drm/i915/pmu: Export counters from all tiles

 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   4 +-
 drivers/gpu/drm/i915/i915_pmu.c   | 263 ++
 drivers/gpu/drm/i915/i915_pmu.h   |  24 ++-
 include/uapi/drm/i915_drm.h   |  17 +-
 4 files changed, 219 insertions(+), 89 deletions(-)

-- 
2.36.1



[Intel-gfx] [PATCH 3/6] drm/i915/pmu: Transform PMU parking code to be GT based

2023-05-14 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Trivial prep work for full multi-tile enablement later.

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Vinay Belgaumkar 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |  4 ++--
 drivers/gpu/drm/i915/i915_pmu.c   | 16 
 drivers/gpu/drm/i915/i915_pmu.h   |  9 +
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index e02cb90723ae..c2e69bafd02b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -87,7 +87,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 
intel_rc6_unpark(&gt->rc6);
intel_rps_unpark(&gt->rps);
-   i915_pmu_gt_unparked(i915);
+   i915_pmu_gt_unparked(gt);
intel_guc_busyness_unpark(gt);
 
intel_gt_unpark_requests(gt);
@@ -109,7 +109,7 @@ static int __gt_park(struct intel_wakeref *wf)
 
intel_guc_busyness_park(gt);
i915_vma_parked(gt);
-   i915_pmu_gt_parked(i915);
+   i915_pmu_gt_parked(gt);
intel_rps_park(&gt->rps);
intel_rc6_park(&gt->rc6);
 
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index ba769f7fc385..2b63ee31e1b3 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -217,11 +217,11 @@ static void init_rc6(struct i915_pmu *pmu)
}
 }
 
-static void park_rc6(struct drm_i915_private *i915)
+static void park_rc6(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
-   pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(to_gt(i915));
+   pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(gt);
pmu->sleep_last = ktime_get_raw();
 }
 
@@ -236,16 +236,16 @@ static void __i915_pmu_maybe_start_timer(struct i915_pmu 
*pmu)
}
 }
 
-void i915_pmu_gt_parked(struct drm_i915_private *i915)
+void i915_pmu_gt_parked(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
if (!pmu->base.event_init)
return;
 
spin_lock_irq(&pmu->lock);
 
-   park_rc6(i915);
+   park_rc6(gt);
 
/*
 * Signal sampling timer to stop if only engine events are enabled and
@@ -256,9 +256,9 @@ void i915_pmu_gt_parked(struct drm_i915_private *i915)
spin_unlock_irq(&pmu->lock);
 }
 
-void i915_pmu_gt_unparked(struct drm_i915_private *i915)
+void i915_pmu_gt_unparked(struct intel_gt *gt)
 {
-   struct i915_pmu *pmu = &i915->pmu;
+   struct i915_pmu *pmu = &gt->i915->pmu;
 
if (!pmu->base.event_init)
return;
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index c30f43319a78..a686fd7ccedf 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -13,6 +13,7 @@
 #include 
 
 struct drm_i915_private;
+struct intel_gt;
 
 /*
  * Non-engine events that we need to track enabled-disabled transition and
@@ -151,15 +152,15 @@ int i915_pmu_init(void);
 void i915_pmu_exit(void);
 void i915_pmu_register(struct drm_i915_private *i915);
 void i915_pmu_unregister(struct drm_i915_private *i915);
-void i915_pmu_gt_parked(struct drm_i915_private *i915);
-void i915_pmu_gt_unparked(struct drm_i915_private *i915);
+void i915_pmu_gt_parked(struct intel_gt *gt);
+void i915_pmu_gt_unparked(struct intel_gt *gt);
 #else
 static inline int i915_pmu_init(void) { return 0; }
 static inline void i915_pmu_exit(void) {}
 static inline void i915_pmu_register(struct drm_i915_private *i915) {}
 static inline void i915_pmu_unregister(struct drm_i915_private *i915) {}
-static inline void i915_pmu_gt_parked(struct drm_i915_private *i915) {}
-static inline void i915_pmu_gt_unparked(struct drm_i915_private *i915) {}
+static inline void i915_pmu_gt_parked(struct intel_gt *gt) {}
+static inline void i915_pmu_gt_unparked(struct intel_gt *gt) {}
 #endif
 
 #endif
-- 
2.36.1



[Intel-gfx] [PATCH 4/6] drm/i915/pmu: Add reference counting to the sampling timer

2023-05-14 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

We do not want to have timers per tile and waste CPU cycles and energy via
multiple wake-up sources, for a relatively un-important task of PMU
sampling, so keeping a single timer works well. But we also do not want
the first GT which goes idle to turn off the timer.

Add some reference counting, via a mask of unparked GTs, to solve this.

v2: Drop the check for unparked in i915_sample (Ashutosh)

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 9 +++--
 drivers/gpu/drm/i915/i915_pmu.h | 4 
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 2b63ee31e1b3..725b01b00775 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -251,7 +251,9 @@ void i915_pmu_gt_parked(struct intel_gt *gt)
 * Signal sampling timer to stop if only engine events are enabled and
 * GPU went idle.
 */
-   pmu->timer_enabled = pmu_needs_timer(pmu, false);
+   pmu->unparked &= ~BIT(gt->info.id);
+   if (pmu->unparked == 0)
+   pmu->timer_enabled = pmu_needs_timer(pmu, false);
 
spin_unlock_irq(&pmu->lock);
 }
@@ -268,7 +270,10 @@ void i915_pmu_gt_unparked(struct intel_gt *gt)
/*
 * Re-enable sampling timer when GPU goes active.
 */
-   __i915_pmu_maybe_start_timer(pmu);
+   if (pmu->unparked == 0)
+   __i915_pmu_maybe_start_timer(pmu);
+
+   pmu->unparked |= BIT(gt->info.id);
 
spin_unlock_irq(&pmu->lock);
 }
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index a686fd7ccedf..3a811266ac6a 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -76,6 +76,10 @@ struct i915_pmu {
 * @lock: Lock protecting enable mask and ref count handling.
 */
spinlock_t lock;
+   /**
+* @unparked: GT unparked mask.
+*/
+   unsigned int unparked;
/**
 * @timer: Timer for internal i915 PMU sampling.
 */
-- 
2.36.1



[Intel-gfx] [PATCH 6/6] drm/i915/pmu: Export counters from all tiles

2023-05-14 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Start exporting frequency and RC6 counters from all tiles.

Existing counters keep their names and config values, and new ones use the
namespace added in the previous patch, with "-gtN" appended to their
names.

The interrupts counter is the odd one out. Because it is a global device
counter (not only GT), we choose not to add per-tile versions for now.

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Aravind Iddamsetty 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 82 ++---
 1 file changed, 55 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index b3dd9e51c5cc..12345fd0b2cd 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -927,11 +927,20 @@ static const struct attribute_group 
i915_pmu_cpumask_attr_group = {
.attrs = i915_cpumask_attrs,
 };
 
-#define __event(__config, __name, __unit) \
+#define __event(__counter, __name, __unit) \
 { \
-   .config = (__config), \
+   .counter = (__counter), \
.name = (__name), \
.unit = (__unit), \
+   .global = false, \
+}
+
+#define __global_event(__counter, __name, __unit) \
+{ \
+   .counter = (__counter), \
+   .name = (__name), \
+   .unit = (__unit), \
+   .global = true, \
 }
 
 #define __engine_event(__sample, __name) \
@@ -970,15 +979,16 @@ create_event_attributes(struct i915_pmu *pmu)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
static const struct {
-   u64 config;
+   unsigned int counter;
const char *name;
const char *unit;
+   bool global;
} events[] = {
-   __event(I915_PMU_ACTUAL_FREQUENCY, "actual-frequency", "M"),
-   __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", 
"M"),
-   __event(I915_PMU_INTERRUPTS, "interrupts", NULL),
-   __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"),
-   __event(I915_PMU_SOFTWARE_GT_AWAKE_TIME, 
"software-gt-awake-time", "ns"),
+   __event(0, "actual-frequency", "M"),
+   __event(1, "requested-frequency", "M"),
+   __global_event(2, "interrupts", NULL),
+   __event(3, "rc6-residency", "ns"),
+   __event(4, "software-gt-awake-time", "ns"),
};
static const struct {
enum drm_i915_pmu_engine_sample sample;
@@ -993,12 +1003,17 @@ create_event_attributes(struct i915_pmu *pmu)
struct i915_ext_attribute *i915_attr = NULL, *i915_iter;
struct attribute **attr = NULL, **attr_iter;
struct intel_engine_cs *engine;
-   unsigned int i;
+   struct intel_gt *gt;
+   unsigned int i, j;
 
/* Count how many counters we will be exposing. */
-   for (i = 0; i < ARRAY_SIZE(events); i++) {
-   if (!config_status(i915, events[i].config))
-   count++;
+   for_each_gt(gt, i915, j) {
+   for (i = 0; i < ARRAY_SIZE(events); i++) {
+   u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+
+   if (!config_status(i915, config))
+   count++;
+   }
}
 
for_each_uabi_engine(engine, i915) {
@@ -1028,26 +1043,39 @@ create_event_attributes(struct i915_pmu *pmu)
attr_iter = attr;
 
/* Initialize supported non-engine counters. */
-   for (i = 0; i < ARRAY_SIZE(events); i++) {
-   char *str;
-
-   if (config_status(i915, events[i].config))
-   continue;
-
-   str = kstrdup(events[i].name, GFP_KERNEL);
-   if (!str)
-   goto err;
+   for_each_gt(gt, i915, j) {
+   for (i = 0; i < ARRAY_SIZE(events); i++) {
+   u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+   char *str;
 
-   *attr_iter++ = &i915_iter->attr.attr;
-   i915_iter = add_i915_attr(i915_iter, str, events[i].config);
+   if (config_status(i915, config))
+   continue;
 
-   if (events[i].unit) {
-   str = kasprintf(GFP_KERNEL, "%s.unit", events[i].name);
+   if (events[i].global || !HAS_EXTRA_GT_LIST(i915))
+   str = kstrdup(events[i].name, GFP_KERNEL);
+   else
+   str = kasprintf(GFP_KERNEL, "%s-gt%u",
+   events[

[Intel-gfx] [PATCH 5/6] drm/i915/pmu: Prepare for multi-tile non-engine counters

2023-05-14 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Reserve some bits in the counter config namespace which will carry the
tile id and prepare the code to handle this.

No per tile counters have been added yet.

v2:
- Fix checkpatch issues
- Use 4 bits for gt id in non-engine counters. Drop FIXME.
- Set MAX GTs to 4. Drop FIXME.

v3: (Ashutosh, Tvrtko)
- Drop BUG_ON that would never fire
- Make enable u64
- Pull in some code from next patch

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
Reviewed-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/i915_pmu.c | 148 +++-
 drivers/gpu/drm/i915/i915_pmu.h |  11 ++-
 include/uapi/drm/i915_drm.h |  17 +++-
 3 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 725b01b00775..b3dd9e51c5cc 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -56,11 +56,21 @@ static bool is_engine_config(u64 config)
return config < __I915_PMU_OTHER(0);
 }
 
+static unsigned int config_gt_id(const u64 config)
+{
+   return config >> __I915_PMU_GT_SHIFT;
+}
+
+static u64 config_counter(const u64 config)
+{
+   return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
+}
+
 static unsigned int other_bit(const u64 config)
 {
unsigned int val;
 
-   switch (config) {
+   switch (config_counter(config)) {
case I915_PMU_ACTUAL_FREQUENCY:
val =  __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
break;
@@ -78,7 +88,9 @@ static unsigned int other_bit(const u64 config)
return -1;
}
 
-   return I915_ENGINE_SAMPLE_COUNT + val;
+   return I915_ENGINE_SAMPLE_COUNT +
+  config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
+  val;
 }
 
 static unsigned int config_bit(const u64 config)
@@ -104,10 +116,22 @@ static unsigned int event_bit(struct perf_event *event)
return config_bit(event->attr.config);
 }
 
+static u64 frequency_enabled_mask(void)
+{
+   unsigned int i;
+   u64 mask = 0;
+
+   for (i = 0; i < I915_PMU_MAX_GTS; i++)
+   mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
+   config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
+
+   return mask;
+}
+
 static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
-   u32 enable;
+   u64 enable;
 
/*
 * Only some counters need the sampling timer.
@@ -120,9 +144,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool 
gpu_active)
 * Mask out all the ones which do not need the timer, or in
 * other words keep all the ones that could need the timer.
 */
-   enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
- config_mask(I915_PMU_REQUESTED_FREQUENCY) |
- ENGINE_SAMPLE_MASK;
+   enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;
 
/*
 * When the GPU is idle per-engine counters do not need to be
@@ -164,9 +186,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
 }
 
+static unsigned int
+__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
+
+   GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
+
+   return idx;
+}
+
+static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
+}
+
+static void
+store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
+}
+
+static void
+add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, 
u32 mul)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, 
mul);
+}
+
 static u64 get_rc6(struct intel_gt *gt)
 {
struct drm_i915_private *i915 = gt->i915;
+   const unsigned int gt_id = gt->info.id;
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
bool awake = false;
@@ -181,7 +231,7 @@ static u64 get_rc6(struct intel_gt *gt)
spin_lock_irqsave(&pmu->lock, flags);
 
if (awake) {
-   pmu->sample[__I915_SAMPLE_RC6].cur = val;
+   store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
} else {
/*
 * We think we are runtime suspended.
@@ -190,14 +240,14 @@ static u64 get_rc6(struct intel_gt *gt)
 * on top of the last known real value, as the approximated RC6
 * counter value.
 */
-   val = ktime_since_raw(pmu->sleep_last);
-   val += pmu->sample[__I915_SAMPLE_RC6].cur;
+   val = kt

[Intel-gfx] [PATCH 2/6] drm/i915/pmu: Skip sampling engines with no enabled counters

2023-05-14 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

As we have more and more engines, do not waste time sampling the ones
no one is monitoring.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 67fa6cd77529..ba769f7fc385 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -339,6 +339,9 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
return;
 
for_each_engine(engine, gt, id) {
+   if (!engine->pmu.enable)
+   continue;
+
if (!intel_engine_pm_get_if_awake(engine))
continue;
 
-- 
2.36.1



[Intel-gfx] [PATCH 1/6] drm/i915/pmu: Support PMU for all engines

2023-05-14 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Given how the metrics are already exported, we also need to run sampling
over engines from all GTs.

The problem of GT frequencies is left for later.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Umesh Nerlige Ramappa 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 7ece883a7d95..67fa6cd77529 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -10,6 +10,7 @@
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_regs.h"
 #include "gt/intel_engine_user.h"
+#include "gt/intel_gt.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_regs.h"
 #include "gt/intel_rc6.h"
@@ -414,8 +415,9 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
struct drm_i915_private *i915 =
container_of(hrtimer, struct drm_i915_private, pmu.timer);
struct i915_pmu *pmu = &i915->pmu;
-   struct intel_gt *gt = to_gt(i915);
unsigned int period_ns;
+   struct intel_gt *gt;
+   unsigned int i;
ktime_t now;
 
if (!READ_ONCE(pmu->timer_enabled))
@@ -431,8 +433,13 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
*hrtimer)
 * grabbing the forcewake. However the potential error from timer call-
 * back delay greatly dominates this so we keep it simple.
 */
-   engines_sample(gt, period_ns);
-   frequency_sample(gt, period_ns);
+
+   for_each_gt(gt, i915, i) {
+   engines_sample(gt, period_ns);
+
+   if (i == 0) /* FIXME */
+   frequency_sample(gt, period_ns);
+   }
 
hrtimer_forward(hrtimer, now, ns_to_ktime(PERIOD));
 
-- 
2.36.1



Re: [Intel-gfx] [PATCH 5/6] drm/i915/pmu: Prepare for multi-tile non-engine counters

2023-05-14 Thread Umesh Nerlige Ramappa

On Fri, May 12, 2023 at 09:41:56PM -0700, Dixit, Ashutosh wrote:

On Fri, 12 May 2023 18:55:44 -0700, Umesh Nerlige Ramappa wrote:


From: Tvrtko Ursulin 

Reserve some bits in the counter config namespace which will carry the
tile id and prepare the code to handle this.

No per tile counters have been added yet.

v2:
- Fix checkpatch issues
- Use 4 bits for gt id in non-engine counters. Drop FIXME.
- Set MAX GTs to 4. Drop FIXME.

v3: (Ashutosh, Tvrtko)
- Drop BUG_ON that would never fire
- Make enable u64
- Pull in some code from next patch


Just a reminder in case you want to do something like:

#define I915_PMU_MAX_GTS I915_MAX_GT

Or replace I915_PMU_MAX_GTS by I915_MAX_GT.


Hmmm, I thought I had sent out a separate response about that in the
previous series, but I am not able to locate it, strange. Anyway, I did
try that and ran into the issues that Tvrtko was mentioning w.r.t. header
dependencies. I think i915_drv.h includes intel_engine.h, which in turn
includes i915_pmu.h, so including i915_drv.h in i915_pmu.h for the
definition of I915_MAX_GT just wreaks havoc during compilation.


Hence, I gave up on that and am using whatever already existed in
Tvrtko's patch.


Thanks,
Umesh



But otherwise v3 LGTM:

Reviewed-by: Ashutosh Dixit 


Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 148 +++-
 drivers/gpu/drm/i915/i915_pmu.h |  11 ++-
 include/uapi/drm/i915_drm.h |  17 +++-
 3 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 725b01b00775..b3dd9e51c5cc 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -56,11 +56,21 @@ static bool is_engine_config(u64 config)
return config < __I915_PMU_OTHER(0);
 }

+static unsigned int config_gt_id(const u64 config)
+{
+   return config >> __I915_PMU_GT_SHIFT;
+}
+
+static u64 config_counter(const u64 config)
+{
+   return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
+}
+
 static unsigned int other_bit(const u64 config)
 {
unsigned int val;

-   switch (config) {
+   switch (config_counter(config)) {
case I915_PMU_ACTUAL_FREQUENCY:
val =  __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
break;
@@ -78,7 +88,9 @@ static unsigned int other_bit(const u64 config)
return -1;
}

-   return I915_ENGINE_SAMPLE_COUNT + val;
+   return I915_ENGINE_SAMPLE_COUNT +
+  config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
+  val;
 }

 static unsigned int config_bit(const u64 config)
@@ -104,10 +116,22 @@ static unsigned int event_bit(struct perf_event *event)
return config_bit(event->attr.config);
 }

+static u64 frequency_enabled_mask(void)
+{
+   unsigned int i;
+   u64 mask = 0;
+
+   for (i = 0; i < I915_PMU_MAX_GTS; i++)
+   mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
+   config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
+
+   return mask;
+}
+
 static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
-   u32 enable;
+   u64 enable;

/*
 * Only some counters need the sampling timer.
@@ -120,9 +144,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool 
gpu_active)
 * Mask out all the ones which do not need the timer, or in
 * other words keep all the ones that could need the timer.
 */
-   enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
- config_mask(I915_PMU_REQUESTED_FREQUENCY) |
- ENGINE_SAMPLE_MASK;
+   enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;

/*
 * When the GPU is idle per-engine counters do not need to be
@@ -164,9 +186,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
 }

+static unsigned int
+__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
+
+   GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
+
+   return idx;
+}
+
+static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
+}
+
+static void
+store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
+}
+
+static void
+add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, 
u32 mul)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, 
mul);
+}
+
 static u64 get_rc6(struct intel_gt *gt)
 {
struct drm_i915_private *i915 = gt->i915;
+   const unsigned int gt_id 

[Intel-gfx] [PATCH 5/6] drm/i915/pmu: Prepare for multi-tile non-engine counters

2023-05-12 Thread Umesh Nerlige Ramappa
From: Tvrtko Ursulin 

Reserve some bits in the counter config namespace which will carry the
tile id and prepare the code to handle this.

No per tile counters have been added yet.

v2:
- Fix checkpatch issues
- Use 4 bits for gt id in non-engine counters. Drop FIXME.
- Set MAX GTs to 4. Drop FIXME.

v3: (Ashutosh, Tvrtko)
- Drop BUG_ON that would never fire
- Make enable u64
- Pull in some code from next patch

Signed-off-by: Tvrtko Ursulin 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/i915_pmu.c | 148 +++-
 drivers/gpu/drm/i915/i915_pmu.h |  11 ++-
 include/uapi/drm/i915_drm.h |  17 +++-
 3 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 725b01b00775..b3dd9e51c5cc 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -56,11 +56,21 @@ static bool is_engine_config(u64 config)
return config < __I915_PMU_OTHER(0);
 }
 
+static unsigned int config_gt_id(const u64 config)
+{
+   return config >> __I915_PMU_GT_SHIFT;
+}
+
+static u64 config_counter(const u64 config)
+{
+   return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
+}
+
 static unsigned int other_bit(const u64 config)
 {
unsigned int val;
 
-   switch (config) {
+   switch (config_counter(config)) {
case I915_PMU_ACTUAL_FREQUENCY:
val =  __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
break;
@@ -78,7 +88,9 @@ static unsigned int other_bit(const u64 config)
return -1;
}
 
-   return I915_ENGINE_SAMPLE_COUNT + val;
+   return I915_ENGINE_SAMPLE_COUNT +
+  config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
+  val;
 }
 
 static unsigned int config_bit(const u64 config)
@@ -104,10 +116,22 @@ static unsigned int event_bit(struct perf_event *event)
return config_bit(event->attr.config);
 }
 
+static u64 frequency_enabled_mask(void)
+{
+   unsigned int i;
+   u64 mask = 0;
+
+   for (i = 0; i < I915_PMU_MAX_GTS; i++)
+   mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
+   config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
+
+   return mask;
+}
+
 static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
 {
struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
-   u32 enable;
+   u64 enable;
 
/*
 * Only some counters need the sampling timer.
@@ -120,9 +144,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool 
gpu_active)
 * Mask out all the ones which do not need the timer, or in
 * other words keep all the ones that could need the timer.
 */
-   enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
- config_mask(I915_PMU_REQUESTED_FREQUENCY) |
- ENGINE_SAMPLE_MASK;
+   enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;
 
/*
 * When the GPU is idle per-engine counters do not need to be
@@ -164,9 +186,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
 }
 
+static unsigned int
+__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
+
+   GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
+
+   return idx;
+}
+
+static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
+{
+   return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
+}
+
+static void
+store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
+}
+
+static void
+add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, 
u32 mul)
+{
+   pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, 
mul);
+}
+
 static u64 get_rc6(struct intel_gt *gt)
 {
struct drm_i915_private *i915 = gt->i915;
+   const unsigned int gt_id = gt->info.id;
struct i915_pmu *pmu = &i915->pmu;
unsigned long flags;
bool awake = false;
@@ -181,7 +231,7 @@ static u64 get_rc6(struct intel_gt *gt)
spin_lock_irqsave(&pmu->lock, flags);
 
if (awake) {
-   pmu->sample[__I915_SAMPLE_RC6].cur = val;
+   store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
} else {
/*
 * We think we are runtime suspended.
@@ -190,14 +240,14 @@ static u64 get_rc6(struct intel_gt *gt)
 * on top of the last known real value, as the approximated RC6
 * counter value.
 */
-   val = ktime_since_raw(pmu->sleep_last);
-   val += pmu->sample[__I915_SAMPLE_RC6].cur;
+   val = ktime_since_raw(pmu->slee

[Intel-gfx] [PATCH 0/6] Add MTL PMU support for multi-gt

2023-05-12 Thread Umesh Nerlige Ramappa
With MTL, frequency and rc6 counters are specific to a gt. Export these
counters to user space via gt-specific events.

v2: Review comments (Ashutosh, Tvrtko)

Signed-off-by: Umesh Nerlige Ramappa 

Tvrtko Ursulin (6):
  drm/i915/pmu: Support PMU for all engines
  drm/i915/pmu: Skip sampling engines with no enabled counters
  drm/i915/pmu: Transform PMU parking code to be GT based
  drm/i915/pmu: Add reference counting to the sampling timer
  drm/i915/pmu: Prepare for multi-tile non-engine counters
  drm/i915/pmu: Export counters from all tiles

 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   4 +-
 drivers/gpu/drm/i915/i915_pmu.c   | 263 ++
 drivers/gpu/drm/i915/i915_pmu.h   |  24 ++-
 include/uapi/drm/i915_drm.h   |  17 +-
 4 files changed, 219 insertions(+), 89 deletions(-)

-- 
2.36.1



  1   2   3   4   5   6   7   8   9   10   >