Everytime we take a wakeref, record the stack trace of where it was
taken; clearing the set if we ever drop back to no owners. For debugging
a rpm leak, we can look at all the current wakerefs and check if they
have a matching rpm_put.

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Kconfig.debug            |  13 ++
 drivers/gpu/drm/i915/i915_drv.c               |  10 +-
 drivers/gpu/drm/i915/i915_drv.h               |   7 +
 drivers/gpu/drm/i915/intel_drv.h              |  32 +--
 drivers/gpu/drm/i915/intel_runtime_pm.c       | 219 +++++++++++++++---
 .../gpu/drm/i915/selftests/mock_gem_device.c  |   8 +-
 6 files changed, 241 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig.debug 
b/drivers/gpu/drm/i915/Kconfig.debug
index 459f8f88a34c..ed572a454e01 100644
--- a/drivers/gpu/drm/i915/Kconfig.debug
+++ b/drivers/gpu/drm/i915/Kconfig.debug
@@ -39,6 +39,19 @@ config DRM_I915_DEBUG
 
           If in doubt, say "N".
 
+config DRM_I915_DEBUG_RPM
+        bool "Insert extra checks into the runtime pm internals"
+        depends on DRM_I915
+        default n
+        select STACKDEPOT
+        help
+          Enable extra sanity checks (including BUGs) along the runtime pm
+          paths that may slow the system down and if hit hang the machine.
+
+          Recommended for driver developers only.
+
+          If in doubt, say "N".
+
 config DRM_I915_DEBUG_GEM
         bool "Insert extra checks into the GEM internals"
         default n
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index bc072bef8f0a..b2b3c85299e6 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1315,6 +1315,8 @@ static void i915_welcome_messages(struct drm_i915_private 
*dev_priv)
                DRM_INFO("DRM_I915_DEBUG enabled\n");
        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
                DRM_INFO("DRM_I915_DEBUG_GEM enabled\n");
+       if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_RPM))
+               DRM_INFO("DRM_I915_DEBUG_RPM enabled\n");
 }
 
 static struct drm_i915_private *
@@ -1376,6 +1378,8 @@ int i915_driver_load(struct pci_dev *pdev, const struct 
pci_device_id *ent)
        if (!dev_priv)
                return -ENOMEM;
 
+       intel_runtime_pm_init_early(dev_priv);
+
        intel_runtime_pm_get(dev_priv);
        intel_runtime_pm_enable(dev_priv);
 
@@ -1478,7 +1482,7 @@ void i915_driver_unload(struct drm_device *dev)
 
        intel_runtime_pm_disable(dev_priv);
        intel_display_power_put(dev_priv, POWER_DOMAIN_INIT);
-       WARN_ON(atomic_read(&dev_priv->runtime_pm.wakeref_count));
+       intel_runtime_pm_cleanup(dev_priv);
 }
 
 static void i915_driver_release(struct drm_device *dev)
@@ -1686,6 +1690,8 @@ static int i915_drm_suspend_late(struct drm_device *dev, 
bool hibernation)
 
 out:
        enable_rpm_wakeref_asserts(dev_priv);
+       if (!dev_priv->uncore.user_forcewake.count)
+               intel_runtime_pm_cleanup(dev_priv);
 
        return ret;
 }
@@ -2646,7 +2652,7 @@ static int intel_runtime_suspend(struct device *kdev)
        }
 
        enable_rpm_wakeref_asserts(dev_priv);
-       WARN_ON_ONCE(atomic_read(&dev_priv->runtime_pm.wakeref_count));
+       intel_runtime_pm_cleanup(dev_priv);
 
        if (intel_uncore_arm_unclaimed_mmio_detection(dev_priv))
                DRM_ERROR("Unclaimed access detected prior to suspending\n");
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5fa13887b911..6f2b81f135cf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -45,6 +45,7 @@
 #include <linux/pm_qos.h>
 #include <linux/reservation.h>
 #include <linux/shmem_fs.h>
+#include <linux/stackdepot.h>
 
 #include <drm/drmP.h>
 #include <drm/intel-gtt.h>
@@ -1287,6 +1288,12 @@ struct i915_runtime_pm {
        atomic_t wakeref_count;
        bool suspended;
        bool irqs_enabled;
+
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_RPM)
+       spinlock_t debug_lock;
+       depot_stack_handle_t *debug_owners;
+       unsigned long debug_count;
+#endif
 };
 
 enum intel_pipe_crc_source {
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 364fc2504fa4..5942593f1732 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -1950,6 +1950,7 @@ void intel_psr_short_pulse(struct intel_dp *intel_dp);
 int intel_psr_wait_for_idle(const struct intel_crtc_state *new_crtc_state);
 
 /* intel_runtime_pm.c */
+void intel_runtime_pm_init_early(struct drm_i915_private *dev_priv);
 int intel_power_domains_init(struct drm_i915_private *);
 void intel_power_domains_cleanup(struct drm_i915_private *dev_priv);
 void intel_power_domains_init_hw(struct drm_i915_private *dev_priv, bool 
resume);
@@ -1960,6 +1961,7 @@ void bxt_display_core_init(struct drm_i915_private 
*dev_priv, bool resume);
 void bxt_display_core_uninit(struct drm_i915_private *dev_priv);
 void intel_runtime_pm_enable(struct drm_i915_private *dev_priv);
 void intel_runtime_pm_disable(struct drm_i915_private *dev_priv);
+void intel_runtime_pm_cleanup(struct drm_i915_private *dev_priv);
 const char *
 intel_display_power_domain_str(enum intel_display_power_domain domain);
 
@@ -1977,23 +1979,23 @@ void icl_dbuf_slices_update(struct drm_i915_private 
*dev_priv,
                            u8 req_slices);
 
 static inline void
-assert_rpm_device_not_suspended(struct drm_i915_private *dev_priv)
+assert_rpm_device_not_suspended(struct drm_i915_private *i915)
 {
-       WARN_ONCE(dev_priv->runtime_pm.suspended,
+       WARN_ONCE(i915->runtime_pm.suspended,
                  "Device suspended during HW access\n");
 }
 
 static inline void
-assert_rpm_wakelock_held(struct drm_i915_private *dev_priv)
+assert_rpm_wakelock_held(struct drm_i915_private *i915)
 {
-       assert_rpm_device_not_suspended(dev_priv);
-       WARN_ONCE(!atomic_read(&dev_priv->runtime_pm.wakeref_count),
+       assert_rpm_device_not_suspended(i915);
+       WARN_ONCE(!atomic_read(&i915->runtime_pm.wakeref_count),
                  "RPM wakelock ref not held during HW access");
 }
 
 /**
  * disable_rpm_wakeref_asserts - disable the RPM assert checks
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * This function disable asserts that check if we hold an RPM wakelock
  * reference, while keeping the device-not-suspended checks still enabled.
@@ -2010,14 +2012,14 @@ assert_rpm_wakelock_held(struct drm_i915_private 
*dev_priv)
  * enable_rpm_wakeref_asserts().
  */
 static inline void
-disable_rpm_wakeref_asserts(struct drm_i915_private *dev_priv)
+disable_rpm_wakeref_asserts(struct drm_i915_private *i915)
 {
-       atomic_inc(&dev_priv->runtime_pm.wakeref_count);
+       atomic_inc(&i915->runtime_pm.wakeref_count);
 }
 
 /**
  * enable_rpm_wakeref_asserts - re-enable the RPM assert checks
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * This function re-enables the RPM assert checks after disabling them with
  * disable_rpm_wakeref_asserts. It's meant to be used only in special
@@ -2027,15 +2029,15 @@ disable_rpm_wakeref_asserts(struct drm_i915_private 
*dev_priv)
  * disable_rpm_wakeref_asserts().
  */
 static inline void
-enable_rpm_wakeref_asserts(struct drm_i915_private *dev_priv)
+enable_rpm_wakeref_asserts(struct drm_i915_private *i915)
 {
-       atomic_dec(&dev_priv->runtime_pm.wakeref_count);
+       atomic_dec(&i915->runtime_pm.wakeref_count);
 }
 
-void intel_runtime_pm_get(struct drm_i915_private *dev_priv);
-bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv);
-void intel_runtime_pm_get_noresume(struct drm_i915_private *dev_priv);
-void intel_runtime_pm_put(struct drm_i915_private *dev_priv);
+void intel_runtime_pm_get(struct drm_i915_private *i915);
+bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *i915);
+void intel_runtime_pm_get_noresume(struct drm_i915_private *i915);
+void intel_runtime_pm_put(struct drm_i915_private *i915);
 
 void intel_display_set_init_power(struct drm_i915_private *dev, bool enable);
 
diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c 
b/drivers/gpu/drm/i915/intel_runtime_pm.c
index c0983f0e46ac..425d885f84b6 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -49,6 +49,143 @@
  * present for a given platform.
  */
 
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_RPM)
+
+#include <linux/sort.h>
+
+#define STACKDEPTH 8
+
+static void init_intel_runtime_pm_wakeref(struct drm_i915_private *i915)
+{
+       spin_lock_init(&i915->runtime_pm.debug_lock);
+}
+
+static noinline void
+track_intel_runtime_pm_wakeref(struct drm_i915_private *i915)
+{
+       struct i915_runtime_pm *rpm = &i915->runtime_pm;
+       unsigned long entries[STACKDEPTH];
+       struct stack_trace trace = {
+               .entries = entries,
+               .max_entries = ARRAY_SIZE(entries),
+               .skip = 1
+       };
+       unsigned long flags;
+       depot_stack_handle_t stack, *stacks;
+
+       if (!HAS_RUNTIME_PM(i915))
+               return;
+
+       save_stack_trace(&trace);
+       if (trace.nr_entries &&
+           trace.entries[trace.nr_entries - 1] == ULONG_MAX)
+               trace.nr_entries--;
+
+       stack = depot_save_stack(&trace, GFP_NOWAIT | __GFP_NOWARN);
+       if (!stack)
+               return;
+
+       spin_lock_irqsave(&rpm->debug_lock, flags);
+       stacks = krealloc(rpm->debug_owners,
+                         (rpm->debug_count + 1) * sizeof(*stacks),
+                         GFP_NOWAIT | __GFP_NOWARN);
+       if (stacks) {
+               stacks[rpm->debug_count++] = stack;
+               rpm->debug_owners = stacks;
+       }
+       spin_unlock_irqrestore(&rpm->debug_lock, flags);
+}
+
+static void untrack_intel_runtime_pm_wakeref(struct drm_i915_private *i915)
+{
+       struct i915_runtime_pm *rpm = &i915->runtime_pm;
+       depot_stack_handle_t *stacks;
+       unsigned long flags;
+
+       spin_lock_irqsave(&rpm->debug_lock, flags);
+       stacks = fetch_and_zero(&rpm->debug_owners);
+       rpm->debug_count = 0;
+       spin_unlock_irqrestore(&rpm->debug_lock, flags);
+
+       kfree(stacks);
+}
+
+static int cmphandle(const void *_a, const void *_b)
+{
+       const depot_stack_handle_t * const a = _a, * const b = _b;
+
+       if (*a < *b)
+               return -1;
+       else if (*a > *b)
+               return 1;
+       else
+               return 0;
+}
+
+static void show_intel_runtime_pm_wakeref(struct drm_i915_private *i915)
+{
+       struct i915_runtime_pm *rpm = &i915->runtime_pm;
+       unsigned long entries[STACKDEPTH];
+       depot_stack_handle_t *stacks;
+       unsigned long flags, count, i;
+       char *buf;
+
+       spin_lock_irqsave(&rpm->debug_lock, flags);
+       stacks = fetch_and_zero(&rpm->debug_owners);
+       count = fetch_and_zero(&rpm->debug_count);
+       spin_unlock_irqrestore(&rpm->debug_lock, flags);
+       if (!count)
+               return;
+
+       DRM_DEBUG_DRIVER("leaked %lu wakerefs\n", count);
+
+       buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!buf)
+               goto out_stacks;
+
+       sort(stacks, count, sizeof(*stacks), cmphandle, NULL);
+
+       for (i = 0; i < count; i++) {
+               struct stack_trace trace = {
+                       .entries = entries,
+                       .max_entries = ARRAY_SIZE(entries),
+               };
+               depot_stack_handle_t stack = stacks[i];
+               unsigned long rep;
+
+               rep = 1;
+               while (i + 1 < count && stacks[i + 1] == stack)
+                       rep++, i++;
+               depot_fetch_stack(stack, &trace);
+               snprint_stack_trace(buf, PAGE_SIZE, &trace, 0);
+               DRM_DEBUG_DRIVER("wakeref x%lu at\n%s", rep, buf);
+       }
+
+       kfree(buf);
+out_stacks:
+       kfree(stacks);
+}
+
+#else
+
+static void init_intel_runtime_pm_wakeref(struct drm_i915_private *i915)
+{
+}
+
+static void track_intel_runtime_pm_wakeref(struct drm_i915_private *i915)
+{
+}
+
+static void untrack_intel_runtime_pm_wakeref(struct drm_i915_private *i915)
+{
+}
+
+static void show_intel_runtime_pm_wakeref(struct drm_i915_private *i915)
+{
+}
+
+#endif
+
 bool intel_display_power_well_is_enabled(struct drm_i915_private *dev_priv,
                                         enum i915_power_well_id power_well_id);
 
@@ -3919,7 +4056,7 @@ void intel_power_domains_verify_state(struct 
drm_i915_private *dev_priv)
 
 /**
  * intel_runtime_pm_get - grab a runtime pm reference
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * This function grabs a device-level runtime pm reference (mostly used for GEM
  * code to ensure the GTT or GT is on) and ensures that it is powered up.
@@ -3927,22 +4064,24 @@ void intel_power_domains_verify_state(struct 
drm_i915_private *dev_priv)
  * Any runtime pm reference obtained by this function must have a symmetric
  * call to intel_runtime_pm_put() to release the reference again.
  */
-void intel_runtime_pm_get(struct drm_i915_private *dev_priv)
+void intel_runtime_pm_get(struct drm_i915_private *i915)
 {
-       struct pci_dev *pdev = dev_priv->drm.pdev;
+       struct pci_dev *pdev = i915->drm.pdev;
        struct device *kdev = &pdev->dev;
        int ret;
 
        ret = pm_runtime_get_sync(kdev);
        WARN_ONCE(ret < 0, "pm_runtime_get_sync() failed: %d\n", ret);
 
-       atomic_inc(&dev_priv->runtime_pm.wakeref_count);
-       assert_rpm_wakelock_held(dev_priv);
+       atomic_inc(&i915->runtime_pm.wakeref_count);
+       assert_rpm_wakelock_held(i915);
+
+       track_intel_runtime_pm_wakeref(i915);
 }
 
 /**
  * intel_runtime_pm_get_if_in_use - grab a runtime pm reference if device in 
use
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * This function grabs a device-level runtime pm reference if the device is
  * already in use and ensures that it is powered up. It is illegal to try
@@ -3953,10 +4092,10 @@ void intel_runtime_pm_get(struct drm_i915_private 
*dev_priv)
  *
  * Returns: True if the wakeref was acquired, or False otherwise.
  */
-bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv)
+bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *i915)
 {
        if (IS_ENABLED(CONFIG_PM)) {
-               struct pci_dev *pdev = dev_priv->drm.pdev;
+               struct pci_dev *pdev = i915->drm.pdev;
                struct device *kdev = &pdev->dev;
 
                /*
@@ -3969,15 +4108,17 @@ bool intel_runtime_pm_get_if_in_use(struct 
drm_i915_private *dev_priv)
                        return false;
        }
 
-       atomic_inc(&dev_priv->runtime_pm.wakeref_count);
-       assert_rpm_wakelock_held(dev_priv);
+       atomic_inc(&i915->runtime_pm.wakeref_count);
+       assert_rpm_wakelock_held(i915);
+
+       track_intel_runtime_pm_wakeref(i915);
 
        return true;
 }
 
 /**
  * intel_runtime_pm_get_noresume - grab a runtime pm reference
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * This function grabs a device-level runtime pm reference (mostly used for GEM
  * code to ensure the GTT or GT is on).
@@ -3992,32 +4133,35 @@ bool intel_runtime_pm_get_if_in_use(struct 
drm_i915_private *dev_priv)
  * Any runtime pm reference obtained by this function must have a symmetric
  * call to intel_runtime_pm_put() to release the reference again.
  */
-void intel_runtime_pm_get_noresume(struct drm_i915_private *dev_priv)
+void intel_runtime_pm_get_noresume(struct drm_i915_private *i915)
 {
-       struct pci_dev *pdev = dev_priv->drm.pdev;
+       struct pci_dev *pdev = i915->drm.pdev;
        struct device *kdev = &pdev->dev;
 
-       assert_rpm_wakelock_held(dev_priv);
+       assert_rpm_wakelock_held(i915);
        pm_runtime_get_noresume(kdev);
 
-       atomic_inc(&dev_priv->runtime_pm.wakeref_count);
+       atomic_inc(&i915->runtime_pm.wakeref_count);
+
+       track_intel_runtime_pm_wakeref(i915);
 }
 
 /**
  * intel_runtime_pm_put - release a runtime pm reference
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * This function drops the device-level runtime pm reference obtained by
  * intel_runtime_pm_get() and might power down the corresponding
  * hardware block right away if this is the last reference.
  */
-void intel_runtime_pm_put(struct drm_i915_private *dev_priv)
+void intel_runtime_pm_put(struct drm_i915_private *i915)
 {
-       struct pci_dev *pdev = dev_priv->drm.pdev;
+       struct pci_dev *pdev = i915->drm.pdev;
        struct device *kdev = &pdev->dev;
 
-       assert_rpm_wakelock_held(dev_priv);
-       atomic_dec(&dev_priv->runtime_pm.wakeref_count);
+       assert_rpm_wakelock_held(i915);
+       if (atomic_dec_and_test(&i915->runtime_pm.wakeref_count))
+               untrack_intel_runtime_pm_wakeref(i915);
 
        pm_runtime_mark_last_busy(kdev);
        pm_runtime_put_autosuspend(kdev);
@@ -4025,7 +4169,7 @@ void intel_runtime_pm_put(struct drm_i915_private 
*dev_priv)
 
 /**
  * intel_runtime_pm_enable - enable runtime pm
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * This function enables runtime pm at the end of the driver load sequence.
  *
@@ -4033,9 +4177,9 @@ void intel_runtime_pm_put(struct drm_i915_private 
*dev_priv)
  * subordinate display power domains. That is only done on the first modeset
  * using intel_display_set_init_power().
  */
-void intel_runtime_pm_enable(struct drm_i915_private *dev_priv)
+void intel_runtime_pm_enable(struct drm_i915_private *i915)
 {
-       struct pci_dev *pdev = dev_priv->drm.pdev;
+       struct pci_dev *pdev = i915->drm.pdev;
        struct device *kdev = &pdev->dev;
 
        /*
@@ -4057,7 +4201,7 @@ void intel_runtime_pm_enable(struct drm_i915_private 
*dev_priv)
         * so the driver's own RPM reference tracking asserts also work on
         * platforms without RPM support.
         */
-       if (!HAS_RUNTIME_PM(dev_priv)) {
+       if (!HAS_RUNTIME_PM(i915)) {
                int ret;
 
                pm_runtime_dont_use_autosuspend(kdev);
@@ -4075,17 +4219,36 @@ void intel_runtime_pm_enable(struct drm_i915_private 
*dev_priv)
        pm_runtime_put_autosuspend(kdev);
 }
 
-void intel_runtime_pm_disable(struct drm_i915_private *dev_priv)
+void intel_runtime_pm_disable(struct drm_i915_private *i915)
 {
-       struct pci_dev *pdev = dev_priv->drm.pdev;
+       struct pci_dev *pdev = i915->drm.pdev;
        struct device *kdev = &pdev->dev;
 
        /* Transfer rpm ownership back to core */
-       WARN(pm_runtime_get_sync(&dev_priv->drm.pdev->dev) < 0,
+       WARN(pm_runtime_get_sync(kdev) < 0,
             "Failed to pass rpm ownership back to core\n");
 
        pm_runtime_dont_use_autosuspend(kdev);
 
-       if (!HAS_RUNTIME_PM(dev_priv))
+       if (!HAS_RUNTIME_PM(i915))
                pm_runtime_put(kdev);
 }
+
+void intel_runtime_pm_cleanup(struct drm_i915_private *i915)
+{
+       struct i915_runtime_pm *rpm = &i915->runtime_pm;
+
+       if (WARN(atomic_read(&rpm->wakeref_count),
+                "i915->runtime_pm.wakeref_count=%d on cleanup\n",
+                atomic_read(&rpm->wakeref_count))) {
+               show_intel_runtime_pm_wakeref(i915);
+               atomic_set(&rpm->wakeref_count, 0);
+       }
+
+       untrack_intel_runtime_pm_wakeref(i915);
+}
+
+void intel_runtime_pm_init_early(struct drm_i915_private *i915)
+{
+       init_intel_runtime_pm_wakeref(i915);
+}
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c 
b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 43ed8b28aeaa..0eb283e7fc96 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -154,15 +154,17 @@ struct drm_i915_private *mock_gem_device(void)
        pdev->dev.archdata.iommu = (void *)-1;
 #endif
 
+       i915 = (struct drm_i915_private *)(pdev + 1);
+       pci_set_drvdata(pdev, i915);
+
+       intel_runtime_pm_init_early(i915);
+
        dev_pm_domain_set(&pdev->dev, &pm_domain);
        pm_runtime_enable(&pdev->dev);
        pm_runtime_dont_use_autosuspend(&pdev->dev);
        if (pm_runtime_enabled(&pdev->dev))
                WARN_ON(pm_runtime_get_sync(&pdev->dev));
 
-       i915 = (struct drm_i915_private *)(pdev + 1);
-       pci_set_drvdata(pdev, i915);
-
        err = drm_dev_init(&i915->drm, &mock_driver, &pdev->dev);
        if (err) {
                pr_err("Failed to initialise mock GEM device: err=%d\n", err);
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to