Everytime we take a wakeref, record the stack trace of where it was taken; clearing the set if we ever drop back to no owners. For debugging a rpm leak, we can look at all the current wakerefs and check if they have a matching rpm_put.
Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> --- drivers/gpu/drm/i915/Kconfig.debug | 13 ++ drivers/gpu/drm/i915/i915_drv.c | 10 +- drivers/gpu/drm/i915/i915_drv.h | 7 + drivers/gpu/drm/i915/intel_drv.h | 32 +-- drivers/gpu/drm/i915/intel_runtime_pm.c | 219 +++++++++++++++--- .../gpu/drm/i915/selftests/mock_gem_device.c | 8 +- 6 files changed, 241 insertions(+), 48 deletions(-) diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug index 459f8f88a34c..ed572a454e01 100644 --- a/drivers/gpu/drm/i915/Kconfig.debug +++ b/drivers/gpu/drm/i915/Kconfig.debug @@ -39,6 +39,19 @@ config DRM_I915_DEBUG If in doubt, say "N". +config DRM_I915_DEBUG_RPM + bool "Insert extra checks into the runtime pm internals" + depends on DRM_I915 + default n + select STACKDEPOT + help + Enable extra sanity checks (including BUGs) along the runtime pm + paths that may slow the system down and if hit hang the machine. + + Recommended for driver developers only. + + If in doubt, say "N". + config DRM_I915_DEBUG_GEM bool "Insert extra checks into the GEM internals" default n diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index bc072bef8f0a..b2b3c85299e6 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1315,6 +1315,8 @@ static void i915_welcome_messages(struct drm_i915_private *dev_priv) DRM_INFO("DRM_I915_DEBUG enabled\n"); if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) DRM_INFO("DRM_I915_DEBUG_GEM enabled\n"); + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_RPM)) + DRM_INFO("DRM_I915_DEBUG_RPM enabled\n"); } static struct drm_i915_private * @@ -1376,6 +1378,8 @@ int i915_driver_load(struct pci_dev *pdev, const struct pci_device_id *ent) if (!dev_priv) return -ENOMEM; + intel_runtime_pm_init_early(dev_priv); + intel_runtime_pm_get(dev_priv); intel_runtime_pm_enable(dev_priv); @@ -1478,7 +1482,7 @@ void i915_driver_unload(struct drm_device *dev) intel_runtime_pm_disable(dev_priv); intel_display_power_put(dev_priv, POWER_DOMAIN_INIT); - WARN_ON(atomic_read(&dev_priv->runtime_pm.wakeref_count)); + intel_runtime_pm_cleanup(dev_priv); } static void i915_driver_release(struct drm_device *dev) @@ -1686,6 +1690,8 @@ static int i915_drm_suspend_late(struct drm_device *dev, bool hibernation) out: enable_rpm_wakeref_asserts(dev_priv); + if (!dev_priv->uncore.user_forcewake.count) + intel_runtime_pm_cleanup(dev_priv); return ret; } @@ -2646,7 +2652,7 @@ static int intel_runtime_suspend(struct device *kdev) } enable_rpm_wakeref_asserts(dev_priv); - WARN_ON_ONCE(atomic_read(&dev_priv->runtime_pm.wakeref_count)); + intel_runtime_pm_cleanup(dev_priv); if (intel_uncore_arm_unclaimed_mmio_detection(dev_priv)) DRM_ERROR("Unclaimed access detected prior to suspending\n"); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 5fa13887b911..6f2b81f135cf 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -45,6 +45,7 @@ #include <linux/pm_qos.h> #include <linux/reservation.h> #include <linux/shmem_fs.h> +#include <linux/stackdepot.h> #include <drm/drmP.h> #include <drm/intel-gtt.h> @@ -1287,6 +1288,12 @@ struct i915_runtime_pm { atomic_t wakeref_count; bool suspended; bool irqs_enabled; + +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_RPM) + spinlock_t debug_lock; + depot_stack_handle_t *debug_owners; + unsigned long debug_count; +#endif }; enum intel_pipe_crc_source { diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 364fc2504fa4..5942593f1732 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1950,6 +1950,7 @@ void intel_psr_short_pulse(struct intel_dp *intel_dp); int intel_psr_wait_for_idle(const struct intel_crtc_state *new_crtc_state); /* intel_runtime_pm.c */ +void intel_runtime_pm_init_early(struct drm_i915_private *dev_priv); int intel_power_domains_init(struct drm_i915_private *); void intel_power_domains_cleanup(struct drm_i915_private *dev_priv); void intel_power_domains_init_hw(struct drm_i915_private *dev_priv, bool resume); @@ -1960,6 +1961,7 @@ void bxt_display_core_init(struct drm_i915_private *dev_priv, bool resume); void bxt_display_core_uninit(struct drm_i915_private *dev_priv); void intel_runtime_pm_enable(struct drm_i915_private *dev_priv); void intel_runtime_pm_disable(struct drm_i915_private *dev_priv); +void intel_runtime_pm_cleanup(struct drm_i915_private *dev_priv); const char * intel_display_power_domain_str(enum intel_display_power_domain domain); @@ -1977,23 +1979,23 @@ void icl_dbuf_slices_update(struct drm_i915_private *dev_priv, u8 req_slices); static inline void -assert_rpm_device_not_suspended(struct drm_i915_private *dev_priv) +assert_rpm_device_not_suspended(struct drm_i915_private *i915) { - WARN_ONCE(dev_priv->runtime_pm.suspended, + WARN_ONCE(i915->runtime_pm.suspended, "Device suspended during HW access\n"); } static inline void -assert_rpm_wakelock_held(struct drm_i915_private *dev_priv) +assert_rpm_wakelock_held(struct drm_i915_private *i915) { - assert_rpm_device_not_suspended(dev_priv); - WARN_ONCE(!atomic_read(&dev_priv->runtime_pm.wakeref_count), + assert_rpm_device_not_suspended(i915); + WARN_ONCE(!atomic_read(&i915->runtime_pm.wakeref_count), "RPM wakelock ref not held during HW access"); } /** * disable_rpm_wakeref_asserts - disable the RPM assert checks - * @dev_priv: i915 device instance + * @i915: i915 device instance * * This function disable asserts that check if we hold an RPM wakelock * reference, while keeping the device-not-suspended checks still enabled. @@ -2010,14 +2012,14 @@ assert_rpm_wakelock_held(struct drm_i915_private *dev_priv) * enable_rpm_wakeref_asserts(). */ static inline void -disable_rpm_wakeref_asserts(struct drm_i915_private *dev_priv) +disable_rpm_wakeref_asserts(struct drm_i915_private *i915) { - atomic_inc(&dev_priv->runtime_pm.wakeref_count); + atomic_inc(&i915->runtime_pm.wakeref_count); } /** * enable_rpm_wakeref_asserts - re-enable the RPM assert checks - * @dev_priv: i915 device instance + * @i915: i915 device instance * * This function re-enables the RPM assert checks after disabling them with * disable_rpm_wakeref_asserts. It's meant to be used only in special @@ -2027,15 +2029,15 @@ disable_rpm_wakeref_asserts(struct drm_i915_private *dev_priv) * disable_rpm_wakeref_asserts(). */ static inline void -enable_rpm_wakeref_asserts(struct drm_i915_private *dev_priv) +enable_rpm_wakeref_asserts(struct drm_i915_private *i915) { - atomic_dec(&dev_priv->runtime_pm.wakeref_count); + atomic_dec(&i915->runtime_pm.wakeref_count); } -void intel_runtime_pm_get(struct drm_i915_private *dev_priv); -bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv); -void intel_runtime_pm_get_noresume(struct drm_i915_private *dev_priv); -void intel_runtime_pm_put(struct drm_i915_private *dev_priv); +void intel_runtime_pm_get(struct drm_i915_private *i915); +bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *i915); +void intel_runtime_pm_get_noresume(struct drm_i915_private *i915); +void intel_runtime_pm_put(struct drm_i915_private *i915); void intel_display_set_init_power(struct drm_i915_private *dev, bool enable); diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index c0983f0e46ac..425d885f84b6 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -49,6 +49,143 @@ * present for a given platform. */ +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_RPM) + +#include <linux/sort.h> + +#define STACKDEPTH 8 + +static void init_intel_runtime_pm_wakeref(struct drm_i915_private *i915) +{ + spin_lock_init(&i915->runtime_pm.debug_lock); +} + +static noinline void +track_intel_runtime_pm_wakeref(struct drm_i915_private *i915) +{ + struct i915_runtime_pm *rpm = &i915->runtime_pm; + unsigned long entries[STACKDEPTH]; + struct stack_trace trace = { + .entries = entries, + .max_entries = ARRAY_SIZE(entries), + .skip = 1 + }; + unsigned long flags; + depot_stack_handle_t stack, *stacks; + + if (!HAS_RUNTIME_PM(i915)) + return; + + save_stack_trace(&trace); + if (trace.nr_entries && + trace.entries[trace.nr_entries - 1] == ULONG_MAX) + trace.nr_entries--; + + stack = depot_save_stack(&trace, GFP_NOWAIT | __GFP_NOWARN); + if (!stack) + return; + + spin_lock_irqsave(&rpm->debug_lock, flags); + stacks = krealloc(rpm->debug_owners, + (rpm->debug_count + 1) * sizeof(*stacks), + GFP_NOWAIT | __GFP_NOWARN); + if (stacks) { + stacks[rpm->debug_count++] = stack; + rpm->debug_owners = stacks; + } + spin_unlock_irqrestore(&rpm->debug_lock, flags); +} + +static void untrack_intel_runtime_pm_wakeref(struct drm_i915_private *i915) +{ + struct i915_runtime_pm *rpm = &i915->runtime_pm; + depot_stack_handle_t *stacks; + unsigned long flags; + + spin_lock_irqsave(&rpm->debug_lock, flags); + stacks = fetch_and_zero(&rpm->debug_owners); + rpm->debug_count = 0; + spin_unlock_irqrestore(&rpm->debug_lock, flags); + + kfree(stacks); +} + +static int cmphandle(const void *_a, const void *_b) +{ + const depot_stack_handle_t * const a = _a, * const b = _b; + + if (*a < *b) + return -1; + else if (*a > *b) + return 1; + else + return 0; +} + +static void show_intel_runtime_pm_wakeref(struct drm_i915_private *i915) +{ + struct i915_runtime_pm *rpm = &i915->runtime_pm; + unsigned long entries[STACKDEPTH]; + depot_stack_handle_t *stacks; + unsigned long flags, count, i; + char *buf; + + spin_lock_irqsave(&rpm->debug_lock, flags); + stacks = fetch_and_zero(&rpm->debug_owners); + count = fetch_and_zero(&rpm->debug_count); + spin_unlock_irqrestore(&rpm->debug_lock, flags); + if (!count) + return; + + DRM_DEBUG_DRIVER("leaked %lu wakerefs\n", count); + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto out_stacks; + + sort(stacks, count, sizeof(*stacks), cmphandle, NULL); + + for (i = 0; i < count; i++) { + struct stack_trace trace = { + .entries = entries, + .max_entries = ARRAY_SIZE(entries), + }; + depot_stack_handle_t stack = stacks[i]; + unsigned long rep; + + rep = 1; + while (i + 1 < count && stacks[i + 1] == stack) + rep++, i++; + depot_fetch_stack(stack, &trace); + snprint_stack_trace(buf, PAGE_SIZE, &trace, 0); + DRM_DEBUG_DRIVER("wakeref x%lu at\n%s", rep, buf); + } + + kfree(buf); +out_stacks: + kfree(stacks); +} + +#else + +static void init_intel_runtime_pm_wakeref(struct drm_i915_private *i915) +{ +} + +static void track_intel_runtime_pm_wakeref(struct drm_i915_private *i915) +{ +} + +static void untrack_intel_runtime_pm_wakeref(struct drm_i915_private *i915) +{ +} + +static void show_intel_runtime_pm_wakeref(struct drm_i915_private *i915) +{ +} + +#endif + bool intel_display_power_well_is_enabled(struct drm_i915_private *dev_priv, enum i915_power_well_id power_well_id); @@ -3919,7 +4056,7 @@ void intel_power_domains_verify_state(struct drm_i915_private *dev_priv) /** * intel_runtime_pm_get - grab a runtime pm reference - * @dev_priv: i915 device instance + * @i915: i915 device instance * * This function grabs a device-level runtime pm reference (mostly used for GEM * code to ensure the GTT or GT is on) and ensures that it is powered up. @@ -3927,22 +4064,24 @@ void intel_power_domains_verify_state(struct drm_i915_private *dev_priv) * Any runtime pm reference obtained by this function must have a symmetric * call to intel_runtime_pm_put() to release the reference again. */ -void intel_runtime_pm_get(struct drm_i915_private *dev_priv) +void intel_runtime_pm_get(struct drm_i915_private *i915) { - struct pci_dev *pdev = dev_priv->drm.pdev; + struct pci_dev *pdev = i915->drm.pdev; struct device *kdev = &pdev->dev; int ret; ret = pm_runtime_get_sync(kdev); WARN_ONCE(ret < 0, "pm_runtime_get_sync() failed: %d\n", ret); - atomic_inc(&dev_priv->runtime_pm.wakeref_count); - assert_rpm_wakelock_held(dev_priv); + atomic_inc(&i915->runtime_pm.wakeref_count); + assert_rpm_wakelock_held(i915); + + track_intel_runtime_pm_wakeref(i915); } /** * intel_runtime_pm_get_if_in_use - grab a runtime pm reference if device in use - * @dev_priv: i915 device instance + * @i915: i915 device instance * * This function grabs a device-level runtime pm reference if the device is * already in use and ensures that it is powered up. It is illegal to try @@ -3953,10 +4092,10 @@ void intel_runtime_pm_get(struct drm_i915_private *dev_priv) * * Returns: True if the wakeref was acquired, or False otherwise. */ -bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv) +bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *i915) { if (IS_ENABLED(CONFIG_PM)) { - struct pci_dev *pdev = dev_priv->drm.pdev; + struct pci_dev *pdev = i915->drm.pdev; struct device *kdev = &pdev->dev; /* @@ -3969,15 +4108,17 @@ bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv) return false; } - atomic_inc(&dev_priv->runtime_pm.wakeref_count); - assert_rpm_wakelock_held(dev_priv); + atomic_inc(&i915->runtime_pm.wakeref_count); + assert_rpm_wakelock_held(i915); + + track_intel_runtime_pm_wakeref(i915); return true; } /** * intel_runtime_pm_get_noresume - grab a runtime pm reference - * @dev_priv: i915 device instance + * @i915: i915 device instance * * This function grabs a device-level runtime pm reference (mostly used for GEM * code to ensure the GTT or GT is on). @@ -3992,32 +4133,35 @@ bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv) * Any runtime pm reference obtained by this function must have a symmetric * call to intel_runtime_pm_put() to release the reference again. */ -void intel_runtime_pm_get_noresume(struct drm_i915_private *dev_priv) +void intel_runtime_pm_get_noresume(struct drm_i915_private *i915) { - struct pci_dev *pdev = dev_priv->drm.pdev; + struct pci_dev *pdev = i915->drm.pdev; struct device *kdev = &pdev->dev; - assert_rpm_wakelock_held(dev_priv); + assert_rpm_wakelock_held(i915); pm_runtime_get_noresume(kdev); - atomic_inc(&dev_priv->runtime_pm.wakeref_count); + atomic_inc(&i915->runtime_pm.wakeref_count); + + track_intel_runtime_pm_wakeref(i915); } /** * intel_runtime_pm_put - release a runtime pm reference - * @dev_priv: i915 device instance + * @i915: i915 device instance * * This function drops the device-level runtime pm reference obtained by * intel_runtime_pm_get() and might power down the corresponding * hardware block right away if this is the last reference. */ -void intel_runtime_pm_put(struct drm_i915_private *dev_priv) +void intel_runtime_pm_put(struct drm_i915_private *i915) { - struct pci_dev *pdev = dev_priv->drm.pdev; + struct pci_dev *pdev = i915->drm.pdev; struct device *kdev = &pdev->dev; - assert_rpm_wakelock_held(dev_priv); - atomic_dec(&dev_priv->runtime_pm.wakeref_count); + assert_rpm_wakelock_held(i915); + if (atomic_dec_and_test(&i915->runtime_pm.wakeref_count)) + untrack_intel_runtime_pm_wakeref(i915); pm_runtime_mark_last_busy(kdev); pm_runtime_put_autosuspend(kdev); @@ -4025,7 +4169,7 @@ void intel_runtime_pm_put(struct drm_i915_private *dev_priv) /** * intel_runtime_pm_enable - enable runtime pm - * @dev_priv: i915 device instance + * @i915: i915 device instance * * This function enables runtime pm at the end of the driver load sequence. * @@ -4033,9 +4177,9 @@ void intel_runtime_pm_put(struct drm_i915_private *dev_priv) * subordinate display power domains. That is only done on the first modeset * using intel_display_set_init_power(). */ -void intel_runtime_pm_enable(struct drm_i915_private *dev_priv) +void intel_runtime_pm_enable(struct drm_i915_private *i915) { - struct pci_dev *pdev = dev_priv->drm.pdev; + struct pci_dev *pdev = i915->drm.pdev; struct device *kdev = &pdev->dev; /* @@ -4057,7 +4201,7 @@ void intel_runtime_pm_enable(struct drm_i915_private *dev_priv) * so the driver's own RPM reference tracking asserts also work on * platforms without RPM support. */ - if (!HAS_RUNTIME_PM(dev_priv)) { + if (!HAS_RUNTIME_PM(i915)) { int ret; pm_runtime_dont_use_autosuspend(kdev); @@ -4075,17 +4219,36 @@ void intel_runtime_pm_enable(struct drm_i915_private *dev_priv) pm_runtime_put_autosuspend(kdev); } -void intel_runtime_pm_disable(struct drm_i915_private *dev_priv) +void intel_runtime_pm_disable(struct drm_i915_private *i915) { - struct pci_dev *pdev = dev_priv->drm.pdev; + struct pci_dev *pdev = i915->drm.pdev; struct device *kdev = &pdev->dev; /* Transfer rpm ownership back to core */ - WARN(pm_runtime_get_sync(&dev_priv->drm.pdev->dev) < 0, + WARN(pm_runtime_get_sync(kdev) < 0, "Failed to pass rpm ownership back to core\n"); pm_runtime_dont_use_autosuspend(kdev); - if (!HAS_RUNTIME_PM(dev_priv)) + if (!HAS_RUNTIME_PM(i915)) pm_runtime_put(kdev); } + +void intel_runtime_pm_cleanup(struct drm_i915_private *i915) +{ + struct i915_runtime_pm *rpm = &i915->runtime_pm; + + if (WARN(atomic_read(&rpm->wakeref_count), + "i915->runtime_pm.wakeref_count=%d on cleanup\n", + atomic_read(&rpm->wakeref_count))) { + show_intel_runtime_pm_wakeref(i915); + atomic_set(&rpm->wakeref_count, 0); + } + + untrack_intel_runtime_pm_wakeref(i915); +} + +void intel_runtime_pm_init_early(struct drm_i915_private *i915) +{ + init_intel_runtime_pm_wakeref(i915); +} diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c index 43ed8b28aeaa..0eb283e7fc96 100644 --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c @@ -154,15 +154,17 @@ struct drm_i915_private *mock_gem_device(void) pdev->dev.archdata.iommu = (void *)-1; #endif + i915 = (struct drm_i915_private *)(pdev + 1); + pci_set_drvdata(pdev, i915); + + intel_runtime_pm_init_early(i915); + dev_pm_domain_set(&pdev->dev, &pm_domain); pm_runtime_enable(&pdev->dev); pm_runtime_dont_use_autosuspend(&pdev->dev); if (pm_runtime_enabled(&pdev->dev)) WARN_ON(pm_runtime_get_sync(&pdev->dev)); - i915 = (struct drm_i915_private *)(pdev + 1); - pci_set_drvdata(pdev, i915); - err = drm_dev_init(&i915->drm, &mock_driver, &pdev->dev); if (err) { pr_err("Failed to initialise mock GEM device: err=%d\n", err); -- 2.18.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx