From: Benoit Goby <[email protected]>

Below is a patch from android kernel that detects a driver suspend/resume
lockup and captures dump in the kernel log. Please review and provide
comments.

Rather than hard-lock the kernel, dump the suspend/resume thread stack and
BUG() when a driver takes too long to suspend/resume.  The timeout is set to
12 seconds to be longer than the usbhid 10 second timeout.

Exclude from the watchdog the time spent waiting for children that
are resumed asynchronously and time every device, whether or not they
resumed synchronously.

This patch is targeted for mobile devices where a suspend/resume lockup
could cause a system reboot and catch user's attention. Information
about failing device can later be retrieved from captured log in
subsequent boot session.

The hardware watchdog timer is likely suspended during this time and
couldn't be relied upon. The soft-lockup detector would eventually tell
that tasks are not scheduled, but would provide little context as to why.
The patch hence uses system timer and assumes it is still active while the
devices are suspended/resumed.

Cc: Android Kernel Team <[email protected]>
Cc: Colin Cross <[email protected]>
Cc: Todd Poynor <[email protected]>
Cc: San Mehat <[email protected]>
Cc: Benoit Goby <[email protected]>
Cc: John Stultz <[email protected]>
Cc: Pavel Machek <[email protected]>
Cc: Rafael J. Wysocki <[email protected]>
Cc: Len Brown <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Original-author: San Mehat <[email protected]>
Signed-off-by: Benoit Goby <[email protected]>
[[email protected]: Changed printk(KERN_EMERG,...) to pr_emerg(...),
tweaked commit message. Minor changes to merge code into kernel tip.]
Signed-off-by: Zoran Markovic <[email protected]>
---
 drivers/base/power/main.c |   66 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 5a9b656..a6a02c0 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -29,6 +29,8 @@
 #include <linux/async.h>
 #include <linux/suspend.h>
 #include <linux/cpuidle.h>
+#include <linux/timer.h>
+
 #include "../base.h"
 #include "power.h"
 
@@ -54,6 +56,12 @@ struct suspend_stats suspend_stats;
 static DEFINE_MUTEX(dpm_list_mtx);
 static pm_message_t pm_transition;
 
+struct dpm_watchdog {
+       struct device           *dev;
+       struct task_struct      *tsk;
+       struct timer_list       timer;
+};
+
 static int async_error;
 
 /**
@@ -384,6 +392,56 @@ static int dpm_run_callback(pm_callback_t cb, struct 
device *dev,
        return error;
 }
 
+/**
+ * dpm_wd_handler - Driver suspend / resume watchdog handler.
+ *
+ * Called when a driver has timed out suspending or resuming.
+ * There's not much we can do here to recover so BUG() out for
+ * a crash-dump
+ */
+static void dpm_wd_handler(unsigned long data)
+{
+       struct dpm_watchdog *wd = (void *)data;
+       struct device *dev      = wd->dev;
+       struct task_struct *tsk = wd->tsk;
+
+       dev_emerg(dev, "**** DPM device timeout ****\n");
+       show_stack(tsk, NULL);
+
+       BUG();
+}
+
+/**
+ * dpm_wd_set - Enable pm watchdog for given device.
+ * @wd: Watchdog. Must be allocated on the stack.
+ * @dev: Device to handle.
+ */
+static void dpm_wd_set(struct dpm_watchdog *wd, struct device *dev)
+{
+       struct timer_list *timer = &wd->timer;
+
+       wd->dev = dev;
+       wd->tsk = get_current();
+
+       init_timer_on_stack(timer);
+       timer->expires = jiffies + HZ * 12;
+       timer->function = dpm_wd_handler;
+       timer->data = (unsigned long)wd;
+       add_timer(timer);
+}
+
+/**
+ * dpm_wd_clear - Disable pm watchdog.
+ * @wd: Watchdog to disable.
+ */
+static void dpm_wd_clear(struct dpm_watchdog *wd)
+{
+       struct timer_list *timer = &wd->timer;
+
+       del_timer_sync(timer);
+       destroy_timer_on_stack(timer);
+}
+
 /*------------------------- Resume routines -------------------------*/
 
 /**
@@ -570,6 +628,7 @@ static int device_resume(struct device *dev, pm_message_t 
state, bool async)
        pm_callback_t callback = NULL;
        char *info = NULL;
        int error = 0;
+       struct dpm_watchdog wd;
 
        TRACE_DEVICE(dev);
        TRACE_RESUME(0);
@@ -585,6 +644,7 @@ static int device_resume(struct device *dev, pm_message_t 
state, bool async)
         * a resumed device, even if the device hasn't been completed yet.
         */
        dev->power.is_prepared = false;
+       dpm_wd_set(&wd, dev);
 
        if (!dev->power.is_suspended)
                goto Unlock;
@@ -636,6 +696,7 @@ static int device_resume(struct device *dev, pm_message_t 
state, bool async)
 
  Unlock:
        device_unlock(dev);
+       dpm_wd_clear(&wd);
 
  Complete:
        complete_all(&dev->power.completion);
@@ -1053,6 +1114,7 @@ static int __device_suspend(struct device *dev, 
pm_message_t state, bool async)
        pm_callback_t callback = NULL;
        char *info = NULL;
        int error = 0;
+       struct dpm_watchdog wd;
 
        dpm_wait_for_children(dev, async);
 
@@ -1076,6 +1138,8 @@ static int __device_suspend(struct device *dev, 
pm_message_t state, bool async)
        if (dev->power.syscore)
                goto Complete;
 
+       dpm_wd_set(&wd, dev);
+
        device_lock(dev);
 
        if (dev->pm_domain) {
@@ -1131,6 +1195,8 @@ static int __device_suspend(struct device *dev, 
pm_message_t state, bool async)
 
        device_unlock(dev);
 
+       dpm_wd_clear(&wd);
+
  Complete:
        complete_all(&dev->power.completion);
        if (error)
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to