I think this should cover most cases.

I'll have to break this out into separate patches, and maybe clean things
up a bit (there are certainly comments missing, and some repetition).

But this boots on my test box and builds a kernel without generating a
single WARN -- a big improvement over not booting at all (partly due to
excessive WARN output) with just the sched/core.c bit applied.

---
 drivers/tty/n_tty.c              | 71 +++++++++++++++++++++++++++++++++-------
 fs/notify/inotify/inotify_user.c | 34 +++++++++++++++++--
 fs/notify/notification.c         |  2 +-
 include/linux/kernel.h           |  4 +--
 include/linux/sched.h            | 46 ++++++++++++++++++++++++--
 include/linux/wait.h             |  2 ++
 kernel/exit.c                    |  6 ++++
 kernel/sched/core.c              | 14 ++++++++
 kernel/smpboot.c                 | 15 +++++----
 9 files changed, 168 insertions(+), 26 deletions(-)

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index f44f1ba762c3..5e4830979937 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -2098,6 +2098,36 @@ static int job_control(struct tty_struct *tty, struct file *file)
        return 0;
 }
 
+/*
+ * n_tty_{read,write} use blocking primitives (mutex_lock, down_read, etc.)
+ * inside an open-coded wait loop.
+ *
+ * The wait loop relies on current->state to record wakeups; these will change
+ * it back to TASK_RUNNING. However blocking primitives themselves also change
+ * current->state. Therefore we must implement another means of recording
+ * wakeups.
+ *
+ * We do this by setting an alternative waitqueue wake function which changes
+ * an additional variable.
+ */
+
+struct n_tty_wakeup_state {
+       struct task_struct *task;
+       bool woken;
+};
+
+static int n_tty_wake(wait_queue_t *wait, unsigned mode,
+                           int wake_flags, void *key)
+{
+       struct n_tty_wakeup_state *s = wait->private;
+       DECLARE_WAITQUEUE(dummy_wait, s->task);
+
+       smp_wmb();
+       s->woken = true;
+
+       return default_wake_function(&dummy_wait, mode, wake_flags, key);
+}
+
 
 /**
  *     n_tty_read              -       read function for tty
@@ -2123,7 +2153,11 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 {
        struct n_tty_data *ldata = tty->disc_data;
        unsigned char __user *b = buf;
-       DECLARE_WAITQUEUE(wait, current);
+       struct n_tty_wakeup_state s = {
+               .task = current,
+               .woken = false,
+       };
+       wait_queue_t wait;
        int c;
        int minimum, time;
        ssize_t retval = 0;
@@ -2167,6 +2201,9 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 
        packet = tty->packet;
 
+       init_waitqueue_func_entry(&wait, n_tty_wake);
+       wait.private = &s;
+
        add_wait_queue(&tty->read_wait, &wait);
        while (nr) {
                /* First test for status change. */
@@ -2186,10 +2223,11 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
                        nr--;
                        break;
                }
-               /* This statement must be first before checking for input
-                  so that any interrupt will set the state back to
-                  TASK_RUNNING. */
-               set_current_state(TASK_INTERRUPTIBLE);
+               /*
+                * This statement must be first before checking for input so
+                * that any interrupt will set it to true.
+                */
+               s.woken = false;
 
                if (((minimum - (b - buf)) < ldata->minimum_to_wake) &&
                    ((minimum - (b - buf)) >= 1))
@@ -2220,13 +2258,15 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
                                n_tty_set_room(tty);
                                up_read(&tty->termios_rwsem);
 
-                               timeout = schedule_timeout(timeout);
+                               set_current_state(TASK_INTERRUPTIBLE);
+                               if (!s.woken)
+                                       timeout = schedule_timeout(timeout);
+                               __set_current_state(TASK_RUNNING);
 
                                down_read(&tty->termios_rwsem);
                                continue;
                        }
                }
-               __set_current_state(TASK_RUNNING);
 
                /* Deal with packet mode. */
                if (packet && b == buf) {
@@ -2273,7 +2313,6 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 
        mutex_unlock(&ldata->atomic_read_lock);
 
-       __set_current_state(TASK_RUNNING);
        if (b - buf)
                retval = b - buf;
 
@@ -2306,7 +2345,11 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
                           const unsigned char *buf, size_t nr)
 {
        const unsigned char *b = buf;
-       DECLARE_WAITQUEUE(wait, current);
+       struct n_tty_wakeup_state s = {
+               .task = current,
+               .woken = false,
+       };
+       wait_queue_t wait;
        int c;
        ssize_t retval = 0;
 
@@ -2322,9 +2365,12 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
        /* Write out any echoed characters that are still pending */
        process_echoes(tty);
 
+       init_waitqueue_func_entry(&wait, n_tty_wake);
+       wait.private = &s;
+
        add_wait_queue(&tty->write_wait, &wait);
        while (1) {
-               set_current_state(TASK_INTERRUPTIBLE);
+               s.woken = false;
                if (signal_pending(current)) {
                        retval = -ERESTARTSYS;
                        break;
@@ -2378,7 +2424,10 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
                }
                up_read(&tty->termios_rwsem);
 
-               schedule();
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (!s.woken)
+                       schedule();
+               __set_current_state(TASK_RUNNING);
 
                down_read(&tty->termios_rwsem);
        }
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index cc423a30a0c8..e3b65ce6b312 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -220,6 +220,23 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
        return event_size;
 }
 
+struct inotify_wakeup_state {
+       struct task_struct *task;
+       bool woken;
+};
+
+static int inotify_wake(wait_queue_t *wait, unsigned mode,
+                       int wake_flags, void *key)
+{
+       struct inotify_wakeup_state *s = wait->private;
+       DECLARE_WAITQUEUE(dummy_wait, s->task);
+
+       smp_wmb();
+       s->woken = true;
+
+       return autoremove_wake_function(&dummy_wait, mode, wake_flags, key);
+}
+
 static ssize_t inotify_read(struct file *file, char __user *buf,
                            size_t count, loff_t *pos)
 {
@@ -227,13 +244,21 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
        struct fsnotify_event *kevent;
        char __user *start;
        int ret;
-       DEFINE_WAIT(wait);
+       struct inotify_wakeup_state s = {
+               .task = current,
+               .woken = false,
+       };
+       wait_queue_t wait;
+
+       init_waitqueue_func_entry(&wait, inotify_wake);
+       wait.private = &s;
 
        start = buf;
        group = file->private_data;
 
+       add_wait_queue(&group->notification_waitq, &wait);
        while (1) {
-               prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
+               s.woken = false;
 
                mutex_lock(&group->notification_mutex);
                kevent = get_one_event(group, count);
@@ -264,7 +289,10 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
                if (start != buf)
                        break;
 
-               schedule();
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (!s.woken)
+                       schedule();
+               __set_current_state(TASK_RUNNING);
        }
 
        finish_wait(&group->notification_waitq, &wait);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 1e58402171a5..dcfcdd69d1de 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
 /* return true if the notify queue is empty, false otherwise */
 bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
 {
-       BUG_ON(!mutex_is_locked(&group->notification_mutex));
+       lockdep_assert_held(&group->notification_mutex);
        return list_empty(&group->notification_list) ? true : false;
 }
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4c52907a6d8b..aac1dc9da2d0 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -162,7 +162,7 @@ extern int _cond_resched(void);
 #endif
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-  void __might_sleep(const char *file, int line, int preempt_offset);
+extern  void __might_sleep(const char *file, int line, int preempt_offset);
 /**
  * might_sleep - annotation for functions that can sleep
  *
@@ -174,7 +174,7 @@ extern int _cond_resched(void);
  * supposed to.
  */
 # define might_sleep() \
-       do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
+         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
 #else
   static inline void __might_sleep(const char *file, int line,
                                   int preempt_offset) { }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 66124d63371a..62dab5738e66 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -241,6 +241,43 @@ extern char ___assert_task_state[1 - 2*!!(
                                ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
                                 (task->flags & PF_FROZEN) == 0)
 
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+
+#define __set_task_state(tsk, state_value)                     \
+       do {                                                    \
+               (tsk)->task_state_change = _THIS_IP_;           \
+               (tsk)->state = (state_value);                   \
+       } while (0)
+#define set_task_state(tsk, state_value)                       \
+       do {                                                    \
+               (tsk)->task_state_change = _THIS_IP_;           \
+               set_mb((tsk)->state, (state_value));            \
+       } while (0)
+
+/*
+ * set_current_state() includes a barrier so that the write of current->state
+ * is correctly serialised wrt the caller's subsequent test of whether to
+ * actually sleep:
+ *
+ *     set_current_state(TASK_UNINTERRUPTIBLE);
+ *     if (do_i_need_to_sleep())
+ *             schedule();
+ *
+ * If the caller does not need such serialisation then use __set_current_state()
+ */
+#define __set_current_state(state_value)                       \
+       do {                                                    \
+               current->task_state_change = _THIS_IP_;         \
+               current->state = (state_value);                 \
+       } while (0)
+#define set_current_state(state_value)                         \
+       do {                                                    \
+               current->task_state_change = _THIS_IP_;         \
+               set_mb(current->state, (state_value));          \
+       } while (0)
+
+#else
+
 #define __set_task_state(tsk, state_value)             \
        do { (tsk)->state = (state_value); } while (0)
 #define set_task_state(tsk, state_value)               \
@@ -257,11 +294,13 @@ extern char ___assert_task_state[1 - 2*!!(
  *
  * If the caller does not need such serialisation then use __set_current_state()
  */
-#define __set_current_state(state_value)                       \
+#define __set_current_state(state_value)               \
        do { current->state = (state_value); } while (0)
-#define set_current_state(state_value)         \
+#define set_current_state(state_value)                 \
        set_mb(current->state, (state_value))
 
+#endif
+
 /* Task command name length */
 #define TASK_COMM_LEN 16
 
@@ -1650,6 +1689,9 @@ struct task_struct {
        unsigned int    sequential_io;
        unsigned int    sequential_io_avg;
 #endif
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+       unsigned long   task_state_change;
+#endif
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 6fb1ba5f9b2f..041b744e99b4 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -211,6 +211,8 @@ wait_queue_head_t *bit_waitqueue(void *, int);
        wait_queue_t __wait;                                            \
        long __ret = ret;       /* explicit shadow */                   \
                                                                        \
+       might_sleep();                                                  \
+                                                                       \
        INIT_LIST_HEAD(&__wait.task_list);                              \
        if (exclusive)                                                  \
                __wait.flags = WQ_FLAG_EXCLUSIVE;                       \
diff --git a/kernel/exit.c b/kernel/exit.c
index e5c4668f1799..44cbc8791fca 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1067,6 +1067,12 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
        }
 
        /*
+        * Since we're going to terminate the wait loop from do_wait(),
+        * reset task state.
+        */
+       __set_current_state(TASK_RUNNING);
+
+       /*
         * Now we are sure this task is interesting, and no other
         * thread can reap it because we its state == DEAD/TRACE.
         */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2676866b4394..0577c12c9cf8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7077,6 +7077,19 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 {
        static unsigned long prev_jiffy;        /* ratelimiting */
 
+       /*
+        * Blocking primitives will set (and therefore destroy) current->state,
+        * since we will exit with TASK_RUNNING make sure we enter with it,
+        * otherwise we will destroy state.
+        */
+       if (WARN(current->state != TASK_RUNNING,
+                       "do not call blocking ops when !TASK_RUNNING; "
+                       "state=%lx set at [<%p>] %pS\n",
+                       current->state,
+                       current->task_state_change,
+                       current->task_state_change))
+               __set_current_state(TASK_RUNNING);
+
        rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
        if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
             !is_idle_task(current)) ||
@@ -7107,6 +7120,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
        dump_stack();
 }
 EXPORT_SYMBOL(__might_sleep);
+
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..f032fb5284e3 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data)
                set_current_state(TASK_INTERRUPTIBLE);
                preempt_disable();
                if (kthread_should_stop()) {
-                       set_current_state(TASK_RUNNING);
+                       __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        if (ht->cleanup)
                                ht->cleanup(td->cpu, cpu_online(td->cpu));
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data)
                /* Check for state change setup */
                switch (td->status) {
                case HP_THREAD_NONE:
+                       __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        if (ht->setup)
                                ht->setup(td->cpu);
                        td->status = HP_THREAD_ACTIVE;
-                       preempt_disable();
-                       break;
+                       continue;
+
                case HP_THREAD_PARKED:
+                       __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        if (ht->unpark)
                                ht->unpark(td->cpu);
                        td->status = HP_THREAD_ACTIVE;
-                       preempt_disable();
-                       break;
+                       continue;
                }
 
                if (!ht->thread_should_run(td->cpu)) {
-                       preempt_enable();
+                       preempt_enable_no_resched();
                        schedule();
                } else {
-                       set_current_state(TASK_RUNNING);
+                       __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        ht->thread_fn(td->cpu);
                }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to