On Fri, Sep 22, 2017 at 11:35:33AM +0200, Markus Trippelsdorf wrote:
> > It seems to work. Simply returning "I (idle)" from get_task_state() in
> > fs/proc/array.c when the state is TASK_IDLE does the trick.
> > I've tested top, htop and ps.

I ended up with the below; there was quite a lot of inconsistent state
printing around it seems.

I should probably split this thing into a bunch of patches :/

Alongside an explicit idle state, this also exposes TASK_PARKED,
although arguably we could map that to idle too. Opinions?

---
 fs/proc/array.c                   | 35 ++++++++++++-----------
 include/linux/sched.h             | 58 +++++++++++++++++++++++----------------
 include/trace/events/sched.h      | 24 +++++++++++-----
 kernel/sched/core.c               | 22 ++++++++++++++-
 kernel/sched/debug.c              |  2 --
 kernel/trace/trace_output.c       | 21 ++++----------
 kernel/trace/trace_sched_wakeup.c | 12 ++++----
 7 files changed, 103 insertions(+), 71 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c355574aa0..5a076854857f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -118,28 +118,31 @@ static inline void task_name(struct seq_file *m, struct 
task_struct *p)
  * simple bit tests.
  */
 static const char * const task_state_array[] = {
-       "R (running)",          /*   0 */
-       "S (sleeping)",         /*   1 */
-       "D (disk sleep)",       /*   2 */
-       "T (stopped)",          /*   4 */
-       "t (tracing stop)",     /*   8 */
-       "X (dead)",             /*  16 */
-       "Z (zombie)",           /*  32 */
+       /* states inside TASK_REPORT */
+
+       "R (running)",          /* 0x00 */
+       "S (sleeping)",         /* 0x01 */
+       "D (disk sleep)",       /* 0x02 */
+       "T (stopped)",          /* 0x04 */
+       "t (tracing stop)",     /* 0x08 */
+       "X (dead)",             /* 0x10 */
+       "Z (zombie)",           /* 0x20 */
+       "P (parked)",           /* 0x40 */
+
+       /* extra states, beyond TASK_REPORT */
+
+       "I (idle)",             /* 0x80 */
 };
 
 static inline const char *get_task_state(struct task_struct *tsk)
 {
-       unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
+       unsigned int tsk_state = READ_ONCE(tsk->state);
+       unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
 
-       /*
-        * Parked tasks do not run; they sit in __kthread_parkme().
-        * Without this check, we would report them as running, which is
-        * clearly wrong, so we report them as sleeping instead.
-        */
-       if (tsk->state == TASK_PARKED)
-               state = TASK_INTERRUPTIBLE;
+       if (tsk_state == TASK_IDLE)
+               state = TASK_REPORT_MAX;
 
-       BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
+       BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-2);
 
        return task_state_array[fls(state)];
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68b38335d33c..7ae81efb17bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -65,25 +65,27 @@ struct task_group;
  */
 
 /* Used in tsk->state: */
-#define TASK_RUNNING                   0
-#define TASK_INTERRUPTIBLE             1
-#define TASK_UNINTERRUPTIBLE           2
-#define __TASK_STOPPED                 4
-#define __TASK_TRACED                  8
+#define TASK_RUNNING                   0x0000
+#define TASK_INTERRUPTIBLE             0x0001
+#define TASK_UNINTERRUPTIBLE           0x0002
+#define __TASK_STOPPED                 0x0004
+#define __TASK_TRACED                  0x0008
 /* Used in tsk->exit_state: */
-#define EXIT_DEAD                      16
-#define EXIT_ZOMBIE                    32
+#define EXIT_DEAD                      0x0010
+#define EXIT_ZOMBIE                    0x0020
 #define EXIT_TRACE                     (EXIT_ZOMBIE | EXIT_DEAD)
 /* Used in tsk->state again: */
-#define TASK_DEAD                      64
-#define TASK_WAKEKILL                  128
-#define TASK_WAKING                    256
-#define TASK_PARKED                    512
-#define TASK_NOLOAD                    1024
-#define TASK_NEW                       2048
-#define TASK_STATE_MAX                 4096
+#define TASK_PARKED                    0x0040
+#define TASK_REPORT_MAX                        0x0080
 
-#define TASK_STATE_TO_CHAR_STR         "RSDTtXZxKWPNn"
+/* Not in TASK_REPORT: */
+#define TASK_DEAD                      0x0080
+#define TASK_WAKEKILL                  0x0100
+#define TASK_WAKING                    0x0200
+#define TASK_NOLOAD                    0x0400
+#define TASK_NEW                       0x0800
+
+#define TASK_STATE_MAX                 0x1000
 
 /* Convenience macros for the sake of set_current_state: */
 #define TASK_KILLABLE                  (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -96,10 +98,11 @@ struct task_group;
 #define TASK_NORMAL                    (TASK_INTERRUPTIBLE | 
TASK_UNINTERRUPTIBLE)
 #define TASK_ALL                       (TASK_NORMAL | __TASK_STOPPED | 
__TASK_TRACED)
 
-/* get_task_state(): */
+/* task_state_to_char(), get_task_state(), trace_sched_switch() */
 #define TASK_REPORT                    (TASK_RUNNING | TASK_INTERRUPTIBLE | \
                                         TASK_UNINTERRUPTIBLE | __TASK_STOPPED 
| \
-                                        __TASK_TRACED | EXIT_ZOMBIE | 
EXIT_DEAD)
+                                        __TASK_TRACED | EXIT_DEAD | 
EXIT_ZOMBIE | \
+                                        TASK_PARKED)
 
 #define task_is_traced(task)           ((task->state & __TASK_TRACED) != 0)
 
@@ -1244,17 +1247,24 @@ static inline pid_t task_pgrp_nr(struct task_struct 
*tsk)
        return task_pgrp_nr_ns(tsk, &init_pid_ns);
 }
 
-static inline char task_state_to_char(struct task_struct *task)
+static inline char __task_state_to_char(unsigned int state)
 {
-       const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-       unsigned long state = task->state;
+       static const char state_char[] = "RSDTtXZPI";
 
-       state = state ? __ffs(state) + 1 : 0;
+       BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 3);
+
+       return state_char[fls(state)];
+}
+
+static inline char task_state_to_char(struct task_struct *task)
+{
+       unsigned int tsk_state = READ_ONCE(task->state);
+       unsigned int state = (tsk_state | task->exit_state) & TASK_REPORT;
 
-       /* Make sure the string lines up properly with the number of task 
states: */
-       BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != 
ilog2(TASK_STATE_MAX)+1);
+       if (tsk_state == TASK_IDLE)
+               state = TASK_REPORT_MAX;
 
-       return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?';
+       return __task_state_to_char(state);
 }
 
 /**
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ae1409ffe99a..af1858a01335 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -106,6 +106,8 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
 #ifdef CREATE_TRACE_POINTS
 static inline long __trace_sched_switch_state(bool preempt, struct task_struct 
*p)
 {
+       unsigned int state = READ_ONCE(p->state);
+
 #ifdef CONFIG_SCHED_DEBUG
        BUG_ON(p != current);
 #endif /* CONFIG_SCHED_DEBUG */
@@ -114,10 +116,18 @@ static inline long __trace_sched_switch_state(bool 
preempt, struct task_struct *
         * Preemption ignores task state, therefore preempted tasks are always
         * RUNNING (we will not have dequeued if state != RUNNING).
         */
-       return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
+       if (preempt)
+               return TASK_REPORT_MAX << 1;
+
+       if (state == TASK_IDLE)
+               return TASK_REPORT_MAX;
+
+       return (state | p->exit_state) & TASK_REPORT;
 }
 #endif /* CREATE_TRACE_POINTS */
 
+#define TRACE_REPORT_MASK ((TASK_REPORT_MAX << 1) - 1)
+
 /*
  * Tracepoint for task switches, performed by the scheduler:
  */
@@ -152,13 +162,13 @@ TRACE_EVENT(sched_switch,
 
        TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> 
next_comm=%s next_pid=%d next_prio=%d",
                __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-               __entry->prev_state & (TASK_STATE_MAX-1) ?
-                 __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
+
+               __entry->prev_state & TRACE_REPORT_MASK ?
+                 __print_flags(__entry->prev_state & TRACE_REPORT_MASK, "|",
                                { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
-                               { 16, "Z" }, { 32, "X" }, { 64, "x" },
-                               { 128, "K" }, { 256, "W" }, { 512, "P" },
-                               { 1024, "N" }) : "R",
-               __entry->prev_state & TASK_STATE_MAX ? "+" : "",
+                               { 16, "X" }, { 32, "Z" }, { 64, "P" },
+                               { 128, "I" }) : "R",
+               __entry->prev_state & (TASK_REPORT_MAX << 1) ? "+" : "",
                __entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 703f5831738e..431e2d6c709e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5164,6 +5164,26 @@ void sched_show_task(struct task_struct *p)
        put_task_stack(p);
 }
 
+static inline bool state_filter_match(unsigned long state_filter, struct 
task_struct *p)
+{
+       /* no filter, everything matches */
+       if (!state_filter)
+               return true;
+
+       /* filter, but doesn't match */
+       if (!(p->state & state_filter))
+               return false;
+
+       /*
+        * When looking for TASK_UNINTERRUPTIBLE, skip TASK_IDLE, but allow
+        * TASK_KILLABLE.
+        */
+       if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+               return false;
+
+       return true;
+}
+
 void show_state_filter(unsigned long state_filter)
 {
        struct task_struct *g, *p;
@@ -5186,7 +5206,7 @@ void show_state_filter(unsigned long state_filter)
                 */
                touch_nmi_watchdog();
                touch_all_softlockup_watchdogs();
-               if (!state_filter || (p->state & state_filter))
+               if (state_filter_match(state_filter, p))
                        sched_show_task(p);
        }
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4a23bbc3111b..244619e402cc 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -461,8 +461,6 @@ static char *task_group_path(struct task_group *tg)
 }
 #endif
 
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bac629af2285..c738e764e2a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)
        return !trace_seq_has_overflowed(s);
 }
 
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
-       int bit = state ? __ffs(state) + 1 : 0;
-
-       return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
 /**
  * ftrace_find_event - find a registered event
  * @type: the type of event to look for
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct 
trace_iterator *iter,
 
        trace_assign_type(field, iter->ent);
 
-       T = task_state_char(field->next_state);
-       S = task_state_char(field->prev_state);
+       T = __task_state_to_char(field->next_state);
+       S = __task_state_to_char(field->prev_state);
        trace_find_cmdline(field->next_pid, comm);
        trace_seq_printf(&iter->seq,
                         " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, 
char S)
        trace_assign_type(field, iter->ent);
 
        if (!S)
-               S = task_state_char(field->prev_state);
-       T = task_state_char(field->next_state);
+               S = __task_state_to_char(field->prev_state);
+       T = __task_state_to_char(field->next_state);
        trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
                         field->prev_pid,
                         field->prev_prio,
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, 
char S)
        trace_assign_type(field, iter->ent);
 
        if (!S)
-               S = task_state_char(field->prev_state);
-       T = task_state_char(field->next_state);
+               S = __task_state_to_char(field->prev_state);
+       T = __task_state_to_char(field->next_state);
 
        SEQ_PUT_HEX_FIELD(s, field->prev_pid);
        SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c 
b/kernel/trace/trace_sched_wakeup.c
index ddec53b67646..b14caa0afd35 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -380,7 +380,7 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct 
*task, int cpu)
 }
 
 static void
-tracing_sched_switch_trace(struct trace_array *tr,
+tracing_sched_switch_trace(bool preempt, struct trace_array *tr,
                           struct task_struct *prev,
                           struct task_struct *next,
                           unsigned long flags, int pc)
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
        entry   = ring_buffer_event_data(event);
        entry->prev_pid                 = prev->pid;
        entry->prev_prio                = prev->prio;
-       entry->prev_state               = prev->state;
+       entry->prev_state               = __trace_sched_switch_state(preempt, 
prev);
        entry->next_pid                 = next->pid;
        entry->next_prio                = next->prio;
-       entry->next_state               = next->state;
+       entry->next_state               = __trace_sched_switch_state(false, 
next);
        entry->next_cpu = task_cpu(next);
 
        if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
        entry   = ring_buffer_event_data(event);
        entry->prev_pid                 = curr->pid;
        entry->prev_prio                = curr->prio;
-       entry->prev_state               = curr->state;
+       entry->prev_state               = __trace_sched_switch_state(false, 
curr);
        entry->next_pid                 = wakee->pid;
        entry->next_prio                = wakee->prio;
-       entry->next_state               = wakee->state;
+       entry->next_state               = __trace_sched_switch_state(false, 
wakee);
        entry->next_cpu                 = task_cpu(wakee);
 
        if (!call_filter_check_discard(call, entry, buffer, event))
@@ -482,7 +482,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
        data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
 
        __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
-       tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
+       tracing_sched_switch_trace(preempt, wakeup_trace, prev, next, flags, 
pc);
 
        T0 = data->preempt_timestamp;
        T1 = ftrace_now(cpu);

Reply via email to