[PATCH] delayacct: clear right task's flag after blkio completes
When I was implementing a latency analysis tool by using task->delays and other things, I found there's an issue in delayacct. The issue is it should clear the target's flag instead of current's in delayacct_blkio_end(). When I git blame delayacct, I found there're some similar issues we have fixed in delayacct_blkio_end(). 'Commit c96f5471ce7d ("delayacct: Account blkio completion on the correct task")' fixed the issue that it should account blkio completion on the target task instead of current. 'Commit b512719f771a ("delayacct: fix crash in delayacct_blkio_end() after delayacct init failure")' fixed the issue that it should check target task's delays instead of current task's. It seems that delayacct_blkio_{begin, end} are error prone. So I introduce a new parameter - the target task 'p' into these helpers, after that change, the callsite will specifically set the right task, which should make it less error prone. Signed-off-by: Yafang Shao Cc: Tejun Heo Cc: Josh Snyder --- include/linux/delayacct.h | 20 ++-- mm/memory.c | 8 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 2d3bdcccf5eb..21651f946751 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -82,16 +82,16 @@ static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) return 0; } -static inline void delayacct_set_flag(int flag) +static inline void delayacct_set_flag(struct task_struct *p, int flag) { - if (current->delays) - current->delays->flags |= flag; + if (p->delays) + p->delays->flags |= flag; } -static inline void delayacct_clear_flag(int flag) +static inline void delayacct_clear_flag(struct task_struct *p, int flag) { - if (current->delays) - current->delays->flags &= ~flag; + if (p->delays) + p->delays->flags &= ~flag; } static inline void delayacct_tsk_init(struct task_struct *tsk) @@ -114,7 +114,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) static inline void 
delayacct_blkio_start(void) { - delayacct_set_flag(DELAYACCT_PF_BLKIO); + delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } @@ -123,7 +123,7 @@ static inline void delayacct_blkio_end(struct task_struct *p) { if (p->delays) __delayacct_blkio_end(p); - delayacct_clear_flag(DELAYACCT_PF_BLKIO); + delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } static inline int delayacct_add_tsk(struct taskstats *d, @@ -166,9 +166,9 @@ static inline void delayacct_thrashing_end(void) } #else -static inline void delayacct_set_flag(int flag) +static inline void delayacct_set_flag(struct task_struct *p, int flag) {} -static inline void delayacct_clear_flag(int flag) +static inline void delayacct_clear_flag(struct task_struct *p, int flag) {} static inline void delayacct_init(void) {} diff --git a/mm/memory.c b/mm/memory.c index 550405fc3b5e..a013d32a6611 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3296,7 +3296,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } - delayacct_set_flag(DELAYACCT_PF_SWAPIN); + delayacct_set_flag(current, DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; @@ -3347,7 +3347,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->address, >ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto unlock; } @@ -3361,13 +3361,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; -- 2.18.2
Re: [PATCH v2 0/6] sched: support schedstats for RT sched class
On Sat, Mar 27, 2021 at 6:13 PM Yafang Shao wrote: > > We want to measure the latency of RT tasks in our production > environment with schedstats facility, but currently schedstats is only > supported for fair sched class. In order to support if for other sched > classes, we should make it independent of fair sched class. The struct > sched_statistics is the schedular statistics of a task_struct or a > task_group, both of which are independent of sched class. So we can move > struct sched_statistics into struct task_struct and struct task_group to > achieve the goal. > > After the patchset, schestats are orgnized as follows, > struct task_struct { > ... > struct sched_statistics statistics; > ... > struct sched_entity *se; > struct sched_rt_entity *rt; > ... > }; > > struct task_group {|---> stats[0] : of CPU0 > ...| > struct sched_statistics **stats; --|---> stats[1] : of CPU1 > ...| >|---> stats[n] : of CPUn > #ifdef CONFIG_FAIR_GROUP_SCHED > struct sched_entity **se; > #endif > #ifdef CONFIG_RT_GROUP_SCHED > struct sched_rt_entity **rt_se; > #endif > ... > }; > > The sched_statistics members may be frequently modified when schedstats is > enabled, in order to avoid impacting on random data which may in the same > cacheline with them, the struct sched_statistics is defined as cacheline > aligned. > > Then we can use schedstats to trace RT tasks as well, for example, > Interface File > task schedstats : /proc/[pid]/sched > group schedstats: /proc/sched_debug > tracepoints : sched:sched_stat_{runtime, wait, sleep, iowait, blocked} > > As PATCH #2 and #3 changes the core struct in the scheduler, so I did > 'perf bench sched pipe' to measure the sched performance before and after > the change, suggested by Mel. Below is the data, which are all in > usecs/op. > Before After > kernel.sched_schedstats=0 6.0~6.16.0~6.1 > kernel.sched_schedstats=1 6.2~6.46.2~6.4 > No obvious difference after the change. 
> > Changes since v1: > - Fix the build failure reported by kernel test robot. > - Add the performance data with 'perf bench sched pipe', suggested by > Mel. > - Make the struct sched_statistics cacheline aligned. > - Introduce task block time in schedstats > > Changes since RFC: > - improvement of schedstats helpers, per Mel. > - make struct schedstats independent of fair sched class > > Yafang Shao (6): > sched, fair: use __schedstat_set() in set_next_entity() > sched: make struct sched_statistics independent of fair sched class > sched: make schedstats helpers independent of fair sched class > sched: introduce task block time in schedstats > sched, rt: support sched_stat_runtime tracepoint for RT sched class > sched, rt: support schedstats for RT sched class > > include/linux/sched.h| 7 +- > kernel/sched/core.c | 24 +++-- > kernel/sched/deadline.c | 4 +- > kernel/sched/debug.c | 90 + > kernel/sched/fair.c | 210 --- > kernel/sched/rt.c| 143 +- > kernel/sched/sched.h | 3 + > kernel/sched/stats.c | 104 +++ > kernel/sched/stats.h | 89 + > kernel/sched/stop_task.c | 4 +- > 10 files changed, 489 insertions(+), 189 deletions(-) > > -- > 2.18.2 > Peter, Ingo, Mel, Any comments on this version ? -- Thanks Yafang
[PATCH v2 6/6] sched, rt: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. This patch enable it for RT sched class as well. After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for RT sched class. The schedstat usage in RT sched class is similar with fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end The user can get the schedstats information in the same way in fair sched class. For example, Interface File task schedstats : /proc/[pid]/sched group schedstats: /proc/sched_debug The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can be used to trace RT tasks as well. The output of these tracepoints for a RT tasks as follows, - blocked & iowait kworker/48:1-442 [048] d... 539.830872: sched_stat_iowait: comm=stress pid=10461 delay=158242 [ns] kworker/48:1-442 [048] d... 539.830872: sched_stat_blocked: comm=stress pid=10461 delay=158242 [ns] - wait stress-10460 [001] dN.. 813.965304: sched_stat_wait: comm=stress pid=10462 delay=7536 [ns] stress-10462 [001] d.h. 813.966300: sched_stat_runtime: comm=stress pid=10462 runtime=993812 [ns] vruntime=0 [ns] [...] stress-10462 [001] d.h. 814.065300: sched_stat_runtime: comm=stress pid=10462 runtime=994484 [ns] vruntime=0 [ns] [ totally 100 times of sched_stat_runtime for pid 10462] [ The delay of pid 10462 is the sum of above runtime ] stress-10462 [001] dN.. 814.065307: sched_stat_wait: comm=stress pid=10460 delay=11089 [ns] stress-10460 [001] d.h. 814.066299: sched_stat_runtime: comm=stress pid=10460 runtime=991934 [ns] vruntime=0 [ns] - sleep sleep-15582 [041] dN.. 
1732.814348: sched_stat_sleep: comm=sleep.sh pid=15474 delay=1001223130 [ns] sleep-15584 [041] dN.. 1733.815908: sched_stat_sleep: comm=sleep.sh pid=15474 delay=1001238954 [ns] [ In sleep.sh, it sleeps 1 sec each time. ] [l...@intel.com: reported build failure in prev version] Signed-off-by: Yafang Shao Cc: kernel test robot --- kernel/sched/rt.c | 137 ++ 1 file changed, 137 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ae5282484710..e5f706ffcdbc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1273,6 +1273,125 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr rt_se->on_list = 0; } +#ifdef CONFIG_RT_GROUP_SCHED +static inline void +__schedstats_from_sched_rt_entity(struct sched_rt_entity *rt_se, + struct sched_statistics **stats) +{ + struct task_struct *p; + struct task_group *tg; + struct rt_rq *rt_rq; + int cpu; + + if (rt_entity_is_task(rt_se)) { + p = rt_task_of(rt_se); + *stats = >stats; + } else { + rt_rq = group_rt_rq(rt_se); + tg = rt_rq->tg; + cpu = cpu_of(rq_of_rt_rq(rt_rq)); + *stats = tg->stats[cpu]; + } +} + +#else + +static inline void +__schedstats_from_sched_rt_entity(struct sched_rt_entity *rt_se, + struct sched_statistics **stats) +{ + struct task_struct *p; + + p = rt_task_of(rt_se); + *stats = >stats; +} +#endif + +static inline void +update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats = NULL; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + __schedstats_from_sched_rt_entity(rt_se, ); + + __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats = NULL; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + 
__schedstats_from_sched_rt_entity(rt_se, ); + + __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (fla
[PATCH v2 5/6] sched, rt: support sched_stat_runtime tracepoint for RT sched class
The runtime of a RT task has already been there, so we only need to add a tracepoint. One difference between fair task and RT task is that there is no vruntime in RT task. To reuse the sched_stat_runtime tracepoint, '0' is passed as vruntime for RT task. The output of this tracepoint for RT task as follows, stress-9748[039] d.h. 113.519352: sched_stat_runtime: comm=stress pid=9748 runtime=997573 [ns] vruntime=0 [ns] stress-9748[039] d.h. 113.520352: sched_stat_runtime: comm=stress pid=9748 runtime=997627 [ns] vruntime=0 [ns] stress-9748[039] d.h. 113.521352: sched_stat_runtime: comm=stress pid=9748 runtime=998203 [ns] vruntime=0 [ns] Signed-off-by: Yafang Shao --- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 34ad07fb924e..ae5282484710 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1012,6 +1012,8 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->stats.exec_max, max(curr->stats.exec_max, delta_exec)); + trace_sched_stat_runtime(curr, delta_exec, 0); + curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- 2.18.2
[PATCH v2 4/6] sched: introduce task block time in schedstats
Currently in schedstats we have sum_sleep_runtime and iowait_sum, but there's no metric to show how long the task is in D state. Once a task in D state, it means the task is blocked in the kernel, for example the task may be waiting for a mutex. The D state is more frequent than iowait, and it is more critital than S state. So it is worth to add a metric to measure it. Signed-off-by: Yafang Shao --- include/linux/sched.h | 2 ++ kernel/sched/debug.c | 6 -- kernel/sched/stats.c | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b687bb38897b..2b885481b8bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -428,6 +428,8 @@ struct sched_statistics { u64 block_start; u64 block_max; + s64 sum_block_runtime; + u64 exec_max; u64 slice_max; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d1bc616936d9..0995412dd3c0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -499,10 +499,11 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); @@ -941,6 +942,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, u64 avg_atom, avg_per_cpu; PN_SCHEDSTAT(stats.sum_sleep_runtime); + PN_SCHEDSTAT(stats.sum_block_runtime); PN_SCHEDSTAT(stats.wait_start); PN_SCHEDSTAT(stats.sleep_start); PN_SCHEDSTAT(stats.block_start); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index b2542f4d3192..21fae41c06f5 100644 --- 
a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -82,6 +82,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, __schedstat_set(stats->block_start, 0); __schedstat_add(stats->sum_sleep_runtime, delta); + __schedstat_add(stats->sum_block_runtime, delta); if (p) { if (p->in_iowait) { -- 2.18.2
[PATCH v2 3/6] sched: make schedstats helpers independent of fair sched class
The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing ovehead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) The size of vmlinux as follows, Before After Size of vmlinux 826308552 826304640 The size is a litte smaller as some functions are not inlined again after the change. I also compared the sched performance with 'perf bench sched pipe', suggested by Mel. The result as follows, Before After kernel.sched_schedstats=0 6.0~6.1 6.0~6.1 kernel.sched_schedstats=1 6.2~6.4 6.2~6.4 No obvious difference after the change. No functional change. 
[l...@intel.com: reported build failure in prev version] Signed-off-by: Yafang Shao Acked-by: Mel Gorman Cc: kernel test robot --- kernel/sched/fair.c | 133 +++ kernel/sched/stats.c | 103 + kernel/sched/stats.h | 34 +++ 3 files changed, 156 insertions(+), 114 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5f72fef1cc0a..77a615b6310d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -904,32 +904,28 @@ static void update_curr_fair(struct rq *rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; - u64 wait_start, prev_wait_start; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; __schedstats_from_sched_entity(se, ); - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(stats->wait_start); + if (entity_is_task(se)) + p = task_of(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + __update_stats_wait_start(rq_of(cfs_rq), p, stats); - __schedstat_set(stats->wait_start, wait_start); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 delta; if (!schedstat_enabled()) return; @@ -945,105 +941,34 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(!schedstat_val(stats->wait_start))) return; - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start); - - if (entity_is_task(se)) { + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. 
-*/ - __schedstat_set(stats->wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - __schedstat_set(stats->wait_max, - max(schedstat_val(stats->wait_max), delta)); - __schedstat_inc(stats->wait_count); - __schedstat_add(stats->wait_sum, delta); - __schedstat_set(stats->wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *tsk = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; __schedstats_from_sched_entity(se, ); - sleep_start = schedstat_val(stats->sleep_start); - block_start = schedstat_val(stats->block_start); - i
[PATCH v2 2/6] sched: make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we should make it independent of fair sched class. The struct sched_statistics is the schedular statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. After the patch, schestats are orgnized as follows, struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; The sched_statistics members may be frequently modified when schedstats is enabled, in order to avoid impacting on random data which may in the same cacheline with them, the struct sched_statistics is defined as cacheline aligned. As this patch changes the core struct of scheduler, so I verified the performance it may impact on the scheduler with 'perf bench sched pipe', suggested by Mel. Below is the result, in which all the values are in usecs/op. Before After kernel.sched_schedstats=0 6.0~6.16.0~6.1 kernel.sched_schedstats=1 6.2~6.46.2~6.4 No obvious impact on the sched performance. No functional change. 
[l...@intel.com: reported build failure in prev version] Signed-off-by: Yafang Shao Acked-by: Mel Gorman Cc: kernel test robot --- include/linux/sched.h| 5 +- kernel/sched/core.c | 24 kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 86 ++-- kernel/sched/fair.c | 121 --- kernel/sched/rt.c| 4 +- kernel/sched/sched.h | 3 + kernel/sched/stats.h | 55 ++ kernel/sched/stop_task.c | 4 +- 9 files changed, 210 insertions(+), 96 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 05572e2140ad..b687bb38897b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -447,7 +447,7 @@ struct sched_statistics { u64 nr_wakeups_passive; u64 nr_wakeups_idle; #endif -}; +} cacheline_aligned; struct sched_entity { /* For load-balancing: */ @@ -463,8 +463,6 @@ struct sched_entity { u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -697,6 +695,7 @@ struct task_struct { unsigned intrt_priority; const struct sched_class*sched_class; + struct sched_statistics stats; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3384ea74cad4..d55681b4f9a4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2913,11 +2913,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->stats.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -2929,14 +2929,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + 
__schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->stats.nr_wakeups_sync); } /* @@ -3572,7 +3572,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be garbage */ - memset(>se.statistics, 0, sizeof(p->se.statistics)); + memset(>stats, 0, sizeof(p->st
[PATCH v2 0/6] sched: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. In order to support if for other sched classes, we should make it independent of fair sched class. The struct sched_statistics is the schedular statistics of a task_struct or a task_group, both of which are independent of sched class. So we can move struct sched_statistics into struct task_struct and struct task_group to achieve the goal. After the patchset, schestats are orgnized as follows, struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; The sched_statistics members may be frequently modified when schedstats is enabled, in order to avoid impacting on random data which may in the same cacheline with them, the struct sched_statistics is defined as cacheline aligned. Then we can use schedstats to trace RT tasks as well, for example, Interface File task schedstats : /proc/[pid]/sched group schedstats: /proc/sched_debug tracepoints : sched:sched_stat_{runtime, wait, sleep, iowait, blocked} As PATCH #2 and #3 changes the core struct in the scheduler, so I did 'perf bench sched pipe' to measure the sched performance before and after the change, suggested by Mel. Below is the data, which are all in usecs/op. Before After kernel.sched_schedstats=0 6.0~6.16.0~6.1 kernel.sched_schedstats=1 6.2~6.46.2~6.4 No obvious difference after the change. Changes since v1: - Fix the build failure reported by kernel test robot. - Add the performance data with 'perf bench sched pipe', suggested by Mel. - Make the struct sched_statistics cacheline aligned. 
- Introduce task block time in schedstats Changes since RFC: - improvement of schedstats helpers, per Mel. - make struct schedstats independent of fair sched class Yafang Shao (6): sched, fair: use __schedstat_set() in set_next_entity() sched: make struct sched_statistics independent of fair sched class sched: make schedstats helpers independent of fair sched class sched: introduce task block time in schedstats sched, rt: support sched_stat_runtime tracepoint for RT sched class sched, rt: support schedstats for RT sched class include/linux/sched.h| 7 +- kernel/sched/core.c | 24 +++-- kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 90 + kernel/sched/fair.c | 210 --- kernel/sched/rt.c| 143 +- kernel/sched/sched.h | 3 + kernel/sched/stats.c | 104 +++ kernel/sched/stats.h | 89 + kernel/sched/stop_task.c | 4 +- 10 files changed, 489 insertions(+), 189 deletions(-) -- 2.18.2
[PATCH v2 1/6] sched, fair: use __schedstat_set() in set_next_entity()
schedstat_enabled() has already been checked, so we can use __schedstat_set() directly. Signed-off-by: Yafang Shao Acked-by: Mel Gorman --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aaa0dfa29d53..114eec730698 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4429,7 +4429,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, + __schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -- 2.18.2
Re: [PATCH v6 resend 0/3] mm, vsprintf: dump full information of page flags in pGp
On Fri, Mar 19, 2021 at 6:13 PM Yafang Shao wrote: > > The existed pGp shows the names of page flags only, rather than the full > information including section, node, zone, last cpuipid and kasan tag. > While it is not easy to parse these information manually because there > are so many flavors. We'd better interpret them in printf. > > To be compitable with the existed format of pGp, the new introduced ones > also use '|' as the separator, then the user tools parsing pGp won't > need to make change, suggested by Matthew. The new added information is > tracked onto the end of the existed one, e.g. > [ 8838.835456] Slab 0x2828b78a objects=33 used=3 > fp=0xd04efc88 > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > The documentation and test cases are also updated. The result of the > test cases as follows, > [68599.816764] test_printf: loaded. > [68599.819068] test_printf: all 388 tests passed > [68599.830367] test_printf: unloaded. > > This patchset also includes some code cleanup in mm/slub.c. 
> > v6: > - fixes the build failure and test failure reported by kernel test robot > > v5: > - remove the bitmap and better name the struct, per Petr > > v4: > - extend %pGp instead of introducing new format, per Matthew > > v3: > - coding improvement, per Joe and Andy > - the possible impact on debugfs and the fix of it, per Joe and Matthew > - introduce new format instead of changing pGp, per Andy > > v2: > - various coding improvement, per Joe, Miaohe, Vlastimil and Andy > - remove the prefix completely in patch #2, per Vlastimil > - Update the test cases, per Andy > > Yafang Shao (3): > mm, slub: use pGp to print page flags > mm, slub: don't combine pr_err with INFO > vsprintf: dump full information of page flags in pGp > > Documentation/core-api/printk-formats.rst | 2 +- > lib/test_printf.c | 90 --- > lib/vsprintf.c| 66 +++-- > mm/slub.c | 13 ++-- > 4 files changed, 149 insertions(+), 22 deletions(-) > > -- > 2.18.2 > Hi Petr, Any comments on this version ? Thanks Yafang
[PATCH v6 resend 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Signed-off-by: Yafang Shao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko Reviewed-by: David Hildenbrand Acked-by: David Rientjes Cc: Matthew Wilcox --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ed3f728c1367..7ed388077633 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -624,7 +624,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -650,7 +650,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -707,7 +707,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -800,7 +800,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3899,7 +3899,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.18.2
[PATCH v6 resend 0/3] mm, vsprintf: dump full information of page flags in pGp
The existed pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors. We'd better interpret them in printf. To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The new added information is tracked onto the end of the existed one, e.g. [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The result of the test cases as follows, [68599.816764] test_printf: loaded. [68599.819068] test_printf: all 388 tests passed [68599.830367] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. v6: - fixes the build failure and test failure reported by kernel test robot v5: - remove the bitmap and better name the struct, per Petr v4: - extend %pGp instead of introducing new format, per Matthew v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 90 --- lib/vsprintf.c| 66 +++-- mm/slub.c | 13 ++-- 4 files changed, 149 insertions(+), 22 deletions(-) -- 2.18.2
[PATCH v6 resend 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, let's interpret them in pGp as well. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The new information is appended to the end of the existing one. One example of the output in mm/slub.c is as follows, - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 8448.272530] Slab 0x90797883 objects=33 used=3 fp=0x790f1c26 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The output of the test cases is as follows, [68599.816764] test_printf: loaded. [68599.819068] test_printf: all 388 tests passed [68599.830367] test_printf: unloaded. 
[l...@intel.com: reported issues in the prev version in test_printf.c] Signed-off-by: Yafang Shao Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Petr Mladek Cc: kernel test robot --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 90 --- lib/vsprintf.c| 66 +++-- 3 files changed, 142 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 160e710d992f..00d07c7eefd4 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -540,7 +540,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGp referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 95a2f82427c7..27b964ec723d 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -577,24 +577,98 @@ netdev_features(void) { } +struct page_flags_test { + int width; + int shift; + int mask; + unsigned long value; + const char *fmt; + const char *name; +}; + +static struct page_flags_test pft[] = { + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, +0, "%d", "section"}, + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, +0, "%d", "node"}, + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, +0, "%d", "zone"}, + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, +0, "%#x", "lastcpupid"}, + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, +0, "%#x", "kasantag"}, +}; + +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long values[] = {section, node, zone, last_cpupid, kasan_tag}; + unsigned long page_flags = 0; + unsigned long size = 0; + bool append = false; + int i; + + flags &= BIT(NR_PAGEFLAGS) - 1; + if 
(flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s", name); + size = strlen(cmp_buf); +#if SECTIONS_WIDTH || NODES_WIDTH || ZONES_WIDTH || \ + LAST_CPUPID_WIDTH || KASAN_TAG_WIDTH + /* Other information also included in page flags */ + snprintf(cmp_buf + size, BUF_SIZE - size, "|"); + size = strlen(cmp_buf); +#endif + } + + /* Set the test value */ + for (i = 0; i < ARRAY_SIZE(pft); i++) + pft[i].value = values[i]; + + for (i = 0; i < ARRAY_SIZE(pft); i++) { + if (!pft[i].width) + continue; + + if (append) { + snprintf(cmp_buf + size, BUF_SIZE - size, "|"); + size = strlen(cmp_buf); + } + + page_flags |= (pft[i].value & pft[i].mask) << pft[i].shift; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s=", pft[i].name); + size = strlen(cmp_buf); + snprintf(cmp_buf + size, BUF_SIZE - size, pft[i].fmt, +pft[i].value & pft[i].mask); + size = strlen(cmp_buf); + append = true; + } + + test(cmp_buf, "%pGp", _flags); +} + static void __init flags(void) { unsigned long flags; - gfp_t gfp; char *cmp
[PATCH v6 resend 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3021ce9bf1b3..ed3f728c1367 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -650,8 +650,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.18.2
Re: linux-next: Tree for Mar 10 (lib/test_printf.c)
On Wed, Mar 10, 2021 at 5:52 PM Petr Mladek wrote: > > On Tue 2021-03-09 21:57:48, Randy Dunlap wrote: > > On 3/9/21 8:02 PM, Stephen Rothwell wrote: > > > Hi all, > > > > > > > on i386 (at least): > > > > ../lib/test_printf.c: In function 'page_flags_test': > > ../lib/test_printf.c:595:17: error: 'sec' undeclared (first use in this > > function); did you mean 'sem'? > > page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; > > ^~~ > > > > > > Should that be 'section'? > > Yup, it looks like. > > There seems to be one more problem found by the test robot: > >lib/test_printf.c:595:17: note: each undeclared identifier is reported > only once for each function it appears in > >> lib/test_printf.c:612:17: error: 'tag' undeclared (first use in this > >> function) > 612 | page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; > > > Yafang is going to send a fix. I have temporary removed the > problematic patch from printk/linux.git, for-next branch. > > I am sorry for the troubles. Anyway, it is great that linux-next > and all the test robots are around. > Hi Petr, I sent a new version v6[1] to fix the test failure and build failure report by kernel test robot. Sorry for the late fix as I'm very busy recently. [1]. https://lore.kernel.org/linux-mm/20210314083717.96380-1-laoar.s...@gmail.com/T/#t -- Thanks Yafang
[PATCH v6 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Signed-off-by: Yafang Shao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko Reviewed-by: David Hildenbrand Cc: Matthew Wilcox --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ed3f728c1367..7ed388077633 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -624,7 +624,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -650,7 +650,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -707,7 +707,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -800,7 +800,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3899,7 +3899,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.18.2
[PATCH v6 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3021ce9bf1b3..ed3f728c1367 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -650,8 +650,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.18.2
[PATCH v6 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, let's interpret them in pGp as well. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The new information is appended to the end of the existing one. One example of the output in mm/slub.c is as follows, - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 8448.272530] Slab 0x90797883 objects=33 used=3 fp=0x790f1c26 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The output of the test cases is as follows, [68599.816764] test_printf: loaded. [68599.819068] test_printf: all 388 tests passed [68599.830367] test_printf: unloaded. 
[l...@intel.com: reported issues in the prev version in test_printf.c] Signed-off-by: Yafang Shao Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Petr Mladek Cc: kernel test robot --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 90 --- lib/vsprintf.c| 66 +++-- 3 files changed, 142 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 160e710d992f..00d07c7eefd4 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -540,7 +540,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGp referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 95a2f82427c7..f87e433d6fa9 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -577,24 +577,98 @@ netdev_features(void) { } +struct page_flags_test { + int width; + int shift; + int mask; + unsigned long value; + const char *fmt; + const char *name; +}; + +static struct page_flags_test pft[] = { + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, +0, "%d", "section"}, + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, +0, "%d", "node"}, + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, +0, "%d", "zone"}, + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, +0, "%#x", "lastcpupid"}, + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, +0, "%#x", "kasantag"}, +}; + +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long values[] = {section, node, zone, last_cpupid, kasan_tag}; + unsigned long page_flags = 0; + unsigned long size = 0; + bool append = false; + int i; + + flags &= BIT(NR_PAGEFLAGS) - 1; + if 
(flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s", name); + size = strlen(cmp_buf); +#if SECTIONS_WIDTH || NODES_WIDTH || ZONES_WIDTH || \ + LAST_CPUPID_WIDTH || KASAN_TAG_WIDTH + /* Other information also included in page flags */ + snprintf(cmp_buf + size, BUF_SIZE - size, "|"); + size = strlen(cmp_buf); +#endif + } + + /* Set the test value */ + for (i = 0; i < ARRAY_SIZE(pft); i++) + pft[i].value = values[i]; + + for (i = 0; i < ARRAY_SIZE(pft); i++) { + if (!pft[i].width) + continue; + + if (append) { + snprintf(cmp_buf + size, BUF_SIZE - size, "|"); + size = strlen(cmp_buf); + } + + page_flags |= (pft[i].value & pft[i].mask) << pft[i].shift; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s=", pft[i].name); + size = strlen(cmp_buf); + snprintf(cmp_buf + size, BUF_SIZE - size, pft[i].fmt, +pft[i].value & pft[i].mask); + size = strlen(cmp_buf); + append = true; + } + + test(cmp_buf, "%pGp", _flags); +} + static void __init flags(void) { unsigned long flags; - gfp_t gfp; char *cmp
[PATCH v6 0/3] mm, vsprintf: dump full information of page flags in pGp
The existing pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, we'd better interpret them in printf. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The newly added information is appended to the end of the existing one, e.g. [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The result of the test cases is as follows, [68599.816764] test_printf: loaded. [68599.819068] test_printf: all 388 tests passed [68599.830367] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. v6: - fixes the build failure and test failure reported by kernel test robot v5: - remove the bitmap and better name the struct, per Petr v4: - extend %pGp instead of introducing new format, per Matthew v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 90 --- lib/vsprintf.c| 66 +++-- mm/slub.c | 13 ++-- 4 files changed, 149 insertions(+), 22 deletions(-) -- 2.18.2
[PATCH 1/1] vsprintf: dump full information of page flags in pGp fix
The name of the flag should be printed using default_str_spec. There's no difference in the output after this change because the string is printed as-is with both default_dec_spec and default_flag_spec. This patch is a followup of the patchset "mm, vsprintf: dump full information of page flags in pGp" [1] [1]. https://lore.kernel.org/linux-mm/20210215155141.47432-1-laoar.s...@gmail.com/ Signed-off-by: Yafang Shao Cc: Petr Mladek Cc: Matthew Wilcox Cc: Andy Shevchenko Cc: Vlastimil Babka Cc: Miaohe Lin Cc: Joe Perches Cc: David Hildenbrand --- lib/vsprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 533ac5404180..5d034e799c06 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1963,7 +1963,7 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) buf++; } - buf = string(buf, end, p->name, *p->spec); + buf = string(buf, end, p->name, default_str_spec); if (buf < end) *buf = '='; buf++; -- 2.17.1
Re: [PATCH v5 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 22, 2021 at 8:38 PM Petr Mladek wrote: > > Hello, > > first, I am sorry for the late reply. I have marked the thread as > proceed by mistake last week... > > > On Mon 2021-02-15 23:51:41, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > To be compitable with the existed format of pGp, the new introduced ones > > also use '|' as the separator, then the user tools parsing pGp won't > > need to make change, suggested by Matthew. The new information is > > tracked onto the end of the existed one. > > > > One example of the output in mm/slub.c as follows, > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 8448.272530] Slab 0x90797883 objects=33 used=3 > > fp=0x790f1c26 > > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > > > The documentation and test cases are also updated. The output of the > > test cases as follows, > > [11585.830272] test_printf: loaded. > > [11585.830454] test_printf: all 388 tests passed > > [11585.831401] test_printf: unloaded. > > > > --- a/lib/vsprintf.c > > +++ b/lib/vsprintf.c > > +static > > +char *format_page_flags(char *buf, char *end, unsigned long flags) > > +{ > > + unsigned long main_flags = flags & (BIT(NR_PAGEFLAGS) - 1); > > + bool append = false; > > + int i; > > + > > + /* Page flags from the main area. */ > > + if (main_flags) { > > + buf = format_flags(buf, end, main_flags, pageflag_names); > > + append = true; > > + } > > + > > + /* Page flags from the fields area */ > > + for (i = 0; i < ARRAY_SIZE(pff); i++) { > > + /* Skip undefined fields. 
*/ > > + if (!pff[i].width) > > + continue; > > + > > + /* Format: Flag Name + '=' (equals sign) + Number + '|' > > (separator) */ > > + if (append) { > > + if (buf < end) > > + *buf = '|'; > > + buf++; > > + } > > + > > + buf = string(buf, end, pff[i].name, *pff[i].spec); > > I have found one more small issue. > > The purpose of the flag-specific printk_spec is to define the format > how the value is printed. The name of the flag should be printed > using default_str_spec. > > It works because the string is printed as-is with both > default_dec_spec and default_flag_spec. But it would be better > to use the string format. > Thanks for the explanation. > > + if (buf < end) > > + *buf = '='; > > + buf++; > > + buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask, > > + *pff[i].spec); > > + > > + append = true; > > + } > > + > > + return buf; > > +} > > Otherwise, the patch looks to me. The issue is cosmetic and might be > fixed either by re-spinning just this patch or by a followup patch. I will send a separate followup patch. > Either way, feel free to use: > > Reviewed-by: Petr Mladek > Thanks > Another question where to push this change. It is pity the we > finalized it in the middle of the merge window. It has to spend > at least few days in linux-next. > > I would like to hear from Andy before I push it into linux-next. > There is still theoretical chance to get it into 5.12 when Linus > prolongs the merge window by one week. it has been delayed by > a long lasting power outage. > > Best Regards, > Petr -- Thanks Yafang
Re: [PATCH v5 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 15, 2021 at 11:53 PM Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > the full information including section, node, zone, last cpupid and > kasan tag. While it is not easy to parse these information manually > because there're so many flavors. Let's interpret them in pGp as well. > > To be compitable with the existed format of pGp, the new introduced ones > also use '|' as the separator, then the user tools parsing pGp won't > need to make change, suggested by Matthew. The new information is > tracked onto the end of the existed one. > > One example of the output in mm/slub.c as follows, > - Before the patch, > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > - After the patch, > [ 8448.272530] Slab 0x90797883 objects=33 used=3 > fp=0x790f1c26 > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > The documentation and test cases are also updated. The output of the > test cases as follows, > [11585.830272] test_printf: loaded. > [11585.830454] test_printf: all 388 tests passed > [11585.831401] test_printf: unloaded. 
> > Signed-off-by: Yafang Shao > Cc: David Hildenbrand > Cc: Joe Perches > Cc: Miaohe Lin > Cc: Vlastimil Babka > Cc: Andy Shevchenko > Cc: Matthew Wilcox > Cc: Petr Mladek > --- > Documentation/core-api/printk-formats.rst | 2 +- > lib/test_printf.c | 60 + > lib/vsprintf.c| 66 +-- > 3 files changed, 112 insertions(+), 16 deletions(-) > > diff --git a/Documentation/core-api/printk-formats.rst > b/Documentation/core-api/printk-formats.rst > index 160e710d992f..00d07c7eefd4 100644 > --- a/Documentation/core-api/printk-formats.rst > +++ b/Documentation/core-api/printk-formats.rst > @@ -540,7 +540,7 @@ Flags bitfields such as page flags, gfp_flags > > :: > > - %pGpreferenced|uptodate|lru|active|private > + %pGp > referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f > %pGgGFP_USER|GFP_DMA32|GFP_NOWARN > %pGvread|exec|mayread|maywrite|mayexec|denywrite > > diff --git a/lib/test_printf.c b/lib/test_printf.c > index 7ac87f18a10f..148773dfe97a 100644 > --- a/lib/test_printf.c > +++ b/lib/test_printf.c > @@ -569,24 +569,68 @@ netdev_features(void) > { > } > > +static void __init > +page_flags_test(int section, int node, int zone, int last_cpupid, > + int kasan_tag, int flags, const char *name, char *cmp_buf) > +{ > + unsigned long page_flags = 0; > + unsigned long size = 0; > + > + flags &= BIT(NR_PAGEFLAGS) - 1; > + if (flags) { > + page_flags |= flags; > + snprintf(cmp_buf + size, BUF_SIZE - size, "%s|", name); > + size = strlen(cmp_buf); > + } > + > +#ifdef SECTION_IN_PAGE_FLAGS > + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; > + snprintf(cmp_buf + size, BUF_SIZE - size, "section=%#x|", sec); > + size = strlen(cmp_buf); > +#endif > + > + page_flags |= ((node & NODES_MASK) << NODES_PGSHIFT) | > + ((zone & ZONES_MASK) << ZONES_PGSHIFT); > + snprintf(cmp_buf + size, BUF_SIZE - size, "node=%d|zone=%d", node, > zone); > + size = strlen(cmp_buf); > + > +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS > + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << 
LAST_CPUPID_PGSHIFT; > + snprintf(cmp_buf + size, BUF_SIZE - size, "|lastcpupid=%#x", > last_cpupid); > + size = strlen(cmp_buf); > +#endif > + > +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) > + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; > + snprintf(cmp_buf + size, BUF_SIZE - size, "|kasantag=%#x", tag); > + size = strlen(cmp_buf); > +#endif > + > + test(cmp_buf, "%pGp", _flags); > +} > + > static void __init > flags(void) > { > unsigned long flags; > - gfp_t gfp; > char *cmp_buffer; > + gfp_t gfp; > + > + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); > + if (!cmp_buffer) > + return; > > flags = 0; > - test("", "%pGp", ); > + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); > > - /* Page flags should filter the zone id */ > flags = 1UL << NR_PAGEFLAGS; > - te
[PATCH v5 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Signed-off-by: Yafang Shao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko Reviewed-by: David Hildenbrand Cc: Matthew Wilcox --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ddd429832577..d66a7ebd21fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -615,7 +615,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -641,7 +641,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -698,7 +698,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -791,7 +791,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3855,7 +3855,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.18.2
[PATCH v5 0/3] mm, vsprintf: dump full information of page flags in pGp
The existing pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, we'd better interpret them in printf. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The newly added information is appended to the end of the existing one, e.g. [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The result of the test cases is as follows, [11585.830272] test_printf: loaded. [11585.830454] test_printf: all 388 tests passed [11585.831401] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. v5: - remove the bitmap and better name the struct, per Petr v4: - extend %pGp instead of introducing new format, per Matthew v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 60 + lib/vsprintf.c| 66 +-- mm/slub.c | 13 ++--- 4 files changed, 119 insertions(+), 22 deletions(-) -- 2.18.2
[PATCH v5 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, let's interpret them in pGp as well. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The new information is appended to the end of the existing one. One example of the output in mm/slub.c is as follows, - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 8448.272530] Slab 0x90797883 objects=33 used=3 fp=0x790f1c26 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The output of the test cases is as follows, [11585.830272] test_printf: loaded. [11585.830454] test_printf: all 388 tests passed [11585.831401] test_printf: unloaded. 
Signed-off-by: Yafang Shao Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Petr Mladek --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 60 + lib/vsprintf.c| 66 +-- 3 files changed, 112 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 160e710d992f..00d07c7eefd4 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -540,7 +540,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGp referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 7ac87f18a10f..148773dfe97a 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -569,24 +569,68 @@ netdev_features(void) { } +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long page_flags = 0; + unsigned long size = 0; + + flags &= BIT(NR_PAGEFLAGS) - 1; + if (flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s|", name); + size = strlen(cmp_buf); + } + +#ifdef SECTION_IN_PAGE_FLAGS + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "section=%#x|", sec); + size = strlen(cmp_buf); +#endif + + page_flags |= ((node & NODES_MASK) << NODES_PGSHIFT) | + ((zone & ZONES_MASK) << ZONES_PGSHIFT); + snprintf(cmp_buf + size, BUF_SIZE - size, "node=%d|zone=%d", node, zone); + size = strlen(cmp_buf); + +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|lastcpupid=%#x", last_cpupid); + size = strlen(cmp_buf); +#endif + +#if 
defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|kasantag=%#x", tag); + size = strlen(cmp_buf); +#endif + + test(cmp_buf, "%pGp", _flags); +} + static void __init flags(void) { unsigned long flags; - gfp_t gfp; char *cmp_buffer; + gfp_t gfp; + + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); + if (!cmp_buffer) + return; flags = 0; - test("", "%pGp", ); + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); - /* Page flags should filter the zone id */ flags = 1UL << NR_PAGEFLAGS; - test("", "%pGp", ); + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru | 1UL << PG_active | 1UL << PG_swapbacked; - test("uptodate|dirty|lru|active|swapbacked", "%pGp", ); - + page_flags_test(1, 1, 1, 0x1f, 1, flags, + "uptodate|dirty|lru|active|swapbacked", + cmp_buffer); flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_DENYWRITE; @@ -601,10 +645,6 @@ flags(void) gfp = __GF
[PATCH v5 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 7ecbbbe5bc0c..ddd429832577 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -641,8 +641,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.18.2
Re: [PATCH v4 3/3] vsprintf: dump full information of page flags in pGp
On Wed, Feb 10, 2021 at 8:51 PM Petr Mladek wrote: > > On Wed 2021-02-10 00:21:37, Yafang Shao wrote: > > On Tue, Feb 9, 2021 at 9:53 PM Petr Mladek wrote: > > > > > > On Tue 2021-02-09 18:56:13, Yafang Shao wrote: > > > > Currently the pGp only shows the names of page flags, rather than > > > > the full information including section, node, zone, last cpupid and > > > > kasan tag. While it is not easy to parse these information manually > > > > because there're so many flavors. Let's interpret them in pGp as well. > > > > > > > > To be compitable with the existed format of pGp, the new introduced ones > > > > also use '|' as the separator, then the user tools parsing pGp won't > > > > need to make change, suggested by Matthew. The new information is > > > > tracked onto the end of the existed one. > > > > > > > > On example of the output in mm/slub.c as follows, > > > > - Before the patch, > > > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > > > > > - After the patch, > > > > [ 8838.835456] Slab 0x2828b78a objects=33 used=3 > > > > fp=0xd04efc88 > > > > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > > > > > > > The documentation and test cases are also updated. The output of the > > > > test cases as follows, > > > > [ 501.485081] test_printf: loaded. > > > > [ 501.485768] test_printf: all 388 tests passed > > > > [ 501.488762] test_printf: unloaded. 
> > > > > > > > > > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > > > > index 14c9a6af1b23..3f26611adb34 100644 > > > > --- a/lib/vsprintf.c > > > > +++ b/lib/vsprintf.c > > > > @@ -1916,6 +1916,66 @@ char *format_flags(char *buf, char *end, > > > > unsigned long flags, > > > > return buf; > > > > } > > > > > > > > +struct page_flags_layout { > > > > + int width; > > > > + int shift; > > > > + int mask; > > > > + const struct printf_spec *spec; > > > > + const char *name; > > > > +}; > > > > + > > > > +static const struct page_flags_layout pfl[] = { > > > > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, > > > > + _dec_spec, "section"}, > > > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, > > > > + _dec_spec, "node"}, > > > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, > > > > + _dec_spec, "zone"}, > > > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > > > + _flag_spec, "lastcpupid"}, > > > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, > > > > + _flag_spec, "kasantag"}, > > > > +}; > > > > + > > > > +static > > > > +char *format_page_flags(char *buf, char *end, unsigned long flags) > > > > +{ > > > > + DECLARE_BITMAP(mask, ARRAY_SIZE(pfl)); > > > > + unsigned long last; > > > > + int i; > > > > + > > > > + if (flags & (BIT(NR_PAGEFLAGS) - 1)) { > > > > + if (buf < end) > > > > + *buf = '|'; > > > > + buf++; > > > > + } > > > > > > This is far from obvious. You print '|' here because you printed > > > something somewhere else. See below. > > > > > > > + > > > > + for (i = 0; i < ARRAY_SIZE(pfl); i++) > > > > + __assign_bit(i, mask, pfl[i].width); > > > > > > The bitmap looks like an overkill. If I get it correctly, it is a > > > tricky way to handle only flags defined by the used build > > > configuration. See below. 
> > > > > > > + last = find_last_bit(mask, ARRAY_SIZE(pfl)); > > > > + > > > > + for_each_set_bit(i, mask, ARRAY_SIZE(pfl)) { > > > > + /* Format: Flag Name + '=' (equals sign) + Number + '|' > > > > (separator) */ > > > > + buf = string(buf, end, pfl[i].name, *pfl[i].spec); > > > > + > > > > + if (buf < end) > > > > + *buf = '='; > > >
Re: [PATCH v4 3/3] vsprintf: dump full information of page flags in pGp
On Tue, Feb 9, 2021 at 9:53 PM Petr Mladek wrote: > > On Tue 2021-02-09 18:56:13, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > To be compitable with the existed format of pGp, the new introduced ones > > also use '|' as the separator, then the user tools parsing pGp won't > > need to make change, suggested by Matthew. The new information is > > tracked onto the end of the existed one. > > > > On example of the output in mm/slub.c as follows, > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 8838.835456] Slab 0x2828b78a objects=33 used=3 > > fp=0xd04efc88 > > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > > > The documentation and test cases are also updated. The output of the > > test cases as follows, > > [ 501.485081] test_printf: loaded. > > [ 501.485768] test_printf: all 388 tests passed > > [ 501.488762] test_printf: unloaded. 
> > > > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > > index 14c9a6af1b23..3f26611adb34 100644 > > --- a/lib/vsprintf.c > > +++ b/lib/vsprintf.c > > @@ -1916,6 +1916,66 @@ char *format_flags(char *buf, char *end, unsigned > > long flags, > > return buf; > > } > > > > +struct page_flags_layout { > > + int width; > > + int shift; > > + int mask; > > + const struct printf_spec *spec; > > + const char *name; > > +}; > > + > > +static const struct page_flags_layout pfl[] = { > > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, > > + _dec_spec, "section"}, > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, > > + _dec_spec, "node"}, > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, > > + _dec_spec, "zone"}, > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > + _flag_spec, "lastcpupid"}, > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, > > + _flag_spec, "kasantag"}, > > +}; > > + > > +static > > +char *format_page_flags(char *buf, char *end, unsigned long flags) > > +{ > > + DECLARE_BITMAP(mask, ARRAY_SIZE(pfl)); > > + unsigned long last; > > + int i; > > + > > + if (flags & (BIT(NR_PAGEFLAGS) - 1)) { > > + if (buf < end) > > + *buf = '|'; > > + buf++; > > + } > > This is far from obvious. You print '|' here because you printed > something somewhere else. See below. > > > + > > + for (i = 0; i < ARRAY_SIZE(pfl); i++) > > + __assign_bit(i, mask, pfl[i].width); > > The bitmap looks like an overkill. If I get it correctly, it is a > tricky way to handle only flags defined by the used build > configuration. See below. 
> > > + last = find_last_bit(mask, ARRAY_SIZE(pfl)); > > + > > + for_each_set_bit(i, mask, ARRAY_SIZE(pfl)) { > > + /* Format: Flag Name + '=' (equals sign) + Number + '|' > > (separator) */ > > + buf = string(buf, end, pfl[i].name, *pfl[i].spec); > > + > > + if (buf < end) > > + *buf = '='; > > + buf++; > > + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, > > + *pfl[i].spec); > > + > > + /* No separator for the last entry */ > > + if (i != last) { > > + if (buf < end) > > + *buf = '|'; > > + buf++; > > + } > > + } > > + > > + return buf; > > +} > > + > > static noinline_for_stack > > char *flags_string(char *buf, char *end, void *flags_ptr, > > struct printf_spec spec, const char *fmt) > > @@ -1929,10 +1989,10 @@ char *flags_string(char *buf, char *end, void > > *flags_ptr, > > switch (fmt[1]) { > > case 'p': > > flags = *(unsigned long *)flags_ptr; > > - /* Remove zone id */ > > - flags &= (1UL << NR_PAGEFLAGS)
[PATCH v4 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there're so many flavors. Let's interpret them in pGp as well. To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The new information is tracked onto the end of the existed one. One example of the output in mm/slub.c as follows, - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The output of the test cases as follows, [ 501.485081] test_printf: loaded. [ 501.485768] test_printf: all 388 tests passed [ 501.488762] test_printf: unloaded. 
Signed-off-by: Yafang Shao Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 60 + lib/vsprintf.c| 66 +-- 3 files changed, 114 insertions(+), 14 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 6d26c5c6ac48..93c3e48ff30d 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -538,7 +538,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGp referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 7ac87f18a10f..148773dfe97a 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -569,24 +569,68 @@ netdev_features(void) { } +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long page_flags = 0; + unsigned long size = 0; + + flags &= BIT(NR_PAGEFLAGS) - 1; + if (flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s|", name); + size = strlen(cmp_buf); + } + +#ifdef SECTION_IN_PAGE_FLAGS + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "section=%#x|", sec); + size = strlen(cmp_buf); +#endif + + page_flags |= ((node & NODES_MASK) << NODES_PGSHIFT) | + ((zone & ZONES_MASK) << ZONES_PGSHIFT); + snprintf(cmp_buf + size, BUF_SIZE - size, "node=%d|zone=%d", node, zone); + size = strlen(cmp_buf); + +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|lastcpupid=%#x", last_cpupid); + size = strlen(cmp_buf); +#endif + +#if 
defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|kasantag=%#x", tag); + size = strlen(cmp_buf); +#endif + + test(cmp_buf, "%pGp", _flags); +} + static void __init flags(void) { unsigned long flags; - gfp_t gfp; char *cmp_buffer; + gfp_t gfp; + + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); + if (!cmp_buffer) + return; flags = 0; - test("", "%pGp", ); + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); - /* Page flags should filter the zone id */ flags = 1UL << NR_PAGEFLAGS; - test("", "%pGp", ); + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru | 1UL << PG_active | 1UL << PG_swapbacked; - test("uptodate|dirty|lru|active|swapbacked", "%pGp", ); - + page_flags_test(1, 1, 1, 0x1f, 1, flags, + "uptodate|dirty|lru|active|swapbacked", + cmp_buffer); flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_DENYWRITE; @@ -601,10 +645,6 @@ flags(void) gfp = __GFP_ATOMIC; t
[PATCH v4 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Signed-off-by: Yafang Shao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Matthew Wilcox --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 87ff086e68a4..2514c37ab4e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -612,7 +612,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -638,7 +638,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -695,7 +695,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -788,7 +788,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3854,7 +3854,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.17.1
[PATCH v4 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 34dcc09e2ec9..87ff086e68a4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -638,8 +638,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.17.1
[PATCH v4 0/3] mm, vsprintf: dump full information of page flags in pGp
The existed pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there are so many flavors. We'd better interpret them in printf. To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The new added information is tracked onto the end of the existed one, e.g. [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The result of the test cases as follows, [ 501.485081] test_printf: loaded. [ 501.485768] test_printf: all 388 tests passed [ 501.488762] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. v4: - extend %pGp instead of introducing new format, per Matthew v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 60 + lib/vsprintf.c| 66 +-- mm/slub.c | 13 ++--- 4 files changed, 121 insertions(+), 20 deletions(-) -- 2.17.1
Re: [PATCH v3 0/3] mm, vsprintf: introduce new format to dump full information of page flags
On Mon, Feb 8, 2021 at 8:20 PM Matthew Wilcox wrote: > > On Mon, Feb 08, 2021 at 06:14:36PM +0800, Yafang Shao wrote: > > To avoid breaking some tools which parsing pGp via debugfs or affecting > > the printf buffer, other new formats are introduced, so the user can choose > > what and in which order they want, suggested by Andy. These new introduced > > format as follows, > > pGpb: print other information first and then the names of page flags > > pGpl: print the names of page flags first and then the other info > > This is overengineering things. We already print in little-endian order, > and the other information should be tacked onto the end. Just extend > %pGp. Andy's suggestion to add another flag was a bad one. > All right. I will do it in the next version. -- Thanks Yafang
[PATCH v3 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Signed-off-by: Yafang Shao --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 34dcc09e2ec9..87ff086e68a4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -638,8 +638,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.17.1
[PATCH v3 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Cc: David Hildenbrand Signed-off-by: Yafang Shao --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 87ff086e68a4..2514c37ab4e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -612,7 +612,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -638,7 +638,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -695,7 +695,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -788,7 +788,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3854,7 +3854,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.17.1
[PATCH v3 0/3] mm, vsprintf: introduce new format to dump full information of page flags
The existed pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there are so many flavors. We'd better interpret them in printf. To avoid breaking some tools which parsing pGp via debugfs or affecting the printf buffer, other new formats are introduced, so the user can choose what and in which order they want, suggested by Andy. These new introduced format as follows, pGpb: print other information first and then the names of page flags pGpl: print the names of page flags first and then the other info The difference between them looks like the difference between big-endian and little-endian, that's why they are named like that. The examples of the output as follows, %pGpb 0x17c0010200(node=0|zone=2|lastcpupid=0x1f|slab|head) %pGpl 0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The doc and test cases are also updated. Below is the output of the test cases, [ 4299.847655] test_printf: loaded. [ 4299.848301] test_printf: all 404 tests passed [ 4299.850371] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. 
v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: introduce new format to dump full information of page flags Documentation/core-api/printk-formats.rst | 2 + lib/test_printf.c | 126 +++--- lib/vsprintf.c| 115 +++- mm/slub.c | 13 +-- 4 files changed, 233 insertions(+), 23 deletions(-) -- 2.17.1
[PATCH v3 3/3] vsprintf: introduce new format to dump full information of page flags
The existed pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there are so many flavors. We'd better interpret them in printf. To avoid breaking some tools which parsing pGp via debugfs or affecting the printf buffer, other new formats are introduced, so the user can choose what and in which order they want, suggested by Andy. These new introduced format as follows, pGpb: print other information first and then the names of page flags pGpl: print the names of page flags first and then the other info The difference between them looks like the difference between big-endian and little-endian, that's why they are named like that. The examples of the output as follows, %pGpb 0x17c0010200(node=0|zone=2|lastcpupid=0x1f|slab|head) %pGpl 0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The doc and test cases are also updated. Below is the output of the test cases, [ 4299.847655] test_printf: loaded. [ 4299.848301] test_printf: all 404 tests passed [ 4299.850371] test_printf: unloaded. 
Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox Signed-off-by: Yafang Shao --- Documentation/core-api/printk-formats.rst | 2 + lib/test_printf.c | 126 +++--- lib/vsprintf.c| 115 +++- 3 files changed, 226 insertions(+), 17 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 6d26c5c6ac48..56f8e0fc3963 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -539,6 +539,8 @@ Flags bitfields such as page flags, gfp_flags :: %pGpreferenced|uptodate|lru|active|private + %pGpb node=0|zone=2|referenced|uptodate|lru|active|private + %pGpl referenced|uptodate|lru|active|private|node=0|zone=2 %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 7ac87f18a10f..af2945bb730a 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -569,24 +569,130 @@ netdev_features(void) { } +static void __init +page_flags_build_info(char *cmp_buf, int prefix, int sec, int node, int zone, + int last_cpupid, int kasan_tag, unsigned long *flags) +{ + unsigned long size = prefix; + +#ifdef SECTION_IN_PAGE_FLAGS + *flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "section=%#x|", sec); + size = strlen(cmp_buf); +#endif + + *flags |= ((node & NODES_MASK) << NODES_PGSHIFT) | + ((zone & ZONES_MASK) << ZONES_PGSHIFT); + snprintf(cmp_buf + size, BUF_SIZE - size, "node=%d|zone=%d", node, zone); + size = strlen(cmp_buf); + +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS + *flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|lastcpupid=%#x", last_cpupid); + size = strlen(cmp_buf); +#endif + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + *flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - 
size, "|kasantag=%#x", tag); + size = strlen(cmp_buf); +#endif +} + static void __init -flags(void) +page_flags_build_names(char *cmp_buf, int prefix, const char *expect, + unsigned long flags, unsigned long *page_flags) { + *page_flags |= flags; + snprintf(cmp_buf + prefix, BUF_SIZE - prefix, "%s", expect); +} + +static void __init +__page_flags_test(const char *expect, unsigned long flags) +{ + test(expect, "%pGp", ); +} + +static void __init +page_flags_test_be(char *cmp_buf, int sec, int node, int zone, + int last_cpupid, int kasan_tag, const char *name, + unsigned long flags) +{ + unsigned long page_flags = 0; + int size; + + page_flags_build_info(cmp_buf, 0, sec, node, zone, last_cpupid, + kasan_tag, _flags); + + if (*name) { + size = strlen(cmp_buf); + if (size < BUF_SIZE - 2) { + *(cmp_buf + size) = '|'; + *(cmp_buf + size + 1) = '\0'; + } + page_flags_build_names(cmp_buf, strlen(cmp_buf), name, flags, _flags); + } + + test(cmp_buf, "%pGpb", _flags); +} + +static void __in
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Tue, Feb 2, 2021 at 12:16 AM Andy Shevchenko wrote: > > On Mon, Feb 01, 2021 at 09:49:59PM +0800, Yafang Shao wrote: > > On Mon, Feb 1, 2021 at 9:27 PM Andy Shevchenko > > wrote: > > > On Mon, Feb 01, 2021 at 07:56:10PM +0800, Yafang Shao wrote: > > ... > > > > > - Before the patch, > > > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > > > > > - After the patch, > > > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > > > 0x1f,slab|head) > > > > > > > > The Documentation and test cases are also updated. > > > > > > Thanks for an update, my comments below. > > > > > > ... > > > > > > > - %pGpreferenced|uptodate|lru|active|private > > > > + %pGpNode 0,Zone 2,referenced|uptodate|lru|active|private > > > > > > Since of the nature of printf() buffer, I wonder if these should be at > > > the end. > > > I.o.w. the question is is the added material more important to user to > > > see than > > > the existed one? > > > > > > > The existing one should be more important than the added one. > > But the order of output will not match with the value for page->flags. > > E.g. > > flags=0x17c0010200(slab|head,Node 0,Zone 2,Lastcpupid 0x1f) > > It may be strange to compare the value with the string. > > More I'm looking at it, more I'm thinking it should have different specifiers > for each group of desired flags to be printed. > > So, you leave %pGp as is and then add another letter to add more details, so > user will choose what and in which order they want. > > For example, let's assume %pGp == %pGpf and P is a new specifier for what you > are initially adding here: > > %pGpfP => referenced|uptodate|lru|active|private,Node 0,Zone 2 > %pGpPf => Node 0,Zone 2,referenced|uptodate|lru|active|private > > and so on. Thanks for your suggestion. I will think about it. -- Thanks Yafang
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 1, 2021 at 10:15 PM Matthew Wilcox wrote: > > On Mon, Feb 01, 2021 at 07:56:10PM +0800, Yafang Shao wrote: > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > 0x1f,slab|head) > > I would suggest it will be easier to parse as: > > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > That should alleviate the concerns about debugfs format change -- we've > never guaranteed that flag names won't change, and they now look enough > like flags that parsers shouldn't fall over them. Good suggestion! I will do it as you suggested in the next version. -- Thanks Yafang
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 1, 2021 at 9:34 PM Andy Shevchenko wrote: > > On Mon, Feb 01, 2021 at 02:23:33PM +0100, David Hildenbrand wrote: > > On 01.02.21 12:56, Yafang Shao wrote: > > > Currently the pGp only shows the names of page flags, rather than > > > the full information including section, node, zone, last cpupid and > > > kasan tag. While it is not easy to parse these information manually > > > because there're so many flavors. Let's interpret them in pGp as well. > > > > > > - Before the patch, > > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > > > - After the patch, > > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > > 0x1f,slab|head) > > > > For debugging purposes, it might be helpful to have the actual zone name > > (and to know if the value is sane). You could obtain it (without other > > modifications) via > > > > const char *zname = "Invalid"; > > > > if (zone < MAX_NR_ZONES) > > zname = first_online_pgdat()->node_zones[zone].name; > > > > > > Similarly, it might also be helpful to indicate if a node is > > online/offline/invalid/. > > > > const char *nstate = "Invalid"; > > > > if (node_online(nid)) > > nstate = "Online"; > > else if (node_possible(nid)) > > nstate = "Offline"; > > > > > > Printing that in addition to the raw value could be helpful. Just some > > thoughts. > > printf() buffer is not a black hole, esp. when you get it messed with critical > messages (Oops). I suggest to reduce a burden as much as possible. If you wish > to get this, make it caller-configurable, i.e. adding another letter to the > specifier. > I think David's suggestion will help us to identify issues. You mean that we should use another one like "%pGpd" - means pGp debug - to implement it ? -- Thanks Yafang
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 1, 2021 at 9:27 PM Andy Shevchenko wrote: > > On Mon, Feb 01, 2021 at 07:56:10PM +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > 0x1f,slab|head) > > > > The Documentation and test cases are also updated. > > Thanks for an update, my comments below. > > ... > > > - %pGpreferenced|uptodate|lru|active|private > > + %pGpNode 0,Zone 2,referenced|uptodate|lru|active|private > > Since of the nature of printf() buffer, I wonder if these should be at the > end. > I.o.w. the question is is the added material more important to user to see > than > the existed one? > The existing one should be more important than the added one. But the order of output will not match with the value for page->flags. E.g. flags=0x17c0010200(slab|head,Node 0,Zone 2,Lastcpupid 0x1f) It may be strange to compare the value with the string. > ... > > > +static void __init > > +page_flags_test(int section, int node, int zone, int last_cpupid, > > + int kasan_tag, int flags, const char *name, char *cmp_buf) > > +{ > > + unsigned long page_flags = 0; > > + unsigned long size = 0; > > + > > +#ifdef SECTION_IN_PAGE_FLAGS > > + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; > > + snprintf(cmp_buf, BUF_SIZE, "Section %#x,", sec); > > I would keep it in the same form as latter ones, i.e. > > snprintf(cmp_buf + size, BUF_SIZE - size, "Section %#x,", sec); > > In this case it will be easier if at some point we might need to reshuffle. > Good suggestion. 
> > + size = strlen(cmp_buf); > > +#endif > > + > > + page_flags |= (node & NODES_MASK) << NODES_PGSHIFT; > > + snprintf(cmp_buf + size, BUF_SIZE - size, "Node %d", node); > > + size = strlen(cmp_buf); > > + > > + page_flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; > > + snprintf(cmp_buf + size, BUF_SIZE - size, ",Zone %d", zone); > > + size = strlen(cmp_buf); > > + > > +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS > > + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; > > + snprintf(cmp_buf + size, BUF_SIZE - size, ",Lastcpupid %#x", > > last_cpupid); > > + size = strlen(cmp_buf); > > +#endif > > + > > +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) > > + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; > > + snprintf(cmp_buf + size, BUF_SIZE - size, ",Kasantag %#x", tag); > > + size = strlen(cmp_buf); > > +#endif > > + > > + test(cmp_buf, "%pGp", _flags); > > + > > + if (flags) { > > If below will be always for flags set, I would rewrite this condition as > > if (!flags) > return; > > but it's up to you. > > > + page_flags |= flags; > > + snprintf(cmp_buf + size, BUF_SIZE - size, ",%s", name); > > + test(cmp_buf, "%pGp", _flags); > > + } > > +} > > ... > > > - flags = 0; > > > - flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru > > - | 1UL << PG_active | 1UL << PG_swapbacked; > > I would leave this untouched and reuse below... > > > + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); > > + if (!cmp_buffer) > > + return; > > ...as > > > + page_flags_test(0, 0, 0, 0, 0, 0, NULL, cmp_buffer); > > flags = 0; > page_flags_test(0, 0, 0, 0, 0, flags, NULL, cmp_buffer); > > > + page_flags_test(1, 1, 1, 0x1, 1, > > + (1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru > > + | 1UL << PG_active | 1UL << PG_swapbacked), > > + "uptodate|dirty|lru|active|swapbacked", > > +
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 1, 2021 at 9:05 PM Joe Perches wrote: > > On Mon, 2021-02-01 at 19:56 +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > 0x1f,slab|head) > > While debugfs is not an ABI, this format is exported in debugfs to > userspace via mm/page_owner.c read_page_owner/print_page_owner. > Right, the page_owner will be affected by this change. > Does changing the output format matter to anyone? > The user tools which parse the page_owner may be affected. If we don't want to affect the userspace tools, I think we can make a little change in page_owner as follows, unsigned long masked_flags = page->flags & (BIT(NR_PAGEFLAGS) - 1); snprintf("..., %#lx(%pGp)\n", page->flags, _flags); > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > [] > > +static > > +char *format_page_flags(char *buf, char *end, unsigned long page_flags) > > +{ > > + unsigned long flags = page_flags & ((1UL << NR_PAGEFLAGS) - 1); > > + int size = ARRAY_SIZE(pfl); > > There's no real value in used-once variables. 
> > > + bool separator = false; > > + int i; > > + > > + for (i = 0; i < size; i++) { > > Use ARRAY_SIZE here instead > Sure > for (i = 0; i < ARRAY_SIZE(pfl); i++) { > > > + if (pfl[i].width == 0) > > + continue; > > + > > + if (separator) { > > + if (buf < end) > > + *buf = ','; > > + buf++; > > + } > > + > > + > > + buf = string(buf, end, pfl[i].name, *pfl[i].spec); > > + > > + buf = number(buf, end, (page_flags >> pfl[i].shift) & > > pfl[i].mask, > > + *pfl[i].spec); > > + separator = true; > > + } > > Style question: > Might this array be more intelligible with pointers instead of indexes? Good suggestion! I will change it in the next version. > Something like: > > struct page_flags_layout *p; > > for (p = pfl; p < pfl + ARRAY_SIZE(pfl); p++) { > if (p->width == 0) > continue; > > if (p > pfl) { > if (buf < end) > *buf = ','; > buf++; > } It doesn't work, because there's a 'continue' above, meaning that the p may be greater than pfl without doing anything. > > buf = string(buf, end, p->name, *p->spec); > buf = number(buf, end, (page_flags >> p->shift) & p->mask, > *p->spec); > } > > > + > > + if (flags) { > > Maybe: > > if (page_flags & (BIT(NR_PAGEFLAGS) - 1)) { > Sure. > > + if (buf < end) > > + *buf = ','; > > + buf++; > > + } > > + > > + return buf; > > +} > > + > > -- Thanks Yafang
[PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there're so many flavors. Let's interpret them in pGp as well. - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid 0x1f,slab|head) The Documentation and test cases are also updated. Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Signed-off-by: Yafang Shao --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 65 ++- lib/vsprintf.c| 58 +++- 3 files changed, 109 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 6d26c5c6ac48..1374cdd04f0f 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -538,7 +538,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGpNode 0,Zone 2,referenced|uptodate|lru|active|private %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 7ac87f18a10f..4c5e064cbe2e 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -569,6 +569,48 @@ netdev_features(void) { } +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long page_flags = 0; + unsigned long size = 0; + +#ifdef SECTION_IN_PAGE_FLAGS + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; + snprintf(cmp_buf, BUF_SIZE, "Section %#x,", sec); + size = strlen(cmp_buf); +#endif + + page_flags |= (node & NODES_MASK) << NODES_PGSHIFT; + snprintf(cmp_buf + 
size, BUF_SIZE - size, "Node %d", node); + size = strlen(cmp_buf); + + page_flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, ",Zone %d", zone); + size = strlen(cmp_buf); + +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, ",Lastcpupid %#x", last_cpupid); + size = strlen(cmp_buf); +#endif + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, ",Kasantag %#x", tag); + size = strlen(cmp_buf); +#endif + + test(cmp_buf, "%pGp", _flags); + + if (flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, ",%s", name); + test(cmp_buf, "%pGp", _flags); + } +} + static void __init flags(void) { @@ -576,17 +618,16 @@ flags(void) gfp_t gfp; char *cmp_buffer; - flags = 0; - test("", "%pGp", ); - - /* Page flags should filter the zone id */ - flags = 1UL << NR_PAGEFLAGS; - test("", "%pGp", ); - - flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru - | 1UL << PG_active | 1UL << PG_swapbacked; - test("uptodate|dirty|lru|active|swapbacked", "%pGp", ); + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); + if (!cmp_buffer) + return; + page_flags_test(0, 0, 0, 0, 0, 0, NULL, cmp_buffer); + page_flags_test(1, 1, 1, 0x1, 1, + (1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru +| 1UL << PG_active | 1UL << PG_swapbacked), + "uptodate|dirty|lru|active|swapbacked", + cmp_buffer); flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_DENYWRITE; @@ -601,10 +642,6 @@ flags(void) gfp = __GFP_ATOMIC; test("__GFP_ATOMIC", "%pGg", ); - cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); - if (!cmp_buffer) - return; - /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); diff --git a/lib/vsprintf.c 
b/lib/vsprintf.c i
[PATCH v2 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Cc: David Hildenbrand Signed-off-by: Yafang Shao --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 87ff086e68a4..2514c37ab4e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -612,7 +612,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -638,7 +638,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -695,7 +695,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -788,7 +788,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3854,7 +3854,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.17.1
[PATCH v2 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Signed-off-by: Yafang Shao --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 34dcc09e2ec9..87ff086e68a4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -638,8 +638,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, &page->flags); } -- 2.17.1
[PATCH v2 0/3] mm, vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there're so many flavors. Let's interpret them in pGp as well. This patchset also includes some code cleanup in mm/slub.c. Below is the example of the output in mm/slub.c. - Before the patchset [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 - After the patchset [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid 0x1f,slab|head) The documentation and test cases of pGp are also updated. Below is the result of the test_printf, [ 5091.307308] test_printf: loaded. [ 5091.308285] test_printf: all 388 tests passed [ 5091.309105] test_printf: unloaded. v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 65 ++- lib/vsprintf.c| 58 +++- mm/slub.c | 13 ++--- 4 files changed, 116 insertions(+), 22 deletions(-) -- 2.17.1
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 10:50 PM Andy Shevchenko wrote: > > On Thu, Jan 28, 2021 at 09:18:24PM +0800, Yafang Shao wrote: > > On Thu, Jan 28, 2021 at 8:11 PM Andy Shevchenko > > wrote: > > ... > > > Thanks for the explanation. > > I will change it as you suggested. > > You are welcome! > > Just note, that kasprintf() should work for this as well as for the rest > printf() cases. That's why it's very important to return proper buffer > pointer. > > Also read `man snprintf(3)` RETURN VALUE section to understand it. > Thanks for the information. I will take a look at it. -- Thanks Yafang
Re: [PATCH 0/3] mm, printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 8:12 PM Andy Shevchenko wrote: > > On Thu, Jan 28, 2021 at 10:19:44AM +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > This patchset also includes some code cleanup in mm/slub.c. > > > > Below is the example of the output in mm/slub.c. > > - Before the patchset > > [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 > > fp=0x8cd1579c flags=0x17c0010200 > > > > - After the patchset > > [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid > > 0x1f,slab|head) > > > Please, add a corresponding test cases to test_printf.c. W/o test cases NAK. > Sure. Will add the test cases in the next version. -- Thanks Yafang
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 8:11 PM Andy Shevchenko wrote: > > On Thu, Jan 28, 2021 at 10:19:47AM +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid > > 0x1f,slab|head) > > > + int i; > > + > > + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf > > < end; i++) { > > 'buf < end' is redundant. > Thanks for pointing this out. > > + if (pfl[i].width == 0) > > + continue; > > + > > + buf = string(buf, end, pfl[i].name, default_str_spec); > > > + if (buf >= end) > > + break; > > Can you rather use usual patter, i.e. > > if (buf < end) { > ...do something... > } > buf++; // or whatever increase should be done > > Moreover, number() and string() IIRC have the proper checks embedded into > them. > I will take a look at the detail in these two functions. > > + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, > > + default_flag_spec); > > > + if (buf >= end) > > + break; > > + *buf = ','; > > + buf++; > > Here is a very standard pattern can be used, see code around > > if (buf < end) > *buf = ','; > buf++; > > > + } > Thanks for the explanation. I will change it as you suggested. -- Thanks Yafang
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 6:42 PM Vlastimil Babka wrote: > > On 1/28/21 3:19 AM, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid > > 0x1f,slab|head) > > node, zone could be decimal? Make sense to me. Will change it. > > > > > Cc: David Hildenbrand > > Signed-off-by: Yafang Shao > > --- > > lib/vsprintf.c | 42 +- > > 1 file changed, 41 insertions(+), 1 deletion(-) > > > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > > index 3b53c73580c5..bd809f4f1b82 100644 > > --- a/lib/vsprintf.c > > +++ b/lib/vsprintf.c > > @@ -1916,6 +1916,46 @@ char *format_flags(char *buf, char *end, unsigned > > long flags, > > return buf; > > } > > > > +struct page_flags_layout { > > + int width; > > + int shift; > > + int mask; > > + char *name; > > +}; > > + > > +struct page_flags_layout pfl[] = { > > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, "Section "}, > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, "Node "}, > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, "Zone "}, > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > "Lastcpupid "}, > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, "Kasantag "}, > > +}; > > + > > +static > > +char *format_layout(char *buf, char *end, unsigned long flags) > > +{ > > + int i; > > + > > + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf > > < end; i++) { > > + if (pfl[i].width == 0) > > + continue; > > + > > + buf = string(buf, end, pfl[i].name, default_str_spec); > > 
+ > > + if (buf >= end) > > + break; > > + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, > > + default_flag_spec); > > + > > + if (buf >= end) > > + break; > > + *buf = ','; > > + buf++; > > + } > > + > > + return buf; > > +} > > + > > static noinline_for_stack > > char *flags_string(char *buf, char *end, void *flags_ptr, > > struct printf_spec spec, const char *fmt) > > @@ -1929,7 +1969,7 @@ char *flags_string(char *buf, char *end, void > > *flags_ptr, > > switch (fmt[1]) { > > case 'p': > > flags = *(unsigned long *)flags_ptr; > > - /* Remove zone id */ > > + buf = format_layout(buf, end, flags & ~((1UL << NR_PAGEFLAGS) > > - 1)); > > flags &= (1UL << NR_PAGEFLAGS) - 1; > > names = pageflag_names; > > break; > > > -- Thanks Yafang
Re: [PATCH 2/3] mm, slub: don't combine pr_err with INFO
On Thu, Jan 28, 2021 at 6:35 PM Vlastimil Babka wrote: > > On 1/28/21 3:19 AM, Yafang Shao wrote: > > It is strange to combine "pr_err" with "INFO", so let's clean them up. > > This patch is motivated by David's comment[1]. > > > > - before the patch > > [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 > > fp=0x60d32ca8 flags=0x17c0010200(slab|head) > > > > - after the patch > > [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(slab|head) > > > > [1]. > > https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t > > > > Cc: David Hildenbrand > > Signed-off-by: Yafang Shao > > These are usually printed as part of slab_bug() with its prominent banner. In > that sense it's additional details, thus INFO. The details itself are not > error, > thus ERR makes little sense imho. How about removing the prefix completely, or > just replacing with an ident to make it visually part of the BUG report. > Thanks for the explanation. I will remove the prefix completely in the next version. 
> > --- > > mm/slub.c | 10 +- > > 1 file changed, 5 insertions(+), 5 deletions(-) > > > > diff --git a/mm/slub.c b/mm/slub.c > > index 4b9ab267afbc..18b4474c8fa2 100644 > > --- a/mm/slub.c > > +++ b/mm/slub.c > > @@ -615,7 +615,7 @@ static void print_track(const char *s, struct track *t, > > unsigned long pr_time) > > if (!t->addr) > > return; > > > > - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", > > + pr_err("ERR: %s in %pS age=%lu cpu=%u pid=%d\n", > > s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); > > #ifdef CONFIG_STACKTRACE > > { > > @@ -641,7 +641,7 @@ void print_tracking(struct kmem_cache *s, void *object) > > > > static void print_page_info(struct page *page) > > { > > - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p > > flags=%#lx(%pGp)\n", > > + pr_err("ERR: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", > > page, page->objects, page->inuse, page->freelist, > > page->flags, >flags); > > > > @@ -698,7 +698,7 @@ static void print_trailer(struct kmem_cache *s, struct > > page *page, u8 *p) > > > > print_page_info(page); > > > > - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", > > + pr_err("ERR: Object 0x%p @offset=%tu fp=0x%p\n\n", > > p, p - addr, get_freepointer(s, p)); > > > > if (s->flags & SLAB_RED_ZONE) > > @@ -791,7 +791,7 @@ static int check_bytes_and_report(struct kmem_cache *s, > > struct page *page, > > end--; > > > > slab_bug(s, "%s overwritten", what); > > - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of > > 0x%x\n", > > + pr_err("ERR: 0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of > > 0x%x\n", > > fault, end - 1, fault - addr, > > fault[0], value); > > print_trailer(s, page, object); > > @@ -3855,7 +3855,7 @@ static void list_slab_objects(struct kmem_cache *s, > > struct page *page, > > for_each_object(p, s, addr, page->objects) { > > > > if (!test_bit(__obj_to_index(s, addr, p), map)) { > > - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - > > addr); > > + pr_err("ERR: Object 0x%p @offset=%tu\n", p, p - addr); > > print_tracking(s, p); > > } > > } > > > -- Thanks Yafang
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 10:52 AM Miaohe Lin wrote: > > Hi: > On 2021/1/28 10:19, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid > > 0x1f,slab|head) > > > > Thanks. This really helps! > > > Cc: David Hildenbrand > > Signed-off-by: Yafang Shao > > --- > > lib/vsprintf.c | 42 +- > > 1 file changed, 41 insertions(+), 1 deletion(-) > > > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > > index 3b53c73580c5..bd809f4f1b82 100644 > > --- a/lib/vsprintf.c > > +++ b/lib/vsprintf.c > > @@ -1916,6 +1916,46 @@ char *format_flags(char *buf, char *end, unsigned > > long flags, > > return buf; > > } > > > > +struct page_flags_layout { > > + int width; > > + int shift; > > + int mask; > > + char *name; > > Should we add const for name ? > Good suggestion. > > +}; > > + > > +struct page_flags_layout pfl[] = { > > Should we add static const for pfl[] as we won't change its value and use it > outside this file ? > Sure. 
> > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, "Section "}, > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, "Node "}, > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, "Zone "}, > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > "Lastcpupid "}, > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, "Kasantag "}, > > +}; > > + > > +static > > +char *format_layout(char *buf, char *end, unsigned long flags) > > +{ > > + int i; > > + > > + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf > > < end; i++) { > > I think we can use ARRAY_SIZE here. > Sure. > > + if (pfl[i].width == 0) > > + continue; > > + > > + buf = string(buf, end, pfl[i].name, default_str_spec); > > + > > + if (buf >= end) > > + break; > > + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, > > + default_flag_spec); > > + > > + if (buf >= end) > > + break; > > + *buf = ','; > > + buf++; > > + } > > + > > + return buf; > > +} > > + > > static noinline_for_stack > > char *flags_string(char *buf, char *end, void *flags_ptr, > > struct printf_spec spec, const char *fmt) > > @@ -1929,7 +1969,7 @@ char *flags_string(char *buf, char *end, void > > *flags_ptr, > > switch (fmt[1]) { > > case 'p': > > flags = *(unsigned long *)flags_ptr; > > - /* Remove zone id */ > > + buf = format_layout(buf, end, flags & ~((1UL << NR_PAGEFLAGS) > > - 1)); > > flags &= (1UL << NR_PAGEFLAGS) - 1; > > names = pageflag_names; > > break; > > > Many thanks. -- Thanks Yafang
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 10:35 AM Joe Perches wrote: > > On Thu, 2021-01-28 at 10:19 +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > [] > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > [] > > @@ -1916,6 +1916,46 @@ char *format_flags(char *buf, char *end, unsigned > > long flags, > > return buf; > > } > > > > +struct page_flags_layout { > > + int width; > > + int shift; > > + int mask; > > + char *name; > > +}; > > + > > +struct page_flags_layout pfl[] = { > > static const struct page_flags_layout pfl[] = { Sure. > > > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, "Section "}, > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, "Node "}, > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, "Zone "}, > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > "Lastcpupid "}, > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, "Kasantag "}, > > +}; > > + > > +static > > +char *format_layout(char *buf, char *end, unsigned long flags) > > poor name. perhaps format_page_flags > Thanks for the suggestion. > > +{ > > + int i; > > + > > + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf > > < end; i++) { > > for (i = 0; i < ARRAY_SIZE(pfl) && buf < end; i++) { > Sure. > > > @@ -1929,7 +1969,7 @@ char *flags_string(char *buf, char *end, void > > *flags_ptr, > > switch (fmt[1]) { > > case 'p': > > flags = *(unsigned long *)flags_ptr; > > - /* Remove zone id */ > > + buf = format_layout(buf, end, flags & ~((1UL << NR_PAGEFLAGS) > > - 1)); > > flags &= (1UL << NR_PAGEFLAGS) - 1; > > Perhaps store the bitshift into a temp and use the temp twice > > foo = BIT(NR_PAGEFLAGS) - 1; > > buf = format_layout(buf, end, flags & ~foo); > flags &= foo; > > Thanks for the suggestion. 
I will change them all. -- Thanks Yafang
[PATCH 0/3] mm, printk: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. However, it is not easy to parse this information manually because there are so many flavors. Let's interpret them in pGp as well. This patchset also includes some code cleanup in mm/slub.c. Below is the example of the output in mm/slub.c. - Before the patchset [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 - After the patchset [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid 0x1f,slab|head) Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO printk: dump full information of page flags in pGp lib/vsprintf.c | 42 +- mm/slub.c | 13 +++-- 2 files changed, 48 insertions(+), 7 deletions(-) -- 2.17.1
[PATCH 3/3] printk: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there're so many flavors. Let's interpret them in pGp as well. - Before the patch, [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 fp=0x6d0779d1 flags=0x17c0010200(slab|head) - After the patch, [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid 0x1f,slab|head) Cc: David Hildenbrand Signed-off-by: Yafang Shao --- lib/vsprintf.c | 42 +- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3b53c73580c5..bd809f4f1b82 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1916,6 +1916,46 @@ char *format_flags(char *buf, char *end, unsigned long flags, return buf; } +struct page_flags_layout { + int width; + int shift; + int mask; + char *name; +}; + +struct page_flags_layout pfl[] = { + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, "Section "}, + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, "Node "}, + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, "Zone "}, + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, "Lastcpupid "}, + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, "Kasantag "}, +}; + +static +char *format_layout(char *buf, char *end, unsigned long flags) +{ + int i; + + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf < end; i++) { + if (pfl[i].width == 0) + continue; + + buf = string(buf, end, pfl[i].name, default_str_spec); + + if (buf >= end) + break; + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, +default_flag_spec); + + if (buf >= end) + break; + *buf = ','; + buf++; + } + + return buf; +} + static noinline_for_stack char *flags_string(char *buf, char *end, void *flags_ptr, struct printf_spec spec, const char *fmt) @@ -1929,7 +1969,7 @@ char *flags_string(char *buf, char *end, void 
*flags_ptr, switch (fmt[1]) { case 'p': flags = *(unsigned long *)flags_ptr; - /* Remove zone id */ + buf = format_layout(buf, end, flags & ~((1UL << NR_PAGEFLAGS) - 1)); flags &= (1UL << NR_PAGEFLAGS) - 1; names = pageflag_names; break; -- 2.17.1
[PATCH 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Reviewed-by: David Hildenbrand Signed-off-by: Yafang Shao --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 69742ab9a21d..4b9ab267afbc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -641,8 +641,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.17.1
[PATCH 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's clean them up. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 fp=0x6d0779d1 flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Cc: David Hildenbrand Signed-off-by: Yafang Shao --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 4b9ab267afbc..18b4474c8fa2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -615,7 +615,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("ERR: %s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -641,7 +641,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("ERR: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -698,7 +698,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("ERR: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -791,7 +791,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("ERR: 0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3855,7 +3855,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("ERR: Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.17.1
Re: [xfs] db962cd266: Assertion_failed
On Sat, Jan 2, 2021 at 5:53 AM Dave Chinner wrote: > > On Fri, Jan 01, 2021 at 05:10:49PM +0800, Yafang Shao wrote: > > On Thu, Dec 31, 2020 at 10:46 AM kernel test robot > > wrote: > . > > > [ 552.905799] XFS: Assertion failed: !current->journal_info, file: > > > fs/xfs/xfs_trans.h, line: 280 > > > [ 553.104459] xfs_trans_reserve+0x225/0x320 [xfs] > > > [ 553.110556] xfs_trans_roll+0x6e/0xe0 [xfs] > > > [ 553.116134] xfs_defer_trans_roll+0x104/0x2a0 [xfs] > > > [ 553.122489] ? xfs_extent_free_create_intent+0x62/0xc0 [xfs] > > > [ 553.129780] xfs_defer_finish_noroll+0xb8/0x620 [xfs] > > > [ 553.136299] xfs_defer_finish+0x11/0xa0 [xfs] > > > [ 553.142017] xfs_itruncate_extents_flags+0x141/0x440 [xfs] > > > [ 553.149053] xfs_setattr_size+0x3da/0x480 [xfs] > > > [ 553.154939] ? setattr_prepare+0x6a/0x1e0 > > > [ 553.160250] xfs_vn_setattr+0x70/0x120 [xfs] > > > [ 553.165833] notify_change+0x364/0x500 > > > [ 553.170820] ? do_truncate+0x76/0xe0 > > > [ 553.175673] do_truncate+0x76/0xe0 > > > [ 553.180184] path_openat+0xe6c/0x10a0 > > > [ 553.184981] do_filp_open+0x91/0x100 > > > [ 553.189707] ? __check_object_size+0x136/0x160 > > > [ 553.195493] do_sys_openat2+0x20d/0x2e0 > > > [ 553.200481] do_sys_open+0x44/0x80 > > > [ 553.204926] do_syscall_64+0x33/0x40 > > > [ 553.209588] entry_SYSCALL_64_after_hwframe+0x44/0xa9 > > > > Thanks for the report. > > > > At a first glance, it seems we should make a similar change as we did > > in xfs_trans_context_clear(). > > > > static inline void > > xfs_trans_context_set(struct xfs_trans *tp) > > { > > /* > > * We have already handed over the context via xfs_trans_context_swap(). > > */ > > if (current->journal_info) > > return; > > current->journal_info = tp; > > tp->t_pflags = memalloc_nofs_save(); > > } > > Ah, no. > > Remember how I said "split out the wrapping of transaction > context setup in xfs_trans_reserve() from > the lifting of the context setting into xfs_trans_alloc()"? 
> > Well, you did the former and dropped the latter out of the patch > set. > I misunderstood what you mean. > Now when a transaction rolls after xfs_trans_context_swap(), it > calls xfs_trans_reserve() and tries to do transaction context setup > work inside a transaction context that already exists. IOWs, you > need to put the patch that lifts of the context setting up into > xfs_trans_alloc() back into the patchset before adding the > current->journal functionality patch. > Sure. > Also, you need to test XFS code with CONFIG_XFS_DEBUG=y so that > asserts are actually built into the code and exercised, because this > ASSERT should have fired on the first rolling transaction that the > kernel executes... > I really forgot to enable CONFIG_XFS_DEBUG... -_-b -- Thanks Yafang
Re: [xfs] db962cd266: Assertion_failed
On Thu, Dec 31, 2020 at 10:46 AM kernel test robot wrote: > > > Greeting, > > FYI, we noticed the following commit (built with gcc-9): > > commit: db962cd266c4d3230436aec2899186510797d49f ("[PATCH v14 4/4] xfs: use > current->journal_info to avoid transaction reservation recursion") > url: > https://github.com/0day-ci/linux/commits/Yafang-Shao/xfs-avoid-transaction-reservation-recursion/20201222-092315 > base: https://git.kernel.org/cgit/fs/xfs/xfs-linux.git for-next > > in testcase: filebench > version: filebench-x86_64-22620e6-1_20201112 > with following parameters: > > disk: 1HDD > fs: btrfs > test: fivestreamwrite.f > cpufreq_governor: performance > ucode: 0x42e > > > > on test machine: 48 threads Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz with > 112G memory > > caused below changes (please refer to attached dmesg/kmsg for entire > log/backtrace): > > > If you fix the issue, kindly add following tag > Reported-by: kernel test robot > > > [ 552.503501] > [ 552.525993] /usr/bin/wget -q --timeout=1800 --tries=1 > --local-encoding=UTF-8 > http://internal-lkp-server:80/~lkp/pkg/linux/x86_64-rhel-8.3/gcc-9/5c8fe583cce542aa0b84adc939ce85293de36e5e/modules.cgz > -N -P > /opt/rootfs/tmp/pkg/linux/x86_64-rhel-8.3/gcc-9/5c8fe583cce542aa0b84adc939ce85293de36e5e > [ 552.525995] > [ 552.884581] > /opt/rootfs/tmp/pkg/linux/x86_64-rhel-8.3/gcc-9/5c8fe583cce542aa0b84adc939ce85293de36e5e/modules.cgz > isn't modified > [ 552.884583] > [ 552.905799] XFS: Assertion failed: !current->journal_info, file: > fs/xfs/xfs_trans.h, line: 280 > [ 552.907594] /usr/bin/wget -q --timeout=1800 --tries=1 > --local-encoding=UTF-8 > http://internal-lkp-server:80/~lkp/osimage/ucode/intel-ucode-20201117.cgz -N > -P /opt/rootfs/tmp/osimage/ucode > [ 552.916568] > [ 552.916574] [ cut here ] > [ 552.939361] /opt/rootfs/tmp/osimage/ucode/intel-ucode-20201117.cgz isn't > modified > [ 552.940036] kernel BUG at fs/xfs/xfs_message.c:110! 
> [ 552.946338] > [ 552.955784] invalid opcode: [#1] SMP PTI > [ 552.971010] CPU: 46 PID: 3793 Comm: kexec-lkp Not tainted > 5.10.0-rc5-00044-gdb962cd266c4 #1 > [ 552.981331] Hardware name: Intel Corporation S2600WP/S2600WP, BIOS > SE5C600.86B.02.02.0002.122320131210 12/23/2013 > [ 552.993907] RIP: 0010:assfail+0x23/0x28 [xfs] > [ 552.999797] Code: 67 fc ff ff 0f 0b c3 0f 1f 44 00 00 41 89 c8 48 89 d1 48 > 89 f2 48 c7 c6 30 58 be c0 e8 82 f9 ff ff 80 3d b1 80 0a 00 00 74 02 <0f> 0b > 0f 0b c3 48 8d 45 10 48 89 e2 4c 89 e6 48 89 1c 24 48 89 44 > [ 553.022798] RSP: 0018:c90006a139c8 EFLAGS: 00010202 > [ 553.029624] RAX: RBX: 889c3edea700 RCX: > > [ 553.038646] RDX: ffc0 RSI: 000a RDI: > c0bd7bab > [ 553.047600] RBP: c90006a13a14 R08: R09: > > [ 553.056536] R10: 000a R11: f000 R12: > > [ 553.065546] R13: R14: 889c3ede91c8 R15: > 888f44608000 > [ 553.074455] FS: 77fc9580() GS:889bea38() > knlGS: > [ 553.084494] CS: 0010 DS: ES: CR0: 80050033 > [ 553.091837] CR2: 555a1420 CR3: 001c30fbe002 CR4: > 001706e0 > [ 553.100720] Call Trace: > [ 553.104459] xfs_trans_reserve+0x225/0x320 [xfs] > [ 553.110556] xfs_trans_roll+0x6e/0xe0 [xfs] > [ 553.116134] xfs_defer_trans_roll+0x104/0x2a0 [xfs] > [ 553.122489] ? xfs_extent_free_create_intent+0x62/0xc0 [xfs] > [ 553.129780] xfs_defer_finish_noroll+0xb8/0x620 [xfs] > [ 553.136299] xfs_defer_finish+0x11/0xa0 [xfs] > [ 553.142017] xfs_itruncate_extents_flags+0x141/0x440 [xfs] > [ 553.149053] xfs_setattr_size+0x3da/0x480 [xfs] > [ 553.154939] ? setattr_prepare+0x6a/0x1e0 > [ 553.160250] xfs_vn_setattr+0x70/0x120 [xfs] > [ 553.165833] notify_change+0x364/0x500 > [ 553.170820] ? do_truncate+0x76/0xe0 > [ 553.175673] do_truncate+0x76/0xe0 > [ 553.180184] path_openat+0xe6c/0x10a0 > [ 553.184981] do_filp_open+0x91/0x100 > [ 553.189707] ? 
__check_object_size+0x136/0x160 > [ 553.195493] do_sys_openat2+0x20d/0x2e0 > [ 553.200481] do_sys_open+0x44/0x80 > [ 553.204926] do_syscall_64+0x33/0x40 > [ 553.209588] entry_SYSCALL_64_after_hwframe+0x44/0xa9 > [ 553.215867] RIP: 0033:0x77ef11ae > [ 553.220579] Code: 25 00 00 41 00 3d 00 00 41 00 74 48 48 8d 05 59 65 0d 00 > 8b 00 85 c0 75 69 89 f2 b8 01 01 00 00 48 89 fe bf 9c ff ff ff 0f 05 <48> 3d > 00 f0 ff ff 0f 87 a6 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 > [ 553.24287
Re: [PATCH 3/6] sched: make struct sched_statistics independent of fair sched class
On Tue, Dec 1, 2020 at 8:41 PM Mel Gorman wrote: > > On Tue, Dec 01, 2020 at 07:54:13PM +0800, Yafang Shao wrote: > > If we want to use schedstats facility, we should move out of > > struct sched_statistics from the struct sched_entity or add it into other > > sctructs of sched entity as well. Obviously the latter one is bad because > > that requires more spaces. So we should move it into a common struct which > > can be used by all sched classes. > > > > The struct sched_statistics is the schedular statistics of a task_struct > > or a task_group. So we can move it into struct task_struct and > > struct task_group to achieve the goal. > > > > Below is the detailed explaination of the change in the structs. > > > > - Before this patch > > > > struct task_struct {|-> struct sched_entity { > > ... | ... > > struct sched_entity *se; ---| struct sched_statistics statistics; > > struct sched_rt_entity *rt; ... > > ... ... > > }; }; > > > > struct task_group { |--> se[0]->statistics : schedstats of CPU0 > > ... | > > #ifdef CONFIG_FAIR_GROUP_SCHED | > > struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 > > | > > #endif | > > |--> se[N]->statistics : schedstats of CPUn > > > > #ifdef CONFIG_FAIR_GROUP_SCHED > > struct sched_rt_entity **rt_se; (N/A) > > #endif > > ... > > }; > > > > The '**se' in task_group is allocated in the fair sched class, which is > > hard to be reused by other sched class. > > > > - After this patch > > > > struct task_struct { > > ... > > struct sched_statistics statistics; > > ... > > struct sched_entity *se; > > struct sched_rt_entity *rt; > > ... > > }; > > > > struct task_group {|---> stats[0] : of CPU0 > > ...| > > struct sched_statistics **stats; --|---> stats[1] : of CPU1 > > ...| > >|---> stats[n] : of CPUn > > #ifdef CONFIG_FAIR_GROUP_SCHED > > struct sched_entity **se; > > #endif > > #ifdef CONFIG_RT_GROUP_SCHED > > struct sched_rt_entity **rt_se; > > #endif > > ... 
> > }; > > > > After the patch it is clearly that both of se or rt_se can easily get the > > sched_statistics by a task_struct or a task_group. > > > > Signed-off-by: Yafang Shao > > I didn't see anything wrong as such, it's mostly a mechanical > conversion. The one slight caveat is the potential change in cache > location for the statistics but it's not necessarily negative. The stats > potentially move to a different cache line but it's less obvious whether > that even matters given the location is very similar. > > There is increased overhead now when schedstats are *enabled* because > _schedstat_from_sched_entity() has to be called but it appears that it is > protected by a schedstat_enabled() check. So ok, schedstats when enabled > are now a bit more expensive but they were expensive in the first place > so does it matter? > > I'd have been happied if there was a comparison with schedstats enabled > just in case the overhead is too high but it could also do with a second > set of eyeballs. > Sure, I will do the comparison. Thanks for the review again. > It's somewhat tentative but > > Acked-by: Mel Gorman > > -- > Mel Gorman > SUSE Labs -- Thanks Yafang
Re: [PATCH 4/6] sched: make schedstats helpers independent of fair sched class
On Tue, Dec 1, 2020 at 8:46 PM Mel Gorman wrote: > > On Tue, Dec 01, 2020 at 07:54:14PM +0800, Yafang Shao wrote: > > The original prototype of the schedstats helpers are > > > > update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) > > > > The cfs_rq in these helpers is used to get the rq_clock, and the se is > > used to get the struct sched_statistics and the struct task_struct. In > > order to make these helpers available by all sched classes, we can pass > > the rq, sched_statistics and task_struct directly. > > > > Then the new helpers are > > > > update_stats_wait_*(struct rq *rq, struct task_struct *p, > > struct sched_statistics *stats) > > > > which are independent of fair sched class. > > > > To avoid vmlinux growing too large or introducing ovehead when > > !schedstat_enabled(), some new helpers after schedstat_enabled() are also > > introduced, Suggested by Mel. These helpers are in sched/stats.c, > > > > __update_stats_wait_*(struct rq *rq, struct task_struct *p, > > struct sched_statistics *stats) > > > > Cc: Mel Gorman > > Signed-off-by: Yafang Shao > > Think it's ok, it's mostly code shuffling. I'd have been happier if > there was evidence showing a before/after comparison of the sched stats > for something simple like "perf bench sched pipe" and a clear statement > of no functional change as well as a comparison of the vmlinux files but > I think it's ok so; > Thanks for the review, I will do the comparison as you suggested. > Acked-by: Mel Gorman > Thanks! > I didn't look at the rt.c parts > > -- > Mel Gorman > SUSE Labs -- Thanks Yafang
[PATCH 5/6] sched, rt: support sched_stat_runtime tracepoint for RT sched class
The runtime of an RT task is already tracked there, so we only need to place the sched_stat_runtime tracepoint there. One difference between a fair task and an RT task is that there is no vruntime in an RT task. To reuse the sched_stat_runtime tracepoint, '0' is passed as vruntime for an RT task. The output of this tracepoint for an RT task is as follows, stress-9748[039] d.h. 113.519352: sched_stat_runtime: comm=stress pid=9748 runtime=997573 [ns] vruntime=0 [ns] stress-9748[039] d.h. 113.520352: sched_stat_runtime: comm=stress pid=9748 runtime=997627 [ns] vruntime=0 [ns] stress-9748[039] d.h. 113.521352: sched_stat_runtime: comm=stress pid=9748 runtime=998203 [ns] vruntime=0 [ns] Signed-off-by: Yafang Shao --- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2d543a270dfe..da989653b0a2 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1011,6 +1011,8 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->stats.exec_max, max(curr->stats.exec_max, delta_exec)); + trace_sched_stat_runtime(curr, delta_exec, 0); + curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- 2.18.4
[PATCH 6/6] sched, rt: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. This patch enable it for RT sched class as well. After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for RT sched class. The schedstat usage in RT sched class is similar with fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end The user can get the schedstats information in the same way in fair sched class. For example, fairRT task show /proc/[pid]/sched /proc/[pid]/sched group show cpu.stat in cgroup cpu.stat in cgroup The output of a RT task's schedstats as follows, $ cat /proc/10461/sched ... stats.sum_sleep_runtime : 37966.502936 stats.wait_start : 0.00 stats.sleep_start: 0.00 stats.block_start:279182.986012 stats.sleep_max : 9.001121 stats.block_max : 9.292692 stats.exec_max : 0.090009 stats.slice_max : 0.00 stats.wait_max : 0.005305 stats.wait_sum : 0.352352 stats.wait_count : 236173 stats.iowait_sum : 37875.625128 stats.iowait_count : 235933 stats.nr_migrations_cold :0 stats.nr_failed_migrations_affine:0 stats.nr_failed_migrations_running :0 stats.nr_failed_migrations_hot :0 stats.nr_forced_migrations :0 stats.nr_wakeups : 236172 stats.nr_wakeups_sync:0 stats.nr_wakeups_migrate :2 stats.nr_wakeups_local : 235865 stats.nr_wakeups_remote : 307 stats.nr_wakeups_affine :0 stats.nr_wakeups_affine_attempts :0 stats.nr_wakeups_passive :0 stats.nr_wakeups_idle:0 ... The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can be used to trace RT tasks as well. The output of these tracepoints for a RT tasks as follows, - blocked & iowait kworker/48:1-442 [048] d... 
539.830872: sched_stat_iowait: comm=stress pid=10461 delay=158242 [ns] kworker/48:1-442 [048] d... 539.830872: sched_stat_blocked: comm=stress pid=10461 delay=158242 [ns] - wait stress-10460 [001] dN.. 813.965304: sched_stat_wait: comm=stress pid=10462 delay=7536 [ns] stress-10462 [001] d.h. 813.966300: sched_stat_runtime: comm=stress pid=10462 runtime=993812 [ns] vruntime=0 [ns] [...] stress-10462 [001] d.h. 814.065300: sched_stat_runtime: comm=stress pid=10462 runtime=994484 [ns] vruntime=0 [ns] [ totally 100 times of sched_stat_runtime for pid 10462] [ The delay of pid 10462 is the sum of above runtime ] stress-10462 [001] dN.. 814.065307: sched_stat_wait: comm=stress pid=10460 delay=11089 [ns] stress-10460 [001] d.h. 814.066299: sched_stat_runtime: comm=stress pid=10460 runtime=991934 [ns] vruntime=0 [ns] - sleep sleep-15582 [041] dN.. 1732.814348: sched_stat_sleep: comm=sleep.sh pid=15474 delay=1001223130 [ns] sleep-15584 [041] dN.. 1733.815908: sched_stat_sleep: comm=sleep.sh pid=15474 delay=1001238954 [ns] [ In sleep.sh, it sleeps 1 sec each time. ] Signed-off-by: Yafang Shao --- kernel/sched/rt.c | 134 +- 1 file changed, 133 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index da989653b0a2..f764c2b9070d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1271,6 +1271,121 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr rt_se->on_list = 0; } +static inline void +__schedstat_from_sched_rt_entity(struct sched_rt_entity
[PATCH 4/6] sched: make schedstats helpers independent of fair sched class
The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing ovehead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) Cc: Mel Gorman Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 140 +++ kernel/sched/stats.c | 104 kernel/sched/stats.h | 32 ++ 3 files changed, 157 insertions(+), 119 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14d8df308d44..b869a83fac29 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -917,69 +917,44 @@ static void update_curr_fair(struct rq *rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; - u64 wait_start, prev_wait_start; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - __schedstat_from_sched_entity(se, ); - - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(stats->wait_start); + if (entity_is_task(se)) + p = task_of(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + __schedstat_from_sched_entity(se, ); - __schedstat_set(stats->wait_start, wait_start); + 
__update_stats_wait_start(rq_of(cfs_rq), p, stats); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 delta; if (!schedstat_enabled()) return; - __schedstat_from_sched_entity(se, ); - - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start); - if (entity_is_task(se)) { + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. -*/ - __schedstat_set(stats->wait_start, delta); - - return; - } - - trace_sched_stat_wait(p, delta); - } + __schedstat_from_sched_entity(se, ); - __schedstat_set(stats->wait_max, - max(schedstat_val(stats->wait_max), delta)); - __schedstat_inc(stats->wait_count); - __schedstat_add(stats->wait_sum, delta); - __schedstat_set(stats->wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; @@ -989,67 +964,14 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) __schedstat_from_sched_entity(se, ); - sleep_start = schedstat_val(stats->sleep_start); - block_start = schedstat_val(stats->block_start); - - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(stats->sleep_max))) - __schedstat_set(stats->sleep_max, delta); - - __schedstat_set(stats->sleep_start, 0); - __schedstat_add(stats->sum_sleep_runtime, delta); - - if (p) { - 
account_scheduler_latency(p, delta >> 10, 1); - trace_sched_stat_sleep(p, delta)
[PATCH 2/6] sched, fair: use __schedstat_set() in set_next_entity()
schedstat_enabled() has already been checked, so we can use __schedstat_set() directly. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ff1daa3d9bb..7e7c03cede94 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4414,7 +4414,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, + __schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -- 2.18.4
[PATCH 3/6] sched: make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility, we should move struct sched_statistics out of struct sched_entity or add it into the other structs of sched entities as well. Obviously the latter one is bad because that requires more space. So we should move it into a common struct which can be used by all sched classes. The struct sched_statistics is the scheduler statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. Below is the detailed explanation of the change in the structs. - Before this patch struct task_struct {|-> struct sched_entity { ... | ... struct sched_entity *se; ---| struct sched_statistics statistics; struct sched_rt_entity *rt; ... ... ... }; }; struct task_group { |--> se[0]->statistics : schedstats of CPU0 ... | #ifdef CONFIG_FAIR_GROUP_SCHED | struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 | #endif | |--> se[N]->statistics : schedstats of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_rt_entity **rt_se; (N/A) #endif ... }; The '**se' in task_group is allocated in the fair sched class, which is hard to reuse in other sched classes. - After this patch struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; After the patch it is clear that both se and rt_se can easily get the sched_statistics from a task_struct or a task_group.
Signed-off-by: Yafang Shao --- include/linux/sched.h| 3 +- kernel/sched/core.c | 24 --- kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 81 +++--- kernel/sched/fair.c | 142 ++- kernel/sched/rt.c| 4 +- kernel/sched/sched.h | 3 + kernel/sched/stats.h | 46 + kernel/sched/stop_task.c | 4 +- 9 files changed, 206 insertions(+), 105 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..f8e969be4bee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -462,8 +462,6 @@ struct sched_entity { u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -681,6 +679,7 @@ struct task_struct { unsigned intrt_priority; const struct sched_class*sched_class; + struct sched_statistics stats; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd76628778f7..081b4f1f2cb4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2427,11 +2427,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->stats.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -2443,14 +2443,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + 
__schedstat_inc(p->stats.nr_wakeups_sync); } /* @@ -3075,7 +3075,7 @@ static void __sched_fork(unsigned long clone_flags, struct task
[PATCH 1/6] sched: don't include stats.h in sched.h
This patch is a preparation of the followup patches. In the followup patches some common helpers will be defined in stats.h, and these common helpers require some definitions in sched.h, so let's move stats.h out of sched.h. The source files which require stats.h include it specifically. Signed-off-by: Yafang Shao --- kernel/sched/core.c | 1 + kernel/sched/deadline.c | 1 + kernel/sched/debug.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/idle.c | 1 + kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 6 +- kernel/sched/stats.c | 1 + kernel/sched/stats.h | 2 ++ kernel/sched/stop_task.c | 1 + 10 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..fd76628778f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #undef CREATE_TRACE_POINTS #include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..7a0124f81a4f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ *Fabio Checconi */ #include "sched.h" +#include "stats.h" #include "pelt.h" struct dl_bandwidth def_dl_bandwidth; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2357921580f9..9758aa1bba1e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -7,6 +7,7 @@ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ #include "sched.h" +#include "stats.h" static DEFINE_SPINLOCK(sched_debug_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8917d2d715ef..8ff1daa3d9bb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,7 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "stats.h" /* * Targeted preemption latency for CPU-bound tasks: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..95c02cbca04a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ *tasks which are handled in sched/fair.c ) */ 
#include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..af772ac0f32d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,7 +4,7 @@ * policies) */ #include "sched.h" - +#include "stats.h" #include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..871544bb9a38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,9 @@ /* * Scheduler internal types and methods: */ +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H + #include #include @@ -1538,7 +1541,6 @@ extern void flush_smp_call_function_from_idle(void); static inline void flush_smp_call_function_from_idle(void) { } #endif -#include "stats.h" #include "autogroup.h" #ifdef CONFIG_CGROUP_SCHED @@ -2633,3 +2635,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed..844bd9dbfbf0 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -3,6 +3,7 @@ * /proc/schedstat implementation */ #include "sched.h" +#include "stats.h" /* * Current schedstat API version. diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 33d0daf83842..c23b653ffc53 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,6 +2,8 @@ #ifdef CONFIG_SCHEDSTATS +#include "sched.h" + /* * Expects runqueue lock to be held for atomicity of update */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..a5d289049388 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "stats.h" #ifdef CONFIG_SMP static int -- 2.18.4
[PATCH 0/6] sched: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. This patch enable it for RT sched class as well. - Structure Before we make schedstats available for RT sched class, we should make struct sched_statistics independent of fair sched class first. The struct sched_statistics is the schedular statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. Below is the detailed explaination of the change in the structs. The struct before this patch: struct task_struct {|-> struct sched_entity { ... | ... struct sched_entity *se; ---| struct sched_statistics statistics; struct sched_rt_entity *rt; ... ... ... }; }; struct task_group { |--> se[0]->statistics : schedstats of CPU0 ... | #ifdef CONFIG_FAIR_GROUP_SCHED | struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 | #endif | |--> se[N]->statistics : schedstats of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_rt_entity **rt_se; (N/A) #endif ... }; The '**se' in task_group is allocated in the fair sched class, which is hard to be reused by other sched class. The struct after this patch: struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; After the patch it is clearly that both of se or rt_se can easily get the sched_statistics by a task_struct or a task_group. 
- Function Interface The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing overhead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) - Implementation After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for RT sched class. The schedstat usage in RT sched class is similar to fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end - Usage The user can get the schedstats information in the same way as in fair sched class. For example, fairRT task show /proc/[pid]/sched /proc/[pid]/sched group show cpu.stat in cgroup cpu.stat in cgroup The sched:sched_stat_{wait, sleep, iowait, blocked, runtime} tracepoints can be used to trace RT tasks as well. Detailed examples of the output of schedstats for RT tasks are in patch #6. Changes since RFC: - improvement of schedstats helpers, per Mel. 
- make struct schedstats independent of fair sched class Yafang Shao (6): sched: don't include stats.h in sched.h sched, fair: use __schedstat_set() in set_next_entity() sched: make struct sched_statistics independent of fair sched class sched: make schedstats helpers independent of fair sched class sched, rt: support sched_stat_runtime tracepoint for RT sched class sched, rt: support schedstats for RT sched class include/linux/sched.h| 3 +- kernel/sched/core.c | 25 +++-- kernel/sched/deadline.c | 5 +- kernel/sched/debug.c | 82 +++ kernel/sched/fair.c | 209 +++---
Re: [RFC PATCH v3 0/5] sched: support schedstats for RT sched class
On Sat, Nov 28, 2020 at 12:12 AM Yafang Shao wrote: > > We want to measure the latency of RT tasks in our production > environment with schedstats facility, but currently schedstats is only > supported for fair sched class. This patch enable it for RT sched class > as well. > > - Structure > > Before we make schedstats available for RT sched class, we should make > struct sched_statistics independent of fair sched class first. > > > The struct sched_statistics is the schedular statistics of a task_struct > or a task_group. So we can move it into struct task_struct and > struct task_group to achieve the goal. > > Below is the detailed explaination of the change in the structs. > > The struct before this patch: > > struct task_struct {|-> struct sched_entity { > ... | ... > struct sched_entity *se; ---| struct sched_statistics statistics; > struct sched_rt_entity *rt; ... > ... ... > }; }; > > struct task_group { |--> se[0]->statistics : schedstats of CPU0 > ... | > #ifdef CONFIG_FAIR_GROUP_SCHED | > struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 > | > #endif | > |--> se[N]->statistics : schedstats of CPUn > > #ifdef CONFIG_FAIR_GROUP_SCHED > struct sched_rt_entity **rt_se; (N/A) > #endif > ... > }; > > The '**se' in task_group is allocated in the fair sched class, which is > hard to be reused by other sched class. > > The struct after this patch: > struct task_struct { > ... > struct sched_statistics statistics; > ... > struct sched_entity *se; > struct sched_rt_entity *rt; > ... > }; > > struct task_group {|---> stats[0] : of CPU0 > ...| > struct sched_statistics **stats; --|---> stats[1] : of CPU1 > ...| >|---> stats[n] : of CPUn > #ifdef CONFIG_FAIR_GROUP_SCHED > struct sched_entity **se; > #endif > #ifdef CONFIG_RT_GROUP_SCHED > struct sched_rt_entity **rt_se; > #endif > ... > }; > > After the patch it is clearly that both of se or rt_se can easily get the > sched_statistics by a task_struct or a task_group. 
> > - Function Interface > > The original prototype of the schedstats helpers are > > update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) > > The cfs_rq in these helpers is used to get the rq_clock, and the se is > used to get the struct sched_statistics and the struct task_struct. In > order to make these helpers available by all sched classes, we can pass > the rq, sched_statistics and task_struct directly. > > Then the new helpers are > > update_stats_wait_*(struct rq *rq, struct task_struct *p, > struct sched_statistics *stats) > > which are independent of fair sched class. > > To avoid vmlinux growing too large or introducing ovehead when > !schedstat_enabled(), some new helpers after schedstat_enabled() are also > introduced, Suggested by Mel. These helpers are in sched/stats.c, > > __update_stats_wait_*(struct rq *rq, struct task_struct *p, > struct sched_statistics *stats) > > - Implementation > > After we make the struct sched_statistics and the helpers of it > independent of fair sched class, we can easily use the schedstats > facility for RT sched class. > > The schedstat usage in RT sched class is similar with fair sched class, > for example, > fairRT > enqueue update_stats_enqueue_fair update_stats_enqueue_rt > dequeue update_stats_dequeue_fair update_stats_dequeue_rt > put_prev_task update_stats_wait_start update_stats_wait_start > set_next_task update_stats_wait_end update_stats_wait_end > > - Usage > > The user can get the schedstats information in the same way in fair sched > class. For example, > fairRT > task show /proc/[pid]/sched /proc/[pid]/sched > group show cpu.stat in cgroup cpu.stat in cgroup > > The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can > be used to trace RT tasks as well. sched_stat_runtime can also be > supported in the future if it is helpful. > > Yafang Shao (5): > sched: don't include stats.h in sched.h > sched, fair: use __sched
[RFC PATCH v3 4/5] sched: make schedstats helpers independent of fair sched class
The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing ovehead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) Cc: Mel Gorman Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 140 +++ kernel/sched/stats.c | 104 kernel/sched/stats.h | 32 ++ 3 files changed, 157 insertions(+), 119 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14d8df308d44..b869a83fac29 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -917,69 +917,44 @@ static void update_curr_fair(struct rq *rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; - u64 wait_start, prev_wait_start; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - __schedstat_from_sched_entity(se, ); - - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(stats->wait_start); + if (entity_is_task(se)) + p = task_of(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + __schedstat_from_sched_entity(se, ); - __schedstat_set(stats->wait_start, wait_start); + 
__update_stats_wait_start(rq_of(cfs_rq), p, stats); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 delta; if (!schedstat_enabled()) return; - __schedstat_from_sched_entity(se, ); - - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start); - if (entity_is_task(se)) { + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. -*/ - __schedstat_set(stats->wait_start, delta); - - return; - } - - trace_sched_stat_wait(p, delta); - } + __schedstat_from_sched_entity(se, ); - __schedstat_set(stats->wait_max, - max(schedstat_val(stats->wait_max), delta)); - __schedstat_inc(stats->wait_count); - __schedstat_add(stats->wait_sum, delta); - __schedstat_set(stats->wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; @@ -989,67 +964,14 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) __schedstat_from_sched_entity(se, ); - sleep_start = schedstat_val(stats->sleep_start); - block_start = schedstat_val(stats->block_start); - - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(stats->sleep_max))) - __schedstat_set(stats->sleep_max, delta); - - __schedstat_set(stats->sleep_start, 0); - __schedstat_add(stats->sum_sleep_runtime, delta); - - if (p) { - 
account_scheduler_latency(p, delta >> 10, 1); - trace_sched_stat_sleep(p, delta)
[RFC PATCH v3 3/5] sched: make struct sched_statistics independent of fair sched class
If we want to use schedstats facility, we should move out of struct sched_statistics from the struct sched_entity or add it into other structs of sched entity as well. Obviously the latter one is bad because that requires more spaces. So we should move it into a common struct which can be used by all sched classes. The struct sched_statistics is the scheduler statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. Below is the detailed explanation of the change in the structs. - Before this patch struct task_struct {|-> struct sched_entity { ... | ... struct sched_entity *se; ---| struct sched_statistics statistics; struct sched_rt_entity *rt; ... ... ... }; }; struct task_group { |--> se[0]->statistics : schedstats of CPU0 ... | #ifdef CONFIG_FAIR_GROUP_SCHED | struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 | #endif | |--> se[N]->statistics : schedstats of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_rt_entity **rt_se; (N/A) #endif ... }; The '**se' in task_group is allocated in the fair sched class, which is hard to be reused by other sched class. - After this patch struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; After the patch it is clear that both of se or rt_se can easily get the sched_statistics by a task_struct or a task_group. 
Signed-off-by: Yafang Shao --- include/linux/sched.h| 3 +- kernel/sched/core.c | 24 --- kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 81 +++--- kernel/sched/fair.c | 142 ++- kernel/sched/rt.c| 4 +- kernel/sched/sched.h | 3 + kernel/sched/stats.h | 46 + kernel/sched/stop_task.c | 4 +- 9 files changed, 206 insertions(+), 105 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..f8e969be4bee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -462,8 +462,6 @@ struct sched_entity { u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -681,6 +679,7 @@ struct task_struct { unsigned intrt_priority; const struct sched_class*sched_class; + struct sched_statistics stats; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd76628778f7..081b4f1f2cb4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2427,11 +2427,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->stats.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -2443,14 +2443,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + 
__schedstat_inc(p->stats.nr_wakeups_sync); } /* @@ -3075,7 +3075,7 @@ static void __sched_fork(unsigned long clone_flags, struct task
[RFC PATCH v3 2/5] sched, fair: use __schedstat_set() in set_next_entity()
schedstat_enabled() has already been checked, so we can use __schedstat_set() directly. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ff1daa3d9bb..7e7c03cede94 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4414,7 +4414,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, + __schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -- 2.18.4
[RFC PATCH v3 0/5] sched: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. This patch enable it for RT sched class as well. - Structure Before we make schedstats available for RT sched class, we should make struct sched_statistics independent of fair sched class first. The struct sched_statistics is the schedular statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. Below is the detailed explaination of the change in the structs. The struct before this patch: struct task_struct {|-> struct sched_entity { ... | ... struct sched_entity *se; ---| struct sched_statistics statistics; struct sched_rt_entity *rt; ... ... ... }; }; struct task_group { |--> se[0]->statistics : schedstats of CPU0 ... | #ifdef CONFIG_FAIR_GROUP_SCHED | struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 | #endif | |--> se[N]->statistics : schedstats of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_rt_entity **rt_se; (N/A) #endif ... }; The '**se' in task_group is allocated in the fair sched class, which is hard to be reused by other sched class. The struct after this patch: struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; After the patch it is clearly that both of se or rt_se can easily get the sched_statistics by a task_struct or a task_group. 
- Function Interface The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing overhead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) - Implementation After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for RT sched class. The schedstat usage in RT sched class is similar to fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end - Usage The user can get the schedstats information in the same way as in fair sched class. For example, fairRT task show /proc/[pid]/sched /proc/[pid]/sched group show cpu.stat in cgroup cpu.stat in cgroup The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can be used to trace RT tasks as well. sched_stat_runtime can also be supported in the future if it is helpful. 
Yafang Shao (5): sched: don't include stats.h in sched.h sched, fair: use __schedstat_set() in set_next_entity() sched: make struct sched_statistics independent of fair sched class sched: make schedstats helpers independent of fair sched class sched, rt: support schedstats for RT sched class include/linux/sched.h| 3 +- kernel/sched/core.c | 25 +++-- kernel/sched/deadline.c | 5 +- kernel/sched/debug.c | 82 +++ kernel/sched/fair.c | 209 +++ kernel/sched/idle.c | 1 + kernel/sched/rt.c| 178 - kernel/sched/sched.h | 13 ++- kernel/sched/stats.c | 105
[RFC PATCH v3 1/5] sched: don't include stats.h in sched.h
This patch is a preparation of the followup patches. In the followup patches some common helpers will be defined in stats.h, and these common helpers require some definitions in sched.h, so let's move stats.h out of sched.h. The source files which require stats.h include it specifically. Signed-off-by: Yafang Shao --- kernel/sched/core.c | 1 + kernel/sched/deadline.c | 1 + kernel/sched/debug.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/idle.c | 1 + kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 6 +- kernel/sched/stats.c | 1 + kernel/sched/stats.h | 2 ++ kernel/sched/stop_task.c | 1 + 10 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..fd76628778f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #undef CREATE_TRACE_POINTS #include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..7a0124f81a4f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ *Fabio Checconi */ #include "sched.h" +#include "stats.h" #include "pelt.h" struct dl_bandwidth def_dl_bandwidth; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2357921580f9..9758aa1bba1e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -7,6 +7,7 @@ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ #include "sched.h" +#include "stats.h" static DEFINE_SPINLOCK(sched_debug_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8917d2d715ef..8ff1daa3d9bb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,7 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "stats.h" /* * Targeted preemption latency for CPU-bound tasks: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..95c02cbca04a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ *tasks which are handled in sched/fair.c ) */ 
#include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..af772ac0f32d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,7 +4,7 @@ * policies) */ #include "sched.h" - +#include "stats.h" #include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..871544bb9a38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,9 @@ /* * Scheduler internal types and methods: */ +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H + #include #include @@ -1538,7 +1541,6 @@ extern void flush_smp_call_function_from_idle(void); static inline void flush_smp_call_function_from_idle(void) { } #endif -#include "stats.h" #include "autogroup.h" #ifdef CONFIG_CGROUP_SCHED @@ -2633,3 +2635,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed..844bd9dbfbf0 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -3,6 +3,7 @@ * /proc/schedstat implementation */ #include "sched.h" +#include "stats.h" /* * Current schedstat API version. diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 33d0daf83842..c23b653ffc53 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,6 +2,8 @@ #ifdef CONFIG_SCHEDSTATS +#include "sched.h" + /* * Expects runqueue lock to be held for atomicity of update */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..a5d289049388 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "stats.h" #ifdef CONFIG_SMP static int -- 2.18.4
Re: [RFC PATCH v2 3/5] sched: make schedstats helper independent of cfs_rq
On Tue, Nov 24, 2020 at 7:40 PM Mel Gorman wrote: > > On Mon, Nov 23, 2020 at 08:58:06PM +0800, Yafang Shao wrote: > > The 'cfs_rq' in these helpers > > update_stats_{wait_start, wait_end, enqueue_sleeper} is only used to get > > the rq_clock, so we can pass the rq directly. Then these helpers can be > > used by all sched class after being moved into stats.h. > > > > After that change, the size of vmlinux is increased around 824Bytes. > > w/o this patch, with this patch > > Size of vmlinux: 7844383278444656 > > > > Signed-off-by: Yafang Shao > > The inline helpers are quite large. When I was suggesting that the overhead > was minimal, what I expected what that the inline functions would be a > schedstat_enabled() followed by a real function call. It would introduce > a small additional overhead when schedstats are enabled but avoid vmlinux > growing too large > > e.g. > > static inline void > update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) > { > if (!schedstat_enabled()) > return; > > __update_stats_wait_start(cfs_rq, se); > } > > where __update_stats_wait_start then lives in kernel/sched/stats.c > Good idea! Now I understand what you mean. Thanks for the detailed explanation. I will update it in the next version. -- Thanks Yafang
[RFC PATCH v2 1/5] sched: don't include stats.h in sched.h
This patch is a preparation of the followup patches. In the followup patches some common helpers will be defined in stats.h, and these common helpers require some definitions in sched.h, so let's move stats.h out of sched.h. The source files which require stats.h include it specifically. Signed-off-by: Yafang Shao --- kernel/sched/core.c | 1 + kernel/sched/deadline.c | 1 + kernel/sched/debug.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/idle.c | 1 + kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 6 +- kernel/sched/stats.c | 1 + kernel/sched/stats.h | 2 ++ kernel/sched/stop_task.c | 1 + 10 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..fd76628778f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #undef CREATE_TRACE_POINTS #include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..7a0124f81a4f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ *Fabio Checconi */ #include "sched.h" +#include "stats.h" #include "pelt.h" struct dl_bandwidth def_dl_bandwidth; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2357921580f9..9758aa1bba1e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -7,6 +7,7 @@ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ #include "sched.h" +#include "stats.h" static DEFINE_SPINLOCK(sched_debug_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8917d2d715ef..8ff1daa3d9bb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,7 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "stats.h" /* * Targeted preemption latency for CPU-bound tasks: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..95c02cbca04a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ *tasks which are handled in sched/fair.c ) */ 
#include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..af772ac0f32d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,7 +4,7 @@ * policies) */ #include "sched.h" - +#include "stats.h" #include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..871544bb9a38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,9 @@ /* * Scheduler internal types and methods: */ +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H + #include #include @@ -1538,7 +1541,6 @@ extern void flush_smp_call_function_from_idle(void); static inline void flush_smp_call_function_from_idle(void) { } #endif -#include "stats.h" #include "autogroup.h" #ifdef CONFIG_CGROUP_SCHED @@ -2633,3 +2635,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed..844bd9dbfbf0 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -3,6 +3,7 @@ * /proc/schedstat implementation */ #include "sched.h" +#include "stats.h" /* * Current schedstat API version. diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 33d0daf83842..c23b653ffc53 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,6 +2,8 @@ #ifdef CONFIG_SCHEDSTATS +#include "sched.h" + /* * Expects runqueue lock to be held for atomicity of update */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..a5d289049388 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "stats.h" #ifdef CONFIG_SMP static int -- 2.18.4
[RFC PATCH v2 4/5] sched: define update_stats_curr_start() as a common helper
update_stats_curr_start() is used to update the exec_start when we are starting a new run period, which is used by all sched class. So we'd better define it as a common helper. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 14 +- kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 12 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 946b60f586e4..13e803369ced 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -915,18 +915,6 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } -/* - * We are picking a new current task - update its stats: - */ -static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* -* We are starting a new run period: -*/ - se->exec_start = rq_clock_task(rq_of(cfs_rq)); -} - /** * Scheduling class queueing methods: */ @@ -4255,7 +4243,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); } - update_stats_curr_start(cfs_rq, se); + update_stats_curr_start(rq_of(cfs_rq), se); cfs_rq->curr = se; /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index af772ac0f32d..3422dd85cfb4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1574,7 +1574,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { - p->se.exec_start = rq_clock_task(rq); + update_stats_curr_start(rq, >se); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a4576ccf3d7..3948112dc31c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2644,4 +2644,16 @@ static inline struct task_struct *task_of(struct sched_entity *se) return container_of(se, struct task_struct, se); } +/* + * We are picking a new current task - update its stats: + */ +static inline void 
+update_stats_curr_start(struct rq *rq, struct sched_entity *se) +{ + /* +* We are starting a new run period: +*/ + se->exec_start = rq_clock_task(rq); +} + #endif /* _KERNEL_SCHED_SCHED_H */ -- 2.18.4
[RFC PATCH v2 0/5] sched: support schedstat for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstat facility, but currently schedstat is only supported for fair sched class. This patchset enables it for RT sched class as well. The schedstat statistics are defined in struct sched_entity, which is a member of struct task_struct, so we can reuse it for RT sched class. The schedstat usage in RT sched class is similar to fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end show/proc/[pid]/sched /proc/[pid]/sched The sched:sched_stats_* tracepoints can be used to trace RT tasks as well after this patchset. PATCH #1 ~ #4 are the preparation for PATCH #5. - v2: keep the schedstats functions inline, per Mel. Yafang Shao (5): sched: don't include stats.h in sched.h sched: define task_of() as a common helper sched: make schedstats helper independent of cfs_rq sched: define update_stats_curr_start() as a common helper sched, rt: support schedstat for RT sched class kernel/sched/core.c | 1 + kernel/sched/deadline.c | 1 + kernel/sched/debug.c | 1 + kernel/sched/fair.c | 174 ++- kernel/sched/idle.c | 1 + kernel/sched/rt.c| 94 - kernel/sched/sched.h | 30 ++- kernel/sched/stats.c | 1 + kernel/sched/stats.h | 146 kernel/sched/stop_task.c | 1 + 10 files changed, 280 insertions(+), 170 deletions(-) -- 2.18.4
[RFC PATCH v2 2/5] sched: define task_of() as a common helper
task_of() is used to get task_struct from sched_entity. As sched_entity in struct task_struct can be used by all sched class, we'd better move this macro into sched.h, then it can be used by all sched class. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 11 --- kernel/sched/sched.h | 8 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ff1daa3d9bb..59e454cae3be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -259,12 +259,6 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} - /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) @@ -446,11 +440,6 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 871544bb9a38..9a4576ccf3d7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2636,4 +2636,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + SCHED_WARN_ON(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + #endif /* _KERNEL_SCHED_SCHED_H */ -- 2.18.4
[RFC PATCH v2 3/5] sched: make schedstats helper independent of cfs_rq
The 'cfs_rq' in these helpers update_stats_{wait_start, wait_end, enqueue_sleeper} is only used to get the rq_clock, so we can pass the rq directly. Then these helpers can be used by all sched class after being moved into stats.h. After that change, the size of vmlinux is increased around 824Bytes. w/o this patch, with this patch Size of vmlinux:7844383278444656 Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 148 ++- kernel/sched/stats.h | 144 + 2 files changed, 149 insertions(+), 143 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 59e454cae3be..946b60f586e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -869,124 +869,6 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(>curr->se)); } -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - u64 wait_start, prev_wait_start; - - if (!schedstat_enabled()) - return; - - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; - - __schedstat_set(se->statistics.wait_start, wait_start); -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct task_struct *p; - u64 delta; - - if (!schedstat_enabled()) - return; - - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se)) { - p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. 
-*/ - __schedstat_set(se->statistics.wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); -} - -static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct task_struct *tsk = NULL; - u64 sleep_start, block_start; - - if (!schedstat_enabled()) - return; - - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); - - if (entity_is_task(se)) - tsk = task_of(se); - - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); - - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); - - __schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - __schedstat_inc(se->statistics.iowait_count); - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* -* Blocking time is in units of nanosecs, so shift by -* 20 to get a milliseconds-range estimation of the -* amount of time that the task spent sleeping: -*/ -
[RFC PATCH v2 5/5] sched, rt: support schedstat for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstat facility, but currently schedstat is only supported for fair sched class. This patch enable it for RT sched class as well. The schedstat statistics are define in struct sched_entity, which is a member of struct task_struct, so we can resue it for RT sched class. The schedstat usage in RT sched class is similar with fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end show/proc/[pid]/sched /proc/[pid]/sched The sched:sched_stats_* tracepoints can be used to trace RT tasks as well. Signed-off-by: Yafang Shao --- kernel/sched/rt.c| 90 kernel/sched/sched.h | 4 ++ 2 files changed, 94 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3422dd85cfb4..f2eff92275f0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1246,6 +1246,75 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } +#ifdef CONFIG_SCHEDSTATS + +static inline bool +rt_se_is_waiting(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + return rt_se != rt_rq->curr; +} + +static inline void +rt_rq_curr_set(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + rt_rq->curr = rt_se; +} + +#else + +static inline bool +rt_se_is_waiting(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + return false; +} + +static inline void +rt_rq_curr_set(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ +} + +#endif + +static inline void +update_stats_enqueue_rt(struct rq *rq, struct sched_entity *se, + struct sched_rt_entity *rt_se, int flags) +{ + struct rt_rq *rt_rq = >rt; + + if (!schedstat_enabled()) + return; + + if (rt_se_is_waiting(rt_rq, rt_se)) + update_stats_wait_start(rq, se); + + if (flags & ENQUEUE_WAKEUP) + 
update_stats_enqueue_sleeper(rq, se); +} + +static inline void +update_stats_dequeue_rt(struct rq *rq, struct sched_entity *se, + struct sched_rt_entity *rt_se, int flags) +{ + struct rt_rq *rt_rq = >rt; + + if (!schedstat_enabled()) + return; + + if (rt_se_is_waiting(rt_rq, rt_se)) + update_stats_wait_end(rq, se); + + if ((flags & DEQUEUE_SLEEP) && rt_entity_is_task(rt_se)) { + struct task_struct *tsk = rt_task_of(rt_se); + + if (tsk->state & TASK_INTERRUPTIBLE) + __schedstat_set(se->statistics.sleep_start, + rq_clock(rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + __schedstat_set(se->statistics.block_start, + rq_clock(rq)); + } +} + /* * Change rt_se->run_list location unless SAVE && !MOVE * @@ -1275,6 +1344,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag struct rt_prio_array *array = _rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); struct list_head *queue = array->queue + rt_se_prio(rt_se); + struct task_struct *task = rt_task_of(rt_se); /* * Don't enqueue the group if its throttled, or when empty. 
@@ -1288,6 +1358,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag return; } + update_stats_enqueue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, flags); + if (move_entity(flags)) { WARN_ON_ONCE(rt_se->on_list); if (flags & ENQUEUE_HEAD) @@ -1307,7 +1379,9 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = _rq->active; + struct task_struct *task = rt_task_of(rt_se); + update_stats_dequeue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, flags); if (move_entity(flags)) { WARN_ON_ONCE(!rt_se->on_list); __delist_rt_entity(rt_se, array); @@ -1374,6 +1448,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; + check_schedstat_required(); enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1574,6 +1649,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_
Re: [RFC PATCH 4/4] sched, rt: support schedstat for RT sched class
On Fri, Nov 20, 2020 at 10:39 AM jun qian wrote: > > Yafang Shao 于2020年11月19日周四 上午11:55写道: > > > > We want to measure the latency of RT tasks in our production > > environment with schedstat facility, but currently schedstat is only > > supported for fair sched class. This patch enable it for RT sched class > > as well. > > > > The schedstat statistics are define in struct sched_entity, which is a > > member of struct task_struct, so we can resue it for RT sched class. > > > > The schedstat usage in RT sched class is similar with fair sched class, > > for example, > > fairRT > > enqueue update_stats_enqueue_fair update_stats_enqueue_rt > > dequeue update_stats_dequeue_fair update_stats_dequeue_rt > > put_prev_task update_stats_wait_start update_stats_wait_start > > set_next_task update_stats_wait_end update_stats_wait_end > > show/proc/[pid]/sched /proc/[pid]/sched > > > > The sched:sched_stats_* tracepoints can be used to trace RT tasks as > > well. > > > > Signed-off-by: Yafang Shao > > --- > > kernel/sched/rt.c| 61 > > kernel/sched/sched.h | 2 ++ > > 2 files changed, 63 insertions(+) > > > > diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c > > index b9ec886702a1..a318236b7166 100644 > > --- a/kernel/sched/rt.c > > +++ b/kernel/sched/rt.c > > @@ -1246,6 +1246,46 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, > > struct rt_rq *rt_rq) > > dec_rt_group(rt_se, rt_rq); > > } > > > > Does the deadline schedule class should be considered also? > deadline sched class can be supported as well per my understanding, I think we can do it later. This patchset only aims to support RT sched class. 
> thanks > > > +static inline void > > +update_stats_enqueue_rt(struct rq *rq, struct sched_entity *se, > > + struct sched_rt_entity *rt_se, int flags) > > +{ > > + struct rt_rq *rt_rq = >rt; > > + > > + if (!schedstat_enabled()) > > + return; > > + > > + if (rt_se != rt_rq->curr) > > + update_stats_wait_start(rq, se); > > + > > + if (flags & ENQUEUE_WAKEUP) > > + update_stats_enqueue_sleeper(rq, se); > > +} > > + > > +static inline void > > +update_stats_dequeue_rt(struct rq *rq, struct sched_entity *se, > > + struct sched_rt_entity *rt_se, int flags) > > +{ > > + struct rt_rq *rt_rq = >rt; > > + > > + if (!schedstat_enabled()) > > + return; > > + > > + if (rt_se != rt_rq->curr) > > + update_stats_wait_end(rq, se); > > + > > + if ((flags & DEQUEUE_SLEEP) && rt_entity_is_task(rt_se)) { > > + struct task_struct *tsk = rt_task_of(rt_se); > > + > > + if (tsk->state & TASK_INTERRUPTIBLE) > > + __schedstat_set(se->statistics.sleep_start, > > + rq_clock(rq)); > > + if (tsk->state & TASK_UNINTERRUPTIBLE) > > + __schedstat_set(se->statistics.block_start, > > + rq_clock(rq)); > > + } > > +} > > + > > /* > > * Change rt_se->run_list location unless SAVE && !MOVE > > * > > @@ -1275,6 +1315,7 @@ static void __enqueue_rt_entity(struct > > sched_rt_entity *rt_se, unsigned int flag > > struct rt_prio_array *array = _rq->active; > > struct rt_rq *group_rq = group_rt_rq(rt_se); > > struct list_head *queue = array->queue + rt_se_prio(rt_se); > > + struct task_struct *task = rt_task_of(rt_se); > > > > /* > > * Don't enqueue the group if its throttled, or when empty. 
> > @@ -1288,6 +1329,8 @@ static void __enqueue_rt_entity(struct > > sched_rt_entity *rt_se, unsigned int flag > > return; > > } > > > > + update_stats_enqueue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, > > flags); > > + > > if (move_entity(flags)) { > > WARN_ON_ONCE(rt_se->on_list); > > if (flags & ENQUEUE_HEAD) > > @@ -1307,7 +1350,9 @@ static void __dequeue_rt_entity(struct > > sched_rt_entity *rt_se, unsigned int flag > > { > >
Re: [RFC PATCH 2/4] sched: make schedstats helpers not depend on cfs_rq
On Thu, Nov 19, 2020 at 3:45 PM Mel Gorman wrote: > > On Thu, Nov 19, 2020 at 11:52:28AM +0800, Yafang Shao wrote: > > The 'cfs_rq' in these helpers is only used to get the rq_clock, so we > > can pass the rq_clock directly. After that, these helpers can be used by > > all sched class. > > > > Signed-off-by: Yafang Shao > > This introduces overhead in the general case even when schedstats is > disabled. Previously, update_stats_wait_start was a static inline so > function call overhead was avoided and schedstat_enabled() meant the > overhead was negligible. As it's now a function call, the cost of the > function entry/exit will be unconditionally hit regardless of intrest > in schedstat. > > Regardless of the merit of adding schedstats for RT, the overhead of > schedstats when stats are disabled should remain the same with the > static branch check done in an inline function. > Thanks for the explanation. I will make them inline in the next version. -- Thanks Yafang
[RFC PATCH 2/4] sched: make schedstats helpers not depend on cfs_rq
The 'cfs_rq' in these helpers is only used to get the rq_clock, so we can pass the rq_clock directly. After that, these helpers can be used by all sched class. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 148 ++- kernel/sched/stats.c | 134 +++ kernel/sched/stats.h | 11 3 files changed, 150 insertions(+), 143 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9d73e8e5ebec..aba21191283d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -868,124 +868,6 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(>curr->se)); } -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - u64 wait_start, prev_wait_start; - - if (!schedstat_enabled()) - return; - - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; - - __schedstat_set(se->statistics.wait_start, wait_start); -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct task_struct *p; - u64 delta; - - if (!schedstat_enabled()) - return; - - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se)) { - p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. 
-*/ - __schedstat_set(se->statistics.wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); -} - -static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct task_struct *tsk = NULL; - u64 sleep_start, block_start; - - if (!schedstat_enabled()) - return; - - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); - - if (entity_is_task(se)) - tsk = task_of(se); - - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); - - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); - - __schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - __schedstat_inc(se->statistics.iowait_count); - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* -* Blocking time is in units of nanosecs, so shift by -* 20 to get a milliseconds-range estimation of the -* amount of time that the task spent sleeping: -*/ - if (unlikely(prof_on == SLEEP_PROFILING)) { - 
profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), -
[RFC PATCH 3/4] sched: define update_stats_curr_start() as a common helper
update_stats_curr_start() is used to update the exec_start when we are starting a new run period, which is used by all sched class. So we'd better define it as a common helper. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 14 +- kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 12 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aba21191283d..b762cc3e165c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -914,18 +914,6 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } -/* - * We are picking a new current task - update its stats: - */ -static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* -* We are starting a new run period: -*/ - se->exec_start = rq_clock_task(rq_of(cfs_rq)); -} - /** * Scheduling class queueing methods: */ @@ -4254,7 +4242,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); } - update_stats_curr_start(cfs_rq, se); + update_stats_curr_start(rq_of(cfs_rq), se); cfs_rq->curr = se; /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..b9ec886702a1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1574,7 +1574,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { - p->se.exec_start = rq_clock_task(rq); + update_stats_curr_start(rq, >se); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e29f2375c4f5..28986736ced9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2642,3 +2642,15 @@ static inline struct task_struct *task_of(struct sched_entity *se) return container_of(se, struct task_struct, se); } +/* + * We are picking a new current task - update its stats: + */ +static inline void 
+update_stats_curr_start(struct rq *rq, struct sched_entity *se) +{ + /* +* We are starting a new run period: +*/ + se->exec_start = rq_clock_task(rq); +} + -- 2.18.4
[RFC PATCH 4/4] sched, rt: support schedstat for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstat facility, but currently schedstat is only supported for fair sched class. This patch enable it for RT sched class as well. The schedstat statistics are define in struct sched_entity, which is a member of struct task_struct, so we can resue it for RT sched class. The schedstat usage in RT sched class is similar with fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end show/proc/[pid]/sched /proc/[pid]/sched The sched:sched_stats_* tracepoints can be used to trace RT tasks as well. Signed-off-by: Yafang Shao --- kernel/sched/rt.c| 61 kernel/sched/sched.h | 2 ++ 2 files changed, 63 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b9ec886702a1..a318236b7166 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1246,6 +1246,46 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } +static inline void +update_stats_enqueue_rt(struct rq *rq, struct sched_entity *se, + struct sched_rt_entity *rt_se, int flags) +{ + struct rt_rq *rt_rq = >rt; + + if (!schedstat_enabled()) + return; + + if (rt_se != rt_rq->curr) + update_stats_wait_start(rq, se); + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper(rq, se); +} + +static inline void +update_stats_dequeue_rt(struct rq *rq, struct sched_entity *se, + struct sched_rt_entity *rt_se, int flags) +{ + struct rt_rq *rt_rq = >rt; + + if (!schedstat_enabled()) + return; + + if (rt_se != rt_rq->curr) + update_stats_wait_end(rq, se); + + if ((flags & DEQUEUE_SLEEP) && rt_entity_is_task(rt_se)) { + struct task_struct *tsk = rt_task_of(rt_se); + + if (tsk->state & TASK_INTERRUPTIBLE) + __schedstat_set(se->statistics.sleep_start, + rq_clock(rq)); + if 
(tsk->state & TASK_UNINTERRUPTIBLE) + __schedstat_set(se->statistics.block_start, + rq_clock(rq)); + } +} + /* * Change rt_se->run_list location unless SAVE && !MOVE * @@ -1275,6 +1315,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag struct rt_prio_array *array = _rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); struct list_head *queue = array->queue + rt_se_prio(rt_se); + struct task_struct *task = rt_task_of(rt_se); /* * Don't enqueue the group if its throttled, or when empty. @@ -1288,6 +1329,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag return; } + update_stats_enqueue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, flags); + if (move_entity(flags)) { WARN_ON_ONCE(rt_se->on_list); if (flags & ENQUEUE_HEAD) @@ -1307,7 +1350,9 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = _rq->active; + struct task_struct *task = rt_task_of(rt_se); + update_stats_dequeue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, flags); if (move_entity(flags)) { WARN_ON_ONCE(!rt_se->on_list); __delist_rt_entity(rt_se, array); @@ -1374,6 +1419,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; + check_schedstat_required(); enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1574,6 +1620,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { + struct sched_rt_entity *rt_se = >rt; + struct rt_rq *rt_rq = >rt; + + if (on_rt_rq(>rt)) + update_stats_wait_end(rq, >se); + update_stats_curr_start(rq, >se); /* The running task is never eligible for pushing */ @@ -1591,6 +1643,8 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); 
rt_queue_push_tasks(rq); +
[RFC PATCH 1/4] sched: define task_of() as a common helper
task_of() is used to get task_struct from sched_entity. As sched_entity in struct task_struct can be used by all sched class, we'd better move this macro into sched.h, then it can be used by all sched class. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 11 --- kernel/sched/sched.h | 9 + 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8917d2d715ef..9d73e8e5ebec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -258,12 +258,6 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} - /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) @@ -445,11 +439,6 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..e29f2375c4f5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2633,3 +2633,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + SCHED_WARN_ON(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + -- 2.18.4
[RFC PATCH 0/4] sched: support schedstat for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstat facility, but currently schedstat is only supported for fair sched class. This patchset enables it for RT sched class as well. The schedstat statistics are defined in struct sched_entity, which is a member of struct task_struct, so we can reuse it for RT sched class. The schedstat usage in RT sched class is similar to fair sched class, for example, | fair | RT enqueue | update_stats_enqueue_fair | update_stats_enqueue_rt dequeue | update_stats_dequeue_fair | update_stats_dequeue_rt put_prev_task | update_stats_wait_start | update_stats_wait_start set_next_task | update_stats_wait_end | update_stats_wait_end show | /proc/[pid]/sched | /proc/[pid]/sched The sched:sched_stats_* tracepoints can be used to trace RT tasks as well after that patchset. PATCH #1 ~ #3 are the preparation of PATCH #4. Yafang Shao (4): sched: define task_of() as a common helper sched: make schedstats helpers not depend on cfs_rq sched: define update_stats_curr_start() as a common helper sched, rt: support schedstat for RT sched class kernel/sched/fair.c | 173 ++- kernel/sched/rt.c| 63 +++- kernel/sched/sched.h | 23 ++ kernel/sched/stats.c | 134 + kernel/sched/stats.h | 11 +++ 5 files changed, 236 insertions(+), 168 deletions(-) -- 2.18.4
Re: [PATCH 1/1] Sched/fair: Improve the accuracy of sched_stat_wait statistics
On Wed, Oct 14, 2020 at 9:19 PM Peter Zijlstra wrote: > > On Fri, Oct 09, 2020 at 05:25:30PM +0800, qianjun.ker...@gmail.com wrote: > > From: jun qian > > > > When the sched_schedstat changes from 0 to 1, some sched se maybe > > already in the runqueue, the se->statistics.wait_start will be 0. > > So it will let the (rq_of(cfs_rq)) - se->statistics.wait_start) > > wrong. We need to avoid this scenario. > > > > Signed-off-by: jun qian > > Signed-off-by: Yafang Shao > > This SoB chain isn't valid. Did Yafang's tag need to a reviewed-by or > something? > This patch improves the behavior when sched_schedstat is changed from 0 to 1, so it looks good to me. Reviewed-by: Yafang Shao > > --- > > kernel/sched/fair.c | 9 + > > 1 file changed, 9 insertions(+) > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > > index 1a68a05..6f8ca0c 100644 > > --- a/kernel/sched/fair.c > > +++ b/kernel/sched/fair.c > > @@ -906,6 +906,15 @@ static void update_curr_fair(struct rq *rq) > > if (!schedstat_enabled()) > > return; > > > > + /* > > + * When the sched_schedstat changes from 0 to 1, some sched se > > + * maybe already in the runqueue, the se->statistics.wait_start > > + * will be 0.So it will let the delta wrong. We need to avoid this > > + * scenario. > > + */ > > + if (unlikely(!schedstat_val(se->statistics.wait_start))) > > + return; > > + > > delta = rq_clock(rq_of(cfs_rq)) - > > schedstat_val(se->statistics.wait_start); > > > > if (entity_is_task(se)) { > > -- > > 1.8.3.1 > > -- Thanks Yafang
Re: [PATCH] tracing: add tgid into common field
On Tue, Oct 13, 2020 at 9:05 PM Steven Rostedt wrote: > > On Tue, 13 Oct 2020 13:54:54 +0800 > Yafang Shao wrote: > > > --- a/include/linux/trace_events.h > > +++ b/include/linux/trace_events.h > > @@ -67,6 +67,7 @@ struct trace_entry { > > unsigned char flags; > > unsigned char preempt_count; > > int pid; > > + int tgid; > > }; > > > > Sorry, this adds another 4 bytes to *every* event, if you want it to or > not. I'd rather have some ways to remove fields from this header, and not > add more to it. > > I can't take this patch. > Right, that is an overhead. I will think about some ways to avoid adding it. -- Thanks Yafang
[PATCH] tracing: add tgid into common field
Sometimes we want to trace a specific multi-threaded process, which may create threads dynamically. Currently it is not easy to trace all its threads, because we can only filter these threads out by using common_pid. This patch adds the tgid into the common field as well, with which we can easily filter this multi-threaded process out. E.g. $ cd /sys/kernel/debug/tracing $ echo 'common_tgid == 4054' > events/sched/sched_wakeup/filter $ cat trace_pipe python-4057[005] d... 48003.898560: sched_wakeup: comm=python pid=4054 prio=120 target_cpu=002 python-4054[002] dNs. 48003.932906: sched_wakeup: comm=kworker/2:2 pid=130 prio=120 target_cpu=002 python-4054[002] dNH. 48003.932907: sched_wakeup: comm=cat pid=4084 prio=120 target_cpu=004 python-4055[003] d... 48004.816596: sched_wakeup: comm=python pid=4054 prio=120 target_cpu=002 With record-tgid set into trace_options, we can show the tgid, $ echo record-tgid > trace_options $ cat trace_pipe python-4054( 4054) [002] d... 48166.611771: sched_wakeup: comm=python pid=4055 prio=120 target_cpu=004 python-4057( 4054) [005] d... 48166.611776: sched_wakeup: comm=python pid=4054 prio=120 target_cpu=002 python-4055( 4054) [004] d... 48166.611848: sched_wakeup: comm=python pid=4054 prio=120 target_cpu=002 After that change, tgid_map is only used by saved_tgid, which may be used by some user tools, so I just keep it as-is. 
Signed-off-by: Yafang Shao --- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_events.c | 1 + kernel/trace/trace_output.c | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 5c6943354049..3725c05f0b01 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -67,6 +67,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + int tgid; }; #define TRACE_EVENT_TYPE_MAX \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3e5de717df2..c2423efaac2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2445,6 +2445,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, entry->preempt_count= pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->type = type; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a85effb2373b..9a5adcecf245 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -182,6 +182,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(int, tgid); return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 000e9dc224c6..e04dd45267c7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -591,7 +591,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { - unsigned int tgid = trace_find_tgid(entry->pid); + unsigned int tgid = entry->tgid; if (!tgid) trace_seq_printf(s, "(---) "); -- 2.17.1
Re: [PATCH v3 2/2] mmap_lock: add tracepoints around lock acquisition
On Sat, Oct 10, 2020 at 6:05 AM Axel Rasmussen wrote: > > The goal of these tracepoints is to be able to debug lock contention > issues. This lock is acquired on most (all?) mmap / munmap / page fault > operations, so a multi-threaded process which does a lot of these can > experience significant contention. > > We trace just before we start acquisition, when the acquisition returns > (whether it succeeded or not), and when the lock is released (or > downgraded). The events are broken out by lock type (read / write). > > The events are also broken out by memcg path. For container-based > workloads, users often think of several processes in a memcg as a single > logical "task", so collecting statistics at this level is useful. > > The end goal is to get latency information. This isn't directly included > in the trace events. Instead, users are expected to compute the time > between "start locking" and "acquire returned", using e.g. synthetic > events or BPF. The benefit we get from this is simpler code. > > Because we use tracepoint_enabled() to decide whether or not to trace, > this patch has effectively no overhead unless tracepoints are enabled at > runtime. If tracepoints are enabled, there is a performance impact, but > how much depends on exactly what e.g. the BPF program does. 
> > Signed-off-by: Axel Rasmussen Acked-by: Yafang Shao > --- > include/linux/mmap_lock.h| 93 ++-- > include/trace/events/mmap_lock.h | 70 > mm/Makefile | 2 +- > mm/mmap_lock.c | 87 ++ > 4 files changed, 246 insertions(+), 6 deletions(-) > create mode 100644 include/trace/events/mmap_lock.h > create mode 100644 mm/mmap_lock.c > > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > index 0707671851a8..6586b42b4faa 100644 > --- a/include/linux/mmap_lock.h > +++ b/include/linux/mmap_lock.h > @@ -1,11 +1,63 @@ > #ifndef _LINUX_MMAP_LOCK_H > #define _LINUX_MMAP_LOCK_H > > +#include > +#include > #include > +#include > +#include > +#include > > #define MMAP_LOCK_INITIALIZER(name) \ > .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), > > +DECLARE_TRACEPOINT(mmap_lock_start_locking); > +DECLARE_TRACEPOINT(mmap_lock_acquire_returned); > +DECLARE_TRACEPOINT(mmap_lock_released); > + > +#ifdef CONFIG_TRACING > + > +void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); > +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, > + bool success); > +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); > + > +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, > + bool write) > +{ > + if (tracepoint_enabled(mmap_lock_start_locking)) > + __mmap_lock_do_trace_start_locking(mm, write); > +} > + > +static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, > + bool write, bool > success) > +{ > + if (tracepoint_enabled(mmap_lock_acquire_returned)) > + __mmap_lock_do_trace_acquire_returned(mm, write, success); > +} > + > +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool > write) > +{ > + if (tracepoint_enabled(mmap_lock_released)) > + __mmap_lock_do_trace_released(mm, write); > +} > + > +#else /* !CONFIG_TRACING */ > + > +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, > + bool write) > +{ > +} > +static inline void 
__mmap_lock_trace_acquire_returned(struct mm_struct *mm, > + bool write, bool > success) > +{ > +} > +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool > write) > +{ > +} > + > +#endif /* CONFIG_TRACING */ > + > static inline void mmap_init_lock(struct mm_struct *mm) > { > init_rwsem(>mmap_lock); > @@ -13,58 +65,88 @@ static inline void mmap_init_lock(struct mm_struct *mm) > > static inline void mmap_write_lock(struct mm_struct *mm) > { > + __mmap_lock_trace_start_locking(mm, true); > down_write(>mmap_lock); > + __mmap_lock_trace_acquire_returned(mm, true, true); > } > > static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) > { > + __mmap_lock_trace_start_locking(mm, true); > down_write_nested(>mmap_lock, subclass); > +
Re: [PATCH] mmap_lock: add tracepoints around lock acquisition
On Thu, Sep 24, 2020 at 12:09 AM Steven Rostedt wrote: > > On Wed, 23 Sep 2020 18:04:17 +0800 > Yafang Shao wrote: > > > > What you can do, and what we have done is the following: > > > > > > (see include/linux/page_ref.h) > > > > > > > > > #ifdef CONFIG_TRACING > > > extern struct tracepoint __tracepoint_mmap_lock_start_locking; > > > extern struct tracepoint __tracepoint_mmap_lock_acquire_returned; > > > > > > #define mmap_lock_tracepoint_active(t) > > > static_key_false(&(__tracepoint_mmap_lock_##t).key) > > > > > > #else > > > #define mmap_lock_tracepoint_active(t) false > > > #endif > > > > > > static inline void mmap_write_lock(struct mm_struct *mm) > > > { > > > if (mmap_lock_tracepoint_active(start_locking)) > > > mmap_lock_start_trace_wrapper(); > > > down_write(>mmap_lock); > > > if (mmap_lock_tracepoint_active(acquire_returned)) > > > mmap_lock_acquire_trace_wrapper(); > > > } > > > > > > > > > -- Steve > > > > > > Great! > > > > Thanks Steve. > > If the above becomes useful, I may just add helper functions into a > header file that you can include. Perhaps call it: tracepoint_active() > and you need to pass in as an argument the name of the tracepoint. > Yes, please. The new helper would be useful. -- Thanks Yafang
Re: [PATCH] mmap_lock: add tracepoints around lock acquisition
On Wed, Sep 23, 2020 at 12:51 AM Steven Rostedt wrote: > > On Tue, 22 Sep 2020 12:09:19 +0800 > Yafang Shao wrote: > > > > > Are there any methods to avoid un-inlining these wrappers ? > > > > > > > > For example, > > > > // include/linux/mmap_lock.h > > > > > > > > void mmap_lock_start_trace_wrapper(); > > > > void mmap_lock_acquire_trace_wrapper(); > > > > > > > > static inline void mmap_write_lock(struct mm_struct *mm) > > > > { > > > > mmap_lock_start_trace_wrapper(); > > > > down_write(>mmap_lock); > > > > mmap_lock_acquire_trace_wrapper(); > > > > } > > > > > > > > // mm/mmap_lock.c > > > > void mmap_lock_start_trace_wrapper() > > > > { > > > > trace_mmap_lock_start(); > > > > } > > > > > > > > void mmap_lock_start_trace_wrapper() > > > > { > > > > trace_mmap_lock_acquired(); > > > > } > > > > > > We can do something like that, but I don't think it would end up being > > > better. > > > > > > At the end of the day, because the trace stuff cannot be in the > > > header, we have to add an extra function call one way or the other. > > > This would just move the call one step further down the call stack. > > > So, I don't think it would affect performance in the > > > CONFIG_MMAP_LOCK_STATS + tracepoints not enabled at runtime case. > > > > > > > Right, it seems we have to add an extra function call. > > > > > Also the wrappers aren't quite so simple as this, they need some > > > parameters to work. (the struct mm_struct, whether it was a read or a > > > write lock, and whether or not the lock operation succeeded), so it > > > would mean adding more inlined code, which I think adds up to be a > > > nontrivial amount since these wrappers are called so often in the > > > kernel. > > > > > > If you feel strongly, let me know and I can send a version as you > > > describe and we can compare the two. > > > > > > > These tracepoints will be less useful if we have to turn on the config > > to enable it. > > I don't mind implementing it that way if we can't optimize it. 
> > > > Maybe Steven can give some suggestions, Steven ? > > > > > What you can do, and what we have done is the following: > > (see include/linux/page_ref.h) > > > #ifdef CONFIG_TRACING > extern struct tracepoint __tracepoint_mmap_lock_start_locking; > extern struct tracepoint __tracepoint_mmap_lock_acquire_returned; > > #define mmap_lock_tracepoint_active(t) > static_key_false(&(__tracepoint_mmap_lock_##t).key) > > #else > #define mmap_lock_tracepoint_active(t) false > #endif > > static inline void mmap_write_lock(struct mm_struct *mm) > { > if (mmap_lock_tracepoint_active(start_locking)) > mmap_lock_start_trace_wrapper(); > down_write(>mmap_lock); > if (mmap_lock_tracepoint_active(acquire_returned)) > mmap_lock_acquire_trace_wrapper(); > } > > > -- Steve Great! Thanks Steve. -- Thanks Yafang
Re: [PATCH] mm/memcontrol: Add the drop_cache interface for cgroup v2
On Tue, Sep 22, 2020 at 3:27 PM Michal Hocko wrote: > > On Tue 22-09-20 12:20:52, Yafang Shao wrote: > > On Mon, Sep 21, 2020 at 7:36 PM Michal Hocko wrote: > > > > > > On Mon 21-09-20 19:23:01, Yafang Shao wrote: > > > > On Mon, Sep 21, 2020 at 7:05 PM Michal Hocko wrote: > > > > > > > > > > On Mon 21-09-20 18:55:40, Yafang Shao wrote: > > > > > > On Mon, Sep 21, 2020 at 4:12 PM Michal Hocko > > > > > > wrote: > > > > > > > > > > > > > > On Mon 21-09-20 16:02:55, zangchun...@bytedance.com wrote: > > > > > > > > From: Chunxin Zang > > > > > > > > > > > > > > > > In the cgroup v1, we have 'force_mepty' interface. This is very > > > > > > > > useful for userspace to actively release memory. But the cgroup > > > > > > > > v2 does not. > > > > > > > > > > > > > > > > This patch reuse cgroup v1's function, but have a new name for > > > > > > > > the interface. Because I think 'drop_cache' may be is easier to > > > > > > > > understand :) > > > > > > > > > > > > > > This should really explain a usecase. Global drop_caches is a > > > > > > > terrible > > > > > > > interface and it has caused many problems in the past. People have > > > > > > > learned to use it as a remedy to any problem they might see and > > > > > > > cause > > > > > > > other problems without realizing that. This is the reason why we > > > > > > > even > > > > > > > log each attempt to drop caches. > > > > > > > > > > > > > > I would rather not repeat the same mistake on the memcg level > > > > > > > unless > > > > > > > there is a very strong reason for it. > > > > > > > > > > > > > > > > > > > I think we'd better add these comments above the function > > > > > > mem_cgroup_force_empty() to explain why we don't want to expose this > > > > > > interface in cgroup2, otherwise people will continue to send this > > > > > > proposal without any strong reason. > > > > > > > > > > I do not mind people sending this proposal. 
"V1 used to have an > > > > > interface, we need it in v2 as well" is not really viable without > > > > > providing more reasoning on the specific usecase. > > > > > > > > > > _Any_ patch should have a proper justification. This is nothing really > > > > > new to the process and I am wondering why this is coming as a > > > > > surprise. > > > > > > > > > > > > > Container users always want to drop cache in a specific container, > > > > because they used to use drop_caches to fix memory pressure issues. > > > > > > This is exactly the kind of problems we have seen in the past. There > > > should be zero reason to addre potential reclaim problems by dropping > > > page cache on the floor. There is a huge cargo cult about this > > > procedure and I have seen numerous reports when people complained about > > > performance afterwards just to learn that the dropped page cache was one > > > of the resons for that. > > > > > > > Although drop_caches can cause some unexpected issues, it could also > > > > fix some issues. > > > > > > "Some issues" is way too general. We really want to learn about those > > > issues and address them properly. > > > > > > > One use case in our production environment is that some of our tasks > > become very latency sensitive from 7am to 10am, so before these tasks > > become active we will use drop_caches to drop page caches generated by > > other tasks at night to avoid these tasks triggering direct reclaim. > > > > The best way to do it is to fix the latency in direct reclaim, but it > > will take great effort. > > What is the latency triggered by the memory reclaim? It should be mostly > a clean page cache right as drop_caches only drops clean pages. Or is > this more about [id]cache? Do you have any profiles where is the time > spent? > Yes, we have analyzed the issues in the direct reclaim, but that is not the point. The point is that each case may take us several
Re: [PATCH] mm/memcontrol: Add the drop_cache interface for cgroup v2
On Mon, Sep 21, 2020 at 7:36 PM Michal Hocko wrote: > > On Mon 21-09-20 19:23:01, Yafang Shao wrote: > > On Mon, Sep 21, 2020 at 7:05 PM Michal Hocko wrote: > > > > > > On Mon 21-09-20 18:55:40, Yafang Shao wrote: > > > > On Mon, Sep 21, 2020 at 4:12 PM Michal Hocko wrote: > > > > > > > > > > On Mon 21-09-20 16:02:55, zangchun...@bytedance.com wrote: > > > > > > From: Chunxin Zang > > > > > > > > > > > > In the cgroup v1, we have 'force_mepty' interface. This is very > > > > > > useful for userspace to actively release memory. But the cgroup > > > > > > v2 does not. > > > > > > > > > > > > This patch reuse cgroup v1's function, but have a new name for > > > > > > the interface. Because I think 'drop_cache' may be is easier to > > > > > > understand :) > > > > > > > > > > This should really explain a usecase. Global drop_caches is a terrible > > > > > interface and it has caused many problems in the past. People have > > > > > learned to use it as a remedy to any problem they might see and cause > > > > > other problems without realizing that. This is the reason why we even > > > > > log each attempt to drop caches. > > > > > > > > > > I would rather not repeat the same mistake on the memcg level unless > > > > > there is a very strong reason for it. > > > > > > > > > > > > > I think we'd better add these comments above the function > > > > mem_cgroup_force_empty() to explain why we don't want to expose this > > > > interface in cgroup2, otherwise people will continue to send this > > > > proposal without any strong reason. > > > > > > I do not mind people sending this proposal. "V1 used to have an > > > interface, we need it in v2 as well" is not really viable without > > > providing more reasoning on the specific usecase. > > > > > > _Any_ patch should have a proper justification. This is nothing really > > > new to the process and I am wondering why this is coming as a surprise. 
> > > > > > > Container users always want to drop cache in a specific container, > > because they used to use drop_caches to fix memory pressure issues. > > This is exactly the kind of problems we have seen in the past. There > should be zero reason to addre potential reclaim problems by dropping > page cache on the floor. There is a huge cargo cult about this > procedure and I have seen numerous reports when people complained about > performance afterwards just to learn that the dropped page cache was one > of the resons for that. > > > Although drop_caches can cause some unexpected issues, it could also > > fix some issues. > > "Some issues" is way too general. We really want to learn about those > issues and address them properly. > One use case in our production environment is that some of our tasks become very latency sensitive from 7am to 10am, so before these tasks become active we will use drop_caches to drop page caches generated by other tasks at night to avoid these tasks triggering direct reclaim. The best way to do it is to fix the latency in direct reclaim, but it will take great effort. while drop_caches give us an easier way to achieve the same goal. IOW, drop_caches give the users an option to achieve their goal before they find a better solution. > > So container users want to use it in containers as > > well. > > If this feature is not implemented in cgroup, they will ask you why > > but there is no explanation in the kernel. > > There is no usecase that would really require it so far. > > > Regarding the memory.high, it is not perfect as well, because you have > > to set it to 0 to drop_caches, and the processes in the containers > > have to reclaim pages as well because they reach the memory.high, but > > memory.force_empty won't make other processes to reclaim. 
> > Since 536d3bf261a2 ("mm: memcontrol: avoid workload stalls when lowering > memory.high") the limit is set after the reclaim so the race window when > somebody would be pushed to high limit reclaim is reduced. But I do > agree this is just a workaround. > > > That doesn't mean I agree to add this interface, while I really mean > > that if we discard one feature we'd better explain why. > > We need to understand why somebody wants an interface because once it is > added it will have to be maintained for ever. > -- > Michal Hocko > SUSE Labs -- Thanks Yafang
Re: [PATCH] mmap_lock: add tracepoints around lock acquisition
On Tue, Sep 22, 2020 at 12:53 AM Axel Rasmussen wrote: > > On Sun, Sep 20, 2020 at 9:58 PM Yafang Shao wrote: > > > > On Fri, Sep 18, 2020 at 2:13 AM Axel Rasmussen > > wrote: > > > > > > The goal of these tracepoints is to be able to debug lock contention > > > issues. This lock is acquired on most (all?) mmap / munmap / page fault > > > operations, so a multi-threaded process which does a lot of these can > > > experience significant contention. > > > > > > We trace just before we start acquisition, when the acquisition returns > > > (whether it succeeded or not), and when the lock is released (or > > > downgraded). The events are broken out by lock type (read / write). > > > > > > The events are also broken out by memcg path. For container-based > > > workloads, users often think of several processes in a memcg as a single > > > logical "task", so collecting statistics at this level is useful. > > > > > > These events *do not* include latency bucket information, which means > > > for a proper latency histogram users will need to use BPF instead of > > > event histograms. The benefit we get from this is simpler code. > > > > > > This patch is a no-op if the Kconfig option is not enabled. If it is, > > > tracepoints are still disabled by default (configurable at runtime); > > > the only fixed cost here is un-inlining a few functions. As best as > > > I've been able to measure, the overhead this introduces is a small > > > fraction of 1%. Actually hooking up the tracepoints to BPF introduces > > > additional overhead, depending on exactly what the BPF program is > > > collecting. > > > > Are there any methods to avoid un-inlining these wrappers ? 
> > > > For example, > > // include/linux/mmap_lock.h > > > > void mmap_lock_start_trace_wrapper(); > > void mmap_lock_acquire_trace_wrapper(); > > > > static inline void mmap_write_lock(struct mm_struct *mm) > > { > > mmap_lock_start_trace_wrapper(); > > down_write(>mmap_lock); > > mmap_lock_acquire_trace_wrapper(); > > } > > > > // mm/mmap_lock.c > > void mmap_lock_start_trace_wrapper() > > { > > trace_mmap_lock_start(); > > } > > > > void mmap_lock_start_trace_wrapper() > > { > > trace_mmap_lock_acquired(); > > } > > We can do something like that, but I don't think it would end up being better. > > At the end of the day, because the trace stuff cannot be in the > header, we have to add an extra function call one way or the other. > This would just move the call one step further down the call stack. > So, I don't think it would affect performance in the > CONFIG_MMAP_LOCK_STATS + tracepoints not enabled at runtime case. > Right, it seems we have to add an extra function call. > Also the wrappers aren't quite so simple as this, they need some > parameters to work. (the struct mm_struct, whether it was a read or a > write lock, and whether or not the lock operation succeeded), so it > would mean adding more inlined code, which I think adds up to be a > nontrivial amount since these wrappers are called so often in the > kernel. > > If you feel strongly, let me know and I can send a version as you > describe and we can compare the two. > These tracepoints will be less useful if we have to turn on the config to enable it. I don't mind implementing it that way if we can't optimize it. Maybe Steven can give some suggestions, Steven ? 
> > > > > > > > > --- > > > include/linux/mmap_lock.h| 28 +++- > > > include/trace/events/mmap_lock.h | 73 ++ > > > mm/Kconfig | 17 +++ > > > mm/Makefile | 1 + > > > mm/mmap_lock.c | 224 +++ > > > 5 files changed, 342 insertions(+), 1 deletion(-) > > > create mode 100644 include/trace/events/mmap_lock.h > > > create mode 100644 mm/mmap_lock.c > > > > > > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > > > index 0707671851a8..d12aa2ff6c05 100644 > > > --- a/include/linux/mmap_lock.h > > > +++ b/include/linux/mmap_lock.h > > > @@ -1,11 +1,35 @@ > > > #ifndef _LINUX_MMAP_LOCK_H > > > #define _LINUX_MMAP_LOCK_H > > > > > > +#include > > > +#include > > > #include > > > +#include > > > +#include > > > > > > #define MMAP_LOCK_INITIALIZER(name) \ > > >
Re: [PATCH] mm/memcontrol: Add the drop_cache interface for cgroup v2
On Mon, Sep 21, 2020 at 7:05 PM Michal Hocko wrote: > > On Mon 21-09-20 18:55:40, Yafang Shao wrote: > > On Mon, Sep 21, 2020 at 4:12 PM Michal Hocko wrote: > > > > > > On Mon 21-09-20 16:02:55, zangchun...@bytedance.com wrote: > > > > From: Chunxin Zang > > > > > > > > In the cgroup v1, we have 'force_mepty' interface. This is very > > > > useful for userspace to actively release memory. But the cgroup > > > > v2 does not. > > > > > > > > This patch reuse cgroup v1's function, but have a new name for > > > > the interface. Because I think 'drop_cache' may be is easier to > > > > understand :) > > > > > > This should really explain a usecase. Global drop_caches is a terrible > > > interface and it has caused many problems in the past. People have > > > learned to use it as a remedy to any problem they might see and cause > > > other problems without realizing that. This is the reason why we even > > > log each attempt to drop caches. > > > > > > I would rather not repeat the same mistake on the memcg level unless > > > there is a very strong reason for it. > > > > > > > I think we'd better add these comments above the function > > mem_cgroup_force_empty() to explain why we don't want to expose this > > interface in cgroup2, otherwise people will continue to send this > > proposal without any strong reason. > > I do not mind people sending this proposal. "V1 used to have an > interface, we need it in v2 as well" is not really viable without > providing more reasoning on the specific usecase. > > _Any_ patch should have a proper justification. This is nothing really > new to the process and I am wondering why this is coming as a surprise. > Container users always want to drop cache in a specific container, because they used to use drop_caches to fix memory pressure issues. Although drop_caches can cause some unexpected issues, it could also fix some issues. So container users want to use it in containers as well. 
If this feature is not implemented in cgroup, they will ask you why but there is no explanation in the kernel. Regarding the memory.high, it is not perfect as well, because you have to set it to 0 to drop_caches, and the processes in the containers have to reclaim pages as well because they reach the memory.high, but memory.force_empty won't make other processes to reclaim. That doesn't mean I agree to add this interface, while I really mean that if we discard one feature we'd better explain why. -- Thanks Yafang
Re: [PATCH] mm/memcontrol: Add the drop_cache interface for cgroup v2
On Mon, Sep 21, 2020 at 4:12 PM Michal Hocko wrote: > > On Mon 21-09-20 16:02:55, zangchun...@bytedance.com wrote: > > From: Chunxin Zang > > > > In the cgroup v1, we have 'force_mepty' interface. This is very > > useful for userspace to actively release memory. But the cgroup > > v2 does not. > > > > This patch reuse cgroup v1's function, but have a new name for > > the interface. Because I think 'drop_cache' may be is easier to > > understand :) > > This should really explain a usecase. Global drop_caches is a terrible > interface and it has caused many problems in the past. People have > learned to use it as a remedy to any problem they might see and cause > other problems without realizing that. This is the reason why we even > log each attempt to drop caches. > > I would rather not repeat the same mistake on the memcg level unless > there is a very strong reason for it. > I think we'd better add these comments above the function mem_cgroup_force_empty() to explain why we don't want to expose this interface in cgroup2, otherwise people will continue to send this proposal without any strong reason. > > Signed-off-by: Chunxin Zang > > --- > > Documentation/admin-guide/cgroup-v2.rst | 11 +++ > > mm/memcontrol.c | 5 + > > 2 files changed, 16 insertions(+) > > > > diff --git a/Documentation/admin-guide/cgroup-v2.rst > > b/Documentation/admin-guide/cgroup-v2.rst > > index ce3e05e41724..fbff959c8116 100644 > > --- a/Documentation/admin-guide/cgroup-v2.rst > > +++ b/Documentation/admin-guide/cgroup-v2.rst > > @@ -1181,6 +1181,17 @@ PAGE_SIZE multiple when read back. > > high limit is used and monitored properly, this limit's > > utility is limited to providing the final safety net. > > > > + memory.drop_cache > > +A write-only single value file which exists on non-root > > +cgroups. > > + > > +Provide a mechanism for users to actively trigger memory > > +reclaim. The cgroup will be reclaimed and as many pages > > +reclaimed as possible. 
> > + > > +It will break the low boundary, because it tries to reclaim the > > +memory many times, until the memory drops to a certain level. > > + > >memory.oom.group > > A read-write single value file which exists on non-root > > cgroups. The default value is "0". > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > > index 0b38b6ad547d..98646484efff 100644 > > --- a/mm/memcontrol.c > > +++ b/mm/memcontrol.c > > @@ -6226,6 +6226,11 @@ static struct cftype memory_files[] = { > > .write = memory_max_write, > > }, > > { > > + .name = "drop_cache", > > + .flags = CFTYPE_NOT_ON_ROOT, > > + .write = mem_cgroup_force_empty_write, > > + }, > > + { > > .name = "events", > > .flags = CFTYPE_NOT_ON_ROOT, > > .file_offset = offsetof(struct mem_cgroup, events_file), > > -- > > 2.11.0 > > -- > Michal Hocko > SUSE Labs > -- Thanks Yafang
Re: [PATCH] mmap_lock: add tracepoints around lock acquisition
On Fri, Sep 18, 2020 at 2:13 AM Axel Rasmussen wrote: > > The goal of these tracepoints is to be able to debug lock contention > issues. This lock is acquired on most (all?) mmap / munmap / page fault > operations, so a multi-threaded process which does a lot of these can > experience significant contention. > > We trace just before we start acquisition, when the acquisition returns > (whether it succeeded or not), and when the lock is released (or > downgraded). The events are broken out by lock type (read / write). > > The events are also broken out by memcg path. For container-based > workloads, users often think of several processes in a memcg as a single > logical "task", so collecting statistics at this level is useful. > > These events *do not* include latency bucket information, which means > for a proper latency histogram users will need to use BPF instead of > event histograms. The benefit we get from this is simpler code. > > This patch is a no-op if the Kconfig option is not enabled. If it is, > tracepoints are still disabled by default (configurable at runtime); > the only fixed cost here is un-inlining a few functions. As best as > I've been able to measure, the overhead this introduces is a small > fraction of 1%. Actually hooking up the tracepoints to BPF introduces > additional overhead, depending on exactly what the BPF program is > collecting. Are there any methods to avoid un-inlining these wrappers ? 
For example, // include/linux/mmap_lock.h void mmap_lock_start_trace_wrapper(); void mmap_lock_acquire_trace_wrapper(); static inline void mmap_write_lock(struct mm_struct *mm) { mmap_lock_start_trace_wrapper(); down_write(&mm->mmap_lock); mmap_lock_acquire_trace_wrapper(); } // mm/mmap_lock.c void mmap_lock_start_trace_wrapper() { trace_mmap_lock_start(); } void mmap_lock_acquire_trace_wrapper() { trace_mmap_lock_acquired(); } > --- > include/linux/mmap_lock.h| 28 +++- > include/trace/events/mmap_lock.h | 73 ++ > mm/Kconfig | 17 +++ > mm/Makefile | 1 + > mm/mmap_lock.c | 224 +++ > 5 files changed, 342 insertions(+), 1 deletion(-) > create mode 100644 include/trace/events/mmap_lock.h > create mode 100644 mm/mmap_lock.c > > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > index 0707671851a8..d12aa2ff6c05 100644 > --- a/include/linux/mmap_lock.h > +++ b/include/linux/mmap_lock.h > @@ -1,11 +1,35 @@ > #ifndef _LINUX_MMAP_LOCK_H > #define _LINUX_MMAP_LOCK_H > > +#include > +#include > #include > +#include > +#include > > #define MMAP_LOCK_INITIALIZER(name) \ > .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), > > +#ifdef CONFIG_MMAP_LOCK_STATS > + > +void mmap_init_lock(struct mm_struct *mm); > +void mmap_write_lock(struct mm_struct *mm); > +void mmap_write_lock_nested(struct mm_struct *mm, int subclass); > +int mmap_write_lock_killable(struct mm_struct *mm); > +bool mmap_write_trylock(struct mm_struct *mm); > +void mmap_write_unlock(struct mm_struct *mm); > +void mmap_write_downgrade(struct mm_struct *mm); > +void mmap_read_lock(struct mm_struct *mm); > +int mmap_read_lock_killable(struct mm_struct *mm); > +bool mmap_read_trylock(struct mm_struct *mm); > +void mmap_read_unlock(struct mm_struct *mm); > +bool mmap_read_trylock_non_owner(struct mm_struct *mm); > +void mmap_read_unlock_non_owner(struct mm_struct *mm); > +void mmap_assert_locked(struct mm_struct *mm); > +void mmap_assert_write_locked(struct mm_struct *mm); > + > +#else /* 
!CONFIG_MMAP_LOCK_STATS */ > + > static inline void mmap_init_lock(struct mm_struct *mm) > { > init_rwsem(>mmap_lock); > @@ -63,7 +87,7 @@ static inline void mmap_read_unlock(struct mm_struct *mm) > > static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) > { > - if (down_read_trylock(>mmap_lock)) { > + if (mmap_read_trylock(mm)) { > rwsem_release(>mmap_lock.dep_map, _RET_IP_); > return true; > } > @@ -87,4 +111,6 @@ static inline void mmap_assert_write_locked(struct > mm_struct *mm) > VM_BUG_ON_MM(!rwsem_is_locked(>mmap_lock), mm); > } > > +#endif /* CONFIG_MMAP_LOCK_STATS */ > + > #endif /* _LINUX_MMAP_LOCK_H */ > diff --git a/include/trace/events/mmap_lock.h > b/include/trace/events/mmap_lock.h > new file mode 100644 > index ..549c662e6ed8 > --- /dev/null > +++ b/include/trace/events/mmap_lock.h > @@ -0,0 +1,73 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#undef TRACE_SYSTEM > +#define TRACE_SYSTEM mmap_lock > + > +#if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) > +#define _TRACE_MMAP_LOCK_H > + > +#include > +#include > + > +struct mm_struct; > + > +DECLARE_EVENT_CLASS( > + mmap_lock_template, > + > + TP_PROTO(struct mm_struct *mm, const char *memcg_path, u64 duration, > + bool write, bool success), > + > + TP_ARGS(mm,