[PATCH] delayacct: clear right task's flag after blkio completes
When I was implementing a latency analysis tool by using task->delays and other things, I found there's an issue in delayacct. The issue is it should clear the target's flag instead of current's in delayacct_blkio_end(). When I git blame delayacct, I found there're some similar issues we have fixed in delayacct_blkio_end(). 'Commit c96f5471ce7d ("delayacct: Account blkio completion on the correct task")' fixed the issue that it should account blkio completion on the target task instead of current. 'Commit b512719f771a ("delayacct: fix crash in delayacct_blkio_end() after delayacct init failure")' fixed the issue that it should check target task's delays instead of current task's. It seems that delayacct_blkio_{begin, end} are error prone. So I introduce a new parameter - the target task 'p' into these helpers, after that change, the callsite will specifically set the right task, which should make it less error prone. Signed-off-by: Yafang Shao Cc: Tejun Heo Cc: Josh Snyder --- include/linux/delayacct.h | 20 ++-- mm/memory.c | 8 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 2d3bdcccf5eb..21651f946751 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -82,16 +82,16 @@ static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) return 0; } -static inline void delayacct_set_flag(int flag) +static inline void delayacct_set_flag(struct task_struct *p, int flag) { - if (current->delays) - current->delays->flags |= flag; + if (p->delays) + p->delays->flags |= flag; } -static inline void delayacct_clear_flag(int flag) +static inline void delayacct_clear_flag(struct task_struct *p, int flag) { - if (current->delays) - current->delays->flags &= ~flag; + if (p->delays) + p->delays->flags &= ~flag; } static inline void delayacct_tsk_init(struct task_struct *tsk) @@ -114,7 +114,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) static inline void 
delayacct_blkio_start(void) { - delayacct_set_flag(DELAYACCT_PF_BLKIO); + delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } @@ -123,7 +123,7 @@ static inline void delayacct_blkio_end(struct task_struct *p) { if (p->delays) __delayacct_blkio_end(p); - delayacct_clear_flag(DELAYACCT_PF_BLKIO); + delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } static inline int delayacct_add_tsk(struct taskstats *d, @@ -166,9 +166,9 @@ static inline void delayacct_thrashing_end(void) } #else -static inline void delayacct_set_flag(int flag) +static inline void delayacct_set_flag(struct task_struct *p, int flag) {} -static inline void delayacct_clear_flag(int flag) +static inline void delayacct_clear_flag(struct task_struct *p, int flag) {} static inline void delayacct_init(void) {} diff --git a/mm/memory.c b/mm/memory.c index 550405fc3b5e..a013d32a6611 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3296,7 +3296,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } - delayacct_set_flag(DELAYACCT_PF_SWAPIN); + delayacct_set_flag(current, DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; @@ -3347,7 +3347,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->address, >ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto unlock; } @@ -3361,13 +3361,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; -- 2.18.2
Re: [PATCH v2 0/6] sched: support schedstats for RT sched class
On Sat, Mar 27, 2021 at 6:13 PM Yafang Shao wrote: > > We want to measure the latency of RT tasks in our production > environment with schedstats facility, but currently schedstats is only > supported for fair sched class. In order to support if for other sched > classes, we should make it independent of fair sched class. The struct > sched_statistics is the schedular statistics of a task_struct or a > task_group, both of which are independent of sched class. So we can move > struct sched_statistics into struct task_struct and struct task_group to > achieve the goal. > > After the patchset, schestats are orgnized as follows, > struct task_struct { > ... > struct sched_statistics statistics; > ... > struct sched_entity *se; > struct sched_rt_entity *rt; > ... > }; > > struct task_group {|---> stats[0] : of CPU0 > ...| > struct sched_statistics **stats; --|---> stats[1] : of CPU1 > ...| >|---> stats[n] : of CPUn > #ifdef CONFIG_FAIR_GROUP_SCHED > struct sched_entity **se; > #endif > #ifdef CONFIG_RT_GROUP_SCHED > struct sched_rt_entity **rt_se; > #endif > ... > }; > > The sched_statistics members may be frequently modified when schedstats is > enabled, in order to avoid impacting on random data which may in the same > cacheline with them, the struct sched_statistics is defined as cacheline > aligned. > > Then we can use schedstats to trace RT tasks as well, for example, > Interface File > task schedstats : /proc/[pid]/sched > group schedstats: /proc/sched_debug > tracepoints : sched:sched_stat_{runtime, wait, sleep, iowait, blocked} > > As PATCH #2 and #3 changes the core struct in the scheduler, so I did > 'perf bench sched pipe' to measure the sched performance before and after > the change, suggested by Mel. Below is the data, which are all in > usecs/op. > Before After > kernel.sched_schedstats=0 6.0~6.16.0~6.1 > kernel.sched_schedstats=1 6.2~6.46.2~6.4 > No obvious difference after the change. 
> > Changes since v1: > - Fix the build failure reported by kernel test robot. > - Add the performance data with 'perf bench sched pipe', suggested by > Mel. > - Make the struct sched_statistics cacheline aligned. > - Introduce task block time in schedstats > > Changes since RFC: > - improvement of schedstats helpers, per Mel. > - make struct schedstats independent of fair sched class > > Yafang Shao (6): > sched, fair: use __schedstat_set() in set_next_entity() > sched: make struct sched_statistics independent of fair sched class > sched: make schedstats helpers independent of fair sched class > sched: introduce task block time in schedstats > sched, rt: support sched_stat_runtime tracepoint for RT sched class > sched, rt: support schedstats for RT sched class > > include/linux/sched.h| 7 +- > kernel/sched/core.c | 24 +++-- > kernel/sched/deadline.c | 4 +- > kernel/sched/debug.c | 90 + > kernel/sched/fair.c | 210 --- > kernel/sched/rt.c| 143 +- > kernel/sched/sched.h | 3 + > kernel/sched/stats.c | 104 +++ > kernel/sched/stats.h | 89 + > kernel/sched/stop_task.c | 4 +- > 10 files changed, 489 insertions(+), 189 deletions(-) > > -- > 2.18.2 > Peter, Ingo, Mel, Any comments on this version ? -- Thanks Yafang
[PATCH v2 6/6] sched, rt: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. This patch enable it for RT sched class as well. After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for RT sched class. The schedstat usage in RT sched class is similar with fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end The user can get the schedstats information in the same way in fair sched class. For example, Interface File task schedstats : /proc/[pid]/sched group schedstats: /proc/sched_debug The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can be used to trace RT tasks as well. The output of these tracepoints for a RT tasks as follows, - blocked & iowait kworker/48:1-442 [048] d... 539.830872: sched_stat_iowait: comm=stress pid=10461 delay=158242 [ns] kworker/48:1-442 [048] d... 539.830872: sched_stat_blocked: comm=stress pid=10461 delay=158242 [ns] - wait stress-10460 [001] dN.. 813.965304: sched_stat_wait: comm=stress pid=10462 delay=7536 [ns] stress-10462 [001] d.h. 813.966300: sched_stat_runtime: comm=stress pid=10462 runtime=993812 [ns] vruntime=0 [ns] [...] stress-10462 [001] d.h. 814.065300: sched_stat_runtime: comm=stress pid=10462 runtime=994484 [ns] vruntime=0 [ns] [ totally 100 times of sched_stat_runtime for pid 10462] [ The delay of pid 10462 is the sum of above runtime ] stress-10462 [001] dN.. 814.065307: sched_stat_wait: comm=stress pid=10460 delay=11089 [ns] stress-10460 [001] d.h. 814.066299: sched_stat_runtime: comm=stress pid=10460 runtime=991934 [ns] vruntime=0 [ns] - sleep sleep-15582 [041] dN.. 
1732.814348: sched_stat_sleep: comm=sleep.sh pid=15474 delay=1001223130 [ns] sleep-15584 [041] dN.. 1733.815908: sched_stat_sleep: comm=sleep.sh pid=15474 delay=1001238954 [ns] [ In sleep.sh, it sleeps 1 sec each time. ] [l...@intel.com: reported build failure in prev version] Signed-off-by: Yafang Shao Cc: kernel test robot --- kernel/sched/rt.c | 137 ++ 1 file changed, 137 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ae5282484710..e5f706ffcdbc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1273,6 +1273,125 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr rt_se->on_list = 0; } +#ifdef CONFIG_RT_GROUP_SCHED +static inline void +__schedstats_from_sched_rt_entity(struct sched_rt_entity *rt_se, + struct sched_statistics **stats) +{ + struct task_struct *p; + struct task_group *tg; + struct rt_rq *rt_rq; + int cpu; + + if (rt_entity_is_task(rt_se)) { + p = rt_task_of(rt_se); + *stats = >stats; + } else { + rt_rq = group_rt_rq(rt_se); + tg = rt_rq->tg; + cpu = cpu_of(rq_of_rt_rq(rt_rq)); + *stats = tg->stats[cpu]; + } +} + +#else + +static inline void +__schedstats_from_sched_rt_entity(struct sched_rt_entity *rt_se, + struct sched_statistics **stats) +{ + struct task_struct *p; + + p = rt_task_of(rt_se); + *stats = >stats; +} +#endif + +static inline void +update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats = NULL; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + __schedstats_from_sched_rt_entity(rt_se, ); + + __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats = NULL; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + 
__schedstats_from_sched_rt_entity(rt_se, ); + + __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (fla
[PATCH v2 5/6] sched, rt: support sched_stat_runtime tracepoint for RT sched class
The runtime of a RT task has already been there, so we only need to add a tracepoint. One difference between fair task and RT task is that there is no vruntime in RT task. To reuse the sched_stat_runtime tracepoint, '0' is passed as vruntime for RT task. The output of this tracepoint for RT task as follows, stress-9748[039] d.h. 113.519352: sched_stat_runtime: comm=stress pid=9748 runtime=997573 [ns] vruntime=0 [ns] stress-9748[039] d.h. 113.520352: sched_stat_runtime: comm=stress pid=9748 runtime=997627 [ns] vruntime=0 [ns] stress-9748[039] d.h. 113.521352: sched_stat_runtime: comm=stress pid=9748 runtime=998203 [ns] vruntime=0 [ns] Signed-off-by: Yafang Shao --- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 34ad07fb924e..ae5282484710 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1012,6 +1012,8 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->stats.exec_max, max(curr->stats.exec_max, delta_exec)); + trace_sched_stat_runtime(curr, delta_exec, 0); + curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- 2.18.2
[PATCH v2 4/6] sched: introduce task block time in schedstats
Currently in schedstats we have sum_sleep_runtime and iowait_sum, but there's no metric to show how long the task is in D state. Once a task in D state, it means the task is blocked in the kernel, for example the task may be waiting for a mutex. The D state is more frequent than iowait, and it is more critital than S state. So it is worth to add a metric to measure it. Signed-off-by: Yafang Shao --- include/linux/sched.h | 2 ++ kernel/sched/debug.c | 6 -- kernel/sched/stats.c | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b687bb38897b..2b885481b8bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -428,6 +428,8 @@ struct sched_statistics { u64 block_start; u64 block_max; + s64 sum_block_runtime; + u64 exec_max; u64 slice_max; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d1bc616936d9..0995412dd3c0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -499,10 +499,11 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); @@ -941,6 +942,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, u64 avg_atom, avg_per_cpu; PN_SCHEDSTAT(stats.sum_sleep_runtime); + PN_SCHEDSTAT(stats.sum_block_runtime); PN_SCHEDSTAT(stats.wait_start); PN_SCHEDSTAT(stats.sleep_start); PN_SCHEDSTAT(stats.block_start); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index b2542f4d3192..21fae41c06f5 100644 --- 
a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -82,6 +82,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, __schedstat_set(stats->block_start, 0); __schedstat_add(stats->sum_sleep_runtime, delta); + __schedstat_add(stats->sum_block_runtime, delta); if (p) { if (p->in_iowait) { -- 2.18.2
[PATCH v2 3/6] sched: make schedstats helpers independent of fair sched class
The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing ovehead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) The size of vmlinux as follows, Before After Size of vmlinux 826308552 826304640 The size is a litte smaller as some functions are not inlined again after the change. I also compared the sched performance with 'perf bench sched pipe', suggested by Mel. The result as follows, Before After kernel.sched_schedstats=0 6.0~6.1 6.0~6.1 kernel.sched_schedstats=1 6.2~6.4 6.2~6.4 No obvious difference after the change. No functional change. 
[l...@intel.com: reported build failure in prev version] Signed-off-by: Yafang Shao Acked-by: Mel Gorman Cc: kernel test robot --- kernel/sched/fair.c | 133 +++ kernel/sched/stats.c | 103 + kernel/sched/stats.h | 34 +++ 3 files changed, 156 insertions(+), 114 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5f72fef1cc0a..77a615b6310d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -904,32 +904,28 @@ static void update_curr_fair(struct rq *rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; - u64 wait_start, prev_wait_start; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; __schedstats_from_sched_entity(se, ); - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(stats->wait_start); + if (entity_is_task(se)) + p = task_of(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + __update_stats_wait_start(rq_of(cfs_rq), p, stats); - __schedstat_set(stats->wait_start, wait_start); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 delta; if (!schedstat_enabled()) return; @@ -945,105 +941,34 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(!schedstat_val(stats->wait_start))) return; - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start); - - if (entity_is_task(se)) { + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. 
-*/ - __schedstat_set(stats->wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - __schedstat_set(stats->wait_max, - max(schedstat_val(stats->wait_max), delta)); - __schedstat_inc(stats->wait_count); - __schedstat_add(stats->wait_sum, delta); - __schedstat_set(stats->wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *tsk = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; __schedstats_from_sched_entity(se, ); - sleep_start = schedstat_val(stats->sleep_start); - block_start = schedstat_val(stats->block_start); - i
[PATCH v2 2/6] sched: make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we should make it independent of fair sched class. The struct sched_statistics is the schedular statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. After the patch, schestats are orgnized as follows, struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; The sched_statistics members may be frequently modified when schedstats is enabled, in order to avoid impacting on random data which may in the same cacheline with them, the struct sched_statistics is defined as cacheline aligned. As this patch changes the core struct of scheduler, so I verified the performance it may impact on the scheduler with 'perf bench sched pipe', suggested by Mel. Below is the result, in which all the values are in usecs/op. Before After kernel.sched_schedstats=0 6.0~6.16.0~6.1 kernel.sched_schedstats=1 6.2~6.46.2~6.4 No obvious impact on the sched performance. No functional change. 
[l...@intel.com: reported build failure in prev version] Signed-off-by: Yafang Shao Acked-by: Mel Gorman Cc: kernel test robot --- include/linux/sched.h| 5 +- kernel/sched/core.c | 24 kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 86 ++-- kernel/sched/fair.c | 121 --- kernel/sched/rt.c| 4 +- kernel/sched/sched.h | 3 + kernel/sched/stats.h | 55 ++ kernel/sched/stop_task.c | 4 +- 9 files changed, 210 insertions(+), 96 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 05572e2140ad..b687bb38897b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -447,7 +447,7 @@ struct sched_statistics { u64 nr_wakeups_passive; u64 nr_wakeups_idle; #endif -}; +} cacheline_aligned; struct sched_entity { /* For load-balancing: */ @@ -463,8 +463,6 @@ struct sched_entity { u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -697,6 +695,7 @@ struct task_struct { unsigned intrt_priority; const struct sched_class*sched_class; + struct sched_statistics stats; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3384ea74cad4..d55681b4f9a4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2913,11 +2913,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->stats.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -2929,14 +2929,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + 
__schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->stats.nr_wakeups_sync); } /* @@ -3572,7 +3572,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be garbage */ - memset(>se.statistics, 0, sizeof(p->se.statistics)); + memset(>stats, 0, sizeof(p->st
[PATCH v2 0/6] sched: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. In order to support if for other sched classes, we should make it independent of fair sched class. The struct sched_statistics is the schedular statistics of a task_struct or a task_group, both of which are independent of sched class. So we can move struct sched_statistics into struct task_struct and struct task_group to achieve the goal. After the patchset, schestats are orgnized as follows, struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; The sched_statistics members may be frequently modified when schedstats is enabled, in order to avoid impacting on random data which may in the same cacheline with them, the struct sched_statistics is defined as cacheline aligned. Then we can use schedstats to trace RT tasks as well, for example, Interface File task schedstats : /proc/[pid]/sched group schedstats: /proc/sched_debug tracepoints : sched:sched_stat_{runtime, wait, sleep, iowait, blocked} As PATCH #2 and #3 changes the core struct in the scheduler, so I did 'perf bench sched pipe' to measure the sched performance before and after the change, suggested by Mel. Below is the data, which are all in usecs/op. Before After kernel.sched_schedstats=0 6.0~6.16.0~6.1 kernel.sched_schedstats=1 6.2~6.46.2~6.4 No obvious difference after the change. Changes since v1: - Fix the build failure reported by kernel test robot. - Add the performance data with 'perf bench sched pipe', suggested by Mel. - Make the struct sched_statistics cacheline aligned. 
- Introduce task block time in schedstats Changes since RFC: - improvement of schedstats helpers, per Mel. - make struct schedstats independent of fair sched class Yafang Shao (6): sched, fair: use __schedstat_set() in set_next_entity() sched: make struct sched_statistics independent of fair sched class sched: make schedstats helpers independent of fair sched class sched: introduce task block time in schedstats sched, rt: support sched_stat_runtime tracepoint for RT sched class sched, rt: support schedstats for RT sched class include/linux/sched.h| 7 +- kernel/sched/core.c | 24 +++-- kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 90 + kernel/sched/fair.c | 210 --- kernel/sched/rt.c| 143 +- kernel/sched/sched.h | 3 + kernel/sched/stats.c | 104 +++ kernel/sched/stats.h | 89 + kernel/sched/stop_task.c | 4 +- 10 files changed, 489 insertions(+), 189 deletions(-) -- 2.18.2
[PATCH v2 1/6] sched, fair: use __schedstat_set() in set_next_entity()
schedstat_enabled() has already been checked, so we can use __schedstat_set() directly. Signed-off-by: Yafang Shao Acked-by: Mel Gorman --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aaa0dfa29d53..114eec730698 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4429,7 +4429,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, + __schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -- 2.18.2
Re: [PATCH v6 resend 0/3] mm, vsprintf: dump full information of page flags in pGp
On Fri, Mar 19, 2021 at 6:13 PM Yafang Shao wrote: > > The existed pGp shows the names of page flags only, rather than the full > information including section, node, zone, last cpuipid and kasan tag. > While it is not easy to parse these information manually because there > are so many flavors. We'd better interpret them in printf. > > To be compitable with the existed format of pGp, the new introduced ones > also use '|' as the separator, then the user tools parsing pGp won't > need to make change, suggested by Matthew. The new added information is > tracked onto the end of the existed one, e.g. > [ 8838.835456] Slab 0x2828b78a objects=33 used=3 > fp=0xd04efc88 > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > The documentation and test cases are also updated. The result of the > test cases as follows, > [68599.816764] test_printf: loaded. > [68599.819068] test_printf: all 388 tests passed > [68599.830367] test_printf: unloaded. > > This patchset also includes some code cleanup in mm/slub.c. 
> > v6: > - fixes the build failure and test failure reported by kernel test robot > > v5: > - remove the bitmap and better name the struct, per Petr > > v4: > - extend %pGp instead of introducing new format, per Matthew > > v3: > - coding improvement, per Joe and Andy > - the possible impact on debugfs and the fix of it, per Joe and Matthew > - introduce new format instead of changing pGp, per Andy > > v2: > - various coding improvement, per Joe, Miaohe, Vlastimil and Andy > - remove the prefix completely in patch #2, per Vlastimil > - Update the test cases, per Andy > > Yafang Shao (3): > mm, slub: use pGp to print page flags > mm, slub: don't combine pr_err with INFO > vsprintf: dump full information of page flags in pGp > > Documentation/core-api/printk-formats.rst | 2 +- > lib/test_printf.c | 90 --- > lib/vsprintf.c| 66 +++-- > mm/slub.c | 13 ++-- > 4 files changed, 149 insertions(+), 22 deletions(-) > > -- > 2.18.2 > Hi Petr, Any comments on this version ? Thanks Yafang
[PATCH v6 resend 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Signed-off-by: Yafang Shao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko Reviewed-by: David Hildenbrand Acked-by: David Rientjes Cc: Matthew Wilcox --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ed3f728c1367..7ed388077633 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -624,7 +624,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -650,7 +650,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -707,7 +707,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -800,7 +800,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3899,7 +3899,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.18.2
[PATCH v6 resend 0/3] mm, vsprintf: dump full information of page flags in pGp
The existed pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors. We'd better interpret them in printf. To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The new added information is tracked onto the end of the existed one, e.g. [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The result of the test cases as follows, [68599.816764] test_printf: loaded. [68599.819068] test_printf: all 388 tests passed [68599.830367] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. v6: - fixes the build failure and test failure reported by kernel test robot v5: - remove the bitmap and better name the struct, per Petr v4: - extend %pGp instead of introducing new format, per Matthew v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 90 --- lib/vsprintf.c| 66 +++-- mm/slub.c | 13 ++-- 4 files changed, 149 insertions(+), 22 deletions(-) -- 2.18.2
[PATCH v6 resend 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, let's interpret them in pGp as well. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The new information is appended to the end of the existing one. One example of the output in mm/slub.c is as follows, - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 8448.272530] Slab 0x90797883 objects=33 used=3 fp=0x790f1c26 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The output of the test cases is as follows, [68599.816764] test_printf: loaded. [68599.819068] test_printf: all 388 tests passed [68599.830367] test_printf: unloaded. 
[l...@intel.com: reported issues in the prev version in test_printf.c] Signed-off-by: Yafang Shao Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Petr Mladek Cc: kernel test robot --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 90 --- lib/vsprintf.c| 66 +++-- 3 files changed, 142 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 160e710d992f..00d07c7eefd4 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -540,7 +540,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGp referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 95a2f82427c7..27b964ec723d 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -577,24 +577,98 @@ netdev_features(void) { } +struct page_flags_test { + int width; + int shift; + int mask; + unsigned long value; + const char *fmt; + const char *name; +}; + +static struct page_flags_test pft[] = { + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, +0, "%d", "section"}, + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, +0, "%d", "node"}, + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, +0, "%d", "zone"}, + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, +0, "%#x", "lastcpupid"}, + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, +0, "%#x", "kasantag"}, +}; + +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long values[] = {section, node, zone, last_cpupid, kasan_tag}; + unsigned long page_flags = 0; + unsigned long size = 0; + bool append = false; + int i; + + flags &= BIT(NR_PAGEFLAGS) - 1; + if 
(flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s", name); + size = strlen(cmp_buf); +#if SECTIONS_WIDTH || NODES_WIDTH || ZONES_WIDTH || \ + LAST_CPUPID_WIDTH || KASAN_TAG_WIDTH + /* Other information also included in page flags */ + snprintf(cmp_buf + size, BUF_SIZE - size, "|"); + size = strlen(cmp_buf); +#endif + } + + /* Set the test value */ + for (i = 0; i < ARRAY_SIZE(pft); i++) + pft[i].value = values[i]; + + for (i = 0; i < ARRAY_SIZE(pft); i++) { + if (!pft[i].width) + continue; + + if (append) { + snprintf(cmp_buf + size, BUF_SIZE - size, "|"); + size = strlen(cmp_buf); + } + + page_flags |= (pft[i].value & pft[i].mask) << pft[i].shift; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s=", pft[i].name); + size = strlen(cmp_buf); + snprintf(cmp_buf + size, BUF_SIZE - size, pft[i].fmt, +pft[i].value & pft[i].mask); + size = strlen(cmp_buf); + append = true; + } + + test(cmp_buf, "%pGp", _flags); +} + static void __init flags(void) { unsigned long flags; - gfp_t gfp; char *cmp
[PATCH v6 resend 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3021ce9bf1b3..ed3f728c1367 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -650,8 +650,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.18.2
Re: linux-next: Tree for Mar 10 (lib/test_printf.c)
On Wed, Mar 10, 2021 at 5:52 PM Petr Mladek wrote: > > On Tue 2021-03-09 21:57:48, Randy Dunlap wrote: > > On 3/9/21 8:02 PM, Stephen Rothwell wrote: > > > Hi all, > > > > > > > on i386 (at least): > > > > ../lib/test_printf.c: In function 'page_flags_test': > > ../lib/test_printf.c:595:17: error: 'sec' undeclared (first use in this > > function); did you mean 'sem'? > > page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; > > ^~~ > > > > > > Should that be 'section'? > > Yup, it looks like. > > There seems to be one more problem found by the test robot: > >lib/test_printf.c:595:17: note: each undeclared identifier is reported > only once for each function it appears in > >> lib/test_printf.c:612:17: error: 'tag' undeclared (first use in this > >> function) > 612 | page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; > > > Yafang is going to send a fix. I have temporary removed the > problematic patch from printk/linux.git, for-next branch. > > I am sorry for the troubles. Anyway, it is great that linux-next > and all the test robots are around. > Hi Petr, I sent a new version v6[1] to fix the test failure and build failure report by kernel test robot. Sorry for the late fix as I'm very busy recently. [1]. https://lore.kernel.org/linux-mm/20210314083717.96380-1-laoar.s...@gmail.com/T/#t -- Thanks Yafang
[PATCH v6 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Signed-off-by: Yafang Shao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko Reviewed-by: David Hildenbrand Cc: Matthew Wilcox --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ed3f728c1367..7ed388077633 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -624,7 +624,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -650,7 +650,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -707,7 +707,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -800,7 +800,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3899,7 +3899,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.18.2
[PATCH v6 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3021ce9bf1b3..ed3f728c1367 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -650,8 +650,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.18.2
[PATCH v6 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, let's interpret them in pGp as well. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The new information is appended to the end of the existing one. One example of the output in mm/slub.c is as follows, - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 8448.272530] Slab 0x90797883 objects=33 used=3 fp=0x790f1c26 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The output of the test cases is as follows, [68599.816764] test_printf: loaded. [68599.819068] test_printf: all 388 tests passed [68599.830367] test_printf: unloaded. 
[l...@intel.com: reported issues in the prev version in test_printf.c] Signed-off-by: Yafang Shao Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Petr Mladek Cc: kernel test robot --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 90 --- lib/vsprintf.c| 66 +++-- 3 files changed, 142 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 160e710d992f..00d07c7eefd4 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -540,7 +540,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGp referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 95a2f82427c7..f87e433d6fa9 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -577,24 +577,98 @@ netdev_features(void) { } +struct page_flags_test { + int width; + int shift; + int mask; + unsigned long value; + const char *fmt; + const char *name; +}; + +static struct page_flags_test pft[] = { + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, +0, "%d", "section"}, + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, +0, "%d", "node"}, + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, +0, "%d", "zone"}, + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, +0, "%#x", "lastcpupid"}, + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, +0, "%#x", "kasantag"}, +}; + +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long values[] = {section, node, zone, last_cpupid, kasan_tag}; + unsigned long page_flags = 0; + unsigned long size = 0; + bool append = false; + int i; + + flags &= BIT(NR_PAGEFLAGS) - 1; + if 
(flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s", name); + size = strlen(cmp_buf); +#if SECTIONS_WIDTH || NODES_WIDTH || ZONES_WIDTH || \ + LAST_CPUPID_WIDTH || KASAN_TAG_WIDTH + /* Other information also included in page flags */ + snprintf(cmp_buf + size, BUF_SIZE - size, "|"); + size = strlen(cmp_buf); +#endif + } + + /* Set the test value */ + for (i = 0; i < ARRAY_SIZE(pft); i++) + pft[i].value = values[i]; + + for (i = 0; i < ARRAY_SIZE(pft); i++) { + if (!pft[i].width) + continue; + + if (append) { + snprintf(cmp_buf + size, BUF_SIZE - size, "|"); + size = strlen(cmp_buf); + } + + page_flags |= (pft[i].value & pft[i].mask) << pft[i].shift; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s=", pft[i].name); + size = strlen(cmp_buf); + snprintf(cmp_buf + size, BUF_SIZE - size, pft[i].fmt, +pft[i].value & pft[i].mask); + size = strlen(cmp_buf); + append = true; + } + + test(cmp_buf, "%pGp", _flags); +} + static void __init flags(void) { unsigned long flags; - gfp_t gfp; char *cmp
[PATCH v6 0/3] mm, vsprintf: dump full information of page flags in pGp
The existing pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, we'd better interpret them in printf. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The newly added information is appended to the end of the existing one, e.g. [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The result of the test cases is as follows, [68599.816764] test_printf: loaded. [68599.819068] test_printf: all 388 tests passed [68599.830367] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. v6: - fixes the build failure and test failure reported by kernel test robot v5: - remove the bitmap and better name the struct, per Petr v4: - extend %pGp instead of introducing new format, per Matthew v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 90 --- lib/vsprintf.c| 66 +++-- mm/slub.c | 13 ++-- 4 files changed, 149 insertions(+), 22 deletions(-) -- 2.18.2
[PATCH 1/1] vsprintf: dump full information of page flags in pGp fix
The name of the flag should be printed using default_str_spec. There's no difference in the output after this change because the string is printed as-is with both default_dec_spec and default_flag_spec. This patch is a followup of the patchset "mm, vsprintf: dump full information of page flags in pGp" [1] [1]. https://lore.kernel.org/linux-mm/20210215155141.47432-1-laoar.s...@gmail.com/ Signed-off-by: Yafang Shao Cc: Petr Mladek Cc: Matthew Wilcox Cc: Andy Shevchenko Cc: Vlastimil Babka Cc: Miaohe Lin Cc: Joe Perches Cc: David Hildenbrand --- lib/vsprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 533ac5404180..5d034e799c06 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1963,7 +1963,7 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) buf++; } - buf = string(buf, end, p->name, *p->spec); + buf = string(buf, end, p->name, default_str_spec); if (buf < end) *buf = '='; buf++; -- 2.17.1
Re: [PATCH v5 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 22, 2021 at 8:38 PM Petr Mladek wrote: > > Hello, > > first, I am sorry for the late reply. I have marked the thread as > proceed by mistake last week... > > > On Mon 2021-02-15 23:51:41, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > To be compitable with the existed format of pGp, the new introduced ones > > also use '|' as the separator, then the user tools parsing pGp won't > > need to make change, suggested by Matthew. The new information is > > tracked onto the end of the existed one. > > > > One example of the output in mm/slub.c as follows, > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 8448.272530] Slab 0x90797883 objects=33 used=3 > > fp=0x790f1c26 > > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > > > The documentation and test cases are also updated. The output of the > > test cases as follows, > > [11585.830272] test_printf: loaded. > > [11585.830454] test_printf: all 388 tests passed > > [11585.831401] test_printf: unloaded. > > > > --- a/lib/vsprintf.c > > +++ b/lib/vsprintf.c > > +static > > +char *format_page_flags(char *buf, char *end, unsigned long flags) > > +{ > > + unsigned long main_flags = flags & (BIT(NR_PAGEFLAGS) - 1); > > + bool append = false; > > + int i; > > + > > + /* Page flags from the main area. */ > > + if (main_flags) { > > + buf = format_flags(buf, end, main_flags, pageflag_names); > > + append = true; > > + } > > + > > + /* Page flags from the fields area */ > > + for (i = 0; i < ARRAY_SIZE(pff); i++) { > > + /* Skip undefined fields. 
*/ > > + if (!pff[i].width) > > + continue; > > + > > + /* Format: Flag Name + '=' (equals sign) + Number + '|' > > (separator) */ > > + if (append) { > > + if (buf < end) > > + *buf = '|'; > > + buf++; > > + } > > + > > + buf = string(buf, end, pff[i].name, *pff[i].spec); > > I have found one more small issue. > > The purpose of the flag-specific printk_spec is to define the format > how the value is printed. The name of the flag should be printed > using default_str_spec. > > It works because the string is printed as-is with both > default_dec_spec and default_flag_spec. But it would be better > to use the string format. > Thanks for the explanation. > > + if (buf < end) > > + *buf = '='; > > + buf++; > > + buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask, > > + *pff[i].spec); > > + > > + append = true; > > + } > > + > > + return buf; > > +} > > Otherwise, the patch looks to me. The issue is cosmetic and might be > fixed either by re-spinning just this patch or by a followup patch. I will send a separate followup patch. > Either way, feel free to use: > > Reviewed-by: Petr Mladek > Thanks > Another question where to push this change. It is pity the we > finalized it in the middle of the merge window. It has to spend > at least few days in linux-next. > > I would like to hear from Andy before I push it into linux-next. > There is still theoretical chance to get it into 5.12 when Linus > prolongs the merge window by one week. it has been delayed by > a long lasting power outage. > > Best Regards, > Petr -- Thanks Yafang
Re: [PATCH v5 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 15, 2021 at 11:53 PM Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > the full information including section, node, zone, last cpupid and > kasan tag. While it is not easy to parse these information manually > because there're so many flavors. Let's interpret them in pGp as well. > > To be compitable with the existed format of pGp, the new introduced ones > also use '|' as the separator, then the user tools parsing pGp won't > need to make change, suggested by Matthew. The new information is > tracked onto the end of the existed one. > > One example of the output in mm/slub.c as follows, > - Before the patch, > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > - After the patch, > [ 8448.272530] Slab 0x90797883 objects=33 used=3 > fp=0x790f1c26 > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > The documentation and test cases are also updated. The output of the > test cases as follows, > [11585.830272] test_printf: loaded. > [11585.830454] test_printf: all 388 tests passed > [11585.831401] test_printf: unloaded. 
> > Signed-off-by: Yafang Shao > Cc: David Hildenbrand > Cc: Joe Perches > Cc: Miaohe Lin > Cc: Vlastimil Babka > Cc: Andy Shevchenko > Cc: Matthew Wilcox > Cc: Petr Mladek > --- > Documentation/core-api/printk-formats.rst | 2 +- > lib/test_printf.c | 60 + > lib/vsprintf.c| 66 +-- > 3 files changed, 112 insertions(+), 16 deletions(-) > > diff --git a/Documentation/core-api/printk-formats.rst > b/Documentation/core-api/printk-formats.rst > index 160e710d992f..00d07c7eefd4 100644 > --- a/Documentation/core-api/printk-formats.rst > +++ b/Documentation/core-api/printk-formats.rst > @@ -540,7 +540,7 @@ Flags bitfields such as page flags, gfp_flags > > :: > > - %pGpreferenced|uptodate|lru|active|private > + %pGp > referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f > %pGgGFP_USER|GFP_DMA32|GFP_NOWARN > %pGvread|exec|mayread|maywrite|mayexec|denywrite > > diff --git a/lib/test_printf.c b/lib/test_printf.c > index 7ac87f18a10f..148773dfe97a 100644 > --- a/lib/test_printf.c > +++ b/lib/test_printf.c > @@ -569,24 +569,68 @@ netdev_features(void) > { > } > > +static void __init > +page_flags_test(int section, int node, int zone, int last_cpupid, > + int kasan_tag, int flags, const char *name, char *cmp_buf) > +{ > + unsigned long page_flags = 0; > + unsigned long size = 0; > + > + flags &= BIT(NR_PAGEFLAGS) - 1; > + if (flags) { > + page_flags |= flags; > + snprintf(cmp_buf + size, BUF_SIZE - size, "%s|", name); > + size = strlen(cmp_buf); > + } > + > +#ifdef SECTION_IN_PAGE_FLAGS > + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; > + snprintf(cmp_buf + size, BUF_SIZE - size, "section=%#x|", sec); > + size = strlen(cmp_buf); > +#endif > + > + page_flags |= ((node & NODES_MASK) << NODES_PGSHIFT) | > + ((zone & ZONES_MASK) << ZONES_PGSHIFT); > + snprintf(cmp_buf + size, BUF_SIZE - size, "node=%d|zone=%d", node, > zone); > + size = strlen(cmp_buf); > + > +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS > + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << 
LAST_CPUPID_PGSHIFT; > + snprintf(cmp_buf + size, BUF_SIZE - size, "|lastcpupid=%#x", > last_cpupid); > + size = strlen(cmp_buf); > +#endif > + > +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) > + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; > + snprintf(cmp_buf + size, BUF_SIZE - size, "|kasantag=%#x", tag); > + size = strlen(cmp_buf); > +#endif > + > + test(cmp_buf, "%pGp", _flags); > +} > + > static void __init > flags(void) > { > unsigned long flags; > - gfp_t gfp; > char *cmp_buffer; > + gfp_t gfp; > + > + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); > + if (!cmp_buffer) > + return; > > flags = 0; > - test("", "%pGp", ); > + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); > > - /* Page flags should filter the zone id */ > flags = 1UL << NR_PAGEFLAGS; > - te
[PATCH v5 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Signed-off-by: Yafang Shao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko Reviewed-by: David Hildenbrand Cc: Matthew Wilcox --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ddd429832577..d66a7ebd21fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -615,7 +615,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -641,7 +641,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -698,7 +698,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -791,7 +791,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3855,7 +3855,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.18.2
[PATCH v5 0/3] mm, vsprintf: dump full information of page flags in pGp
The existing pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, we'd better interpret them in printf. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The newly added information is appended to the end of the existing one, e.g. [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The result of the test cases is as follows, [11585.830272] test_printf: loaded. [11585.830454] test_printf: all 388 tests passed [11585.831401] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. v5: - remove the bitmap and better name the struct, per Petr v4: - extend %pGp instead of introducing new format, per Matthew v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 60 + lib/vsprintf.c| 66 +-- mm/slub.c | 13 ++--- 4 files changed, 119 insertions(+), 22 deletions(-) -- 2.18.2
[PATCH v5 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse this information manually because there are so many flavors, let's interpret them in pGp as well. To be compatible with the existing format of pGp, the newly introduced ones also use '|' as the separator, so the user tools parsing pGp won't need to change, as suggested by Matthew. The new information is appended to the end of the existing one. One example of the output in mm/slub.c is as follows, - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 8448.272530] Slab 0x90797883 objects=33 used=3 fp=0x790f1c26 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The output of the test cases is as follows, [11585.830272] test_printf: loaded. [11585.830454] test_printf: all 388 tests passed [11585.831401] test_printf: unloaded. 
Signed-off-by: Yafang Shao Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Petr Mladek --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 60 + lib/vsprintf.c| 66 +-- 3 files changed, 112 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 160e710d992f..00d07c7eefd4 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -540,7 +540,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGp referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 7ac87f18a10f..148773dfe97a 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -569,24 +569,68 @@ netdev_features(void) { } +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long page_flags = 0; + unsigned long size = 0; + + flags &= BIT(NR_PAGEFLAGS) - 1; + if (flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s|", name); + size = strlen(cmp_buf); + } + +#ifdef SECTION_IN_PAGE_FLAGS + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "section=%#x|", sec); + size = strlen(cmp_buf); +#endif + + page_flags |= ((node & NODES_MASK) << NODES_PGSHIFT) | + ((zone & ZONES_MASK) << ZONES_PGSHIFT); + snprintf(cmp_buf + size, BUF_SIZE - size, "node=%d|zone=%d", node, zone); + size = strlen(cmp_buf); + +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|lastcpupid=%#x", last_cpupid); + size = strlen(cmp_buf); +#endif + +#if 
defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|kasantag=%#x", tag); + size = strlen(cmp_buf); +#endif + + test(cmp_buf, "%pGp", _flags); +} + static void __init flags(void) { unsigned long flags; - gfp_t gfp; char *cmp_buffer; + gfp_t gfp; + + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); + if (!cmp_buffer) + return; flags = 0; - test("", "%pGp", ); + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); - /* Page flags should filter the zone id */ flags = 1UL << NR_PAGEFLAGS; - test("", "%pGp", ); + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru | 1UL << PG_active | 1UL << PG_swapbacked; - test("uptodate|dirty|lru|active|swapbacked", "%pGp", ); - + page_flags_test(1, 1, 1, 0x1f, 1, flags, + "uptodate|dirty|lru|active|swapbacked", + cmp_buffer); flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_DENYWRITE; @@ -601,10 +645,6 @@ flags(void) gfp = __GF
[PATCH v5 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Andy Shevchenko --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 7ecbbbe5bc0c..ddd429832577 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -641,8 +641,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.18.2
Re: [PATCH v4 3/3] vsprintf: dump full information of page flags in pGp
On Wed, Feb 10, 2021 at 8:51 PM Petr Mladek wrote: > > On Wed 2021-02-10 00:21:37, Yafang Shao wrote: > > On Tue, Feb 9, 2021 at 9:53 PM Petr Mladek wrote: > > > > > > On Tue 2021-02-09 18:56:13, Yafang Shao wrote: > > > > Currently the pGp only shows the names of page flags, rather than > > > > the full information including section, node, zone, last cpupid and > > > > kasan tag. While it is not easy to parse these information manually > > > > because there're so many flavors. Let's interpret them in pGp as well. > > > > > > > > To be compitable with the existed format of pGp, the new introduced ones > > > > also use '|' as the separator, then the user tools parsing pGp won't > > > > need to make change, suggested by Matthew. The new information is > > > > tracked onto the end of the existed one. > > > > > > > > On example of the output in mm/slub.c as follows, > > > > - Before the patch, > > > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > > > > > - After the patch, > > > > [ 8838.835456] Slab 0x2828b78a objects=33 used=3 > > > > fp=0xd04efc88 > > > > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > > > > > > > The documentation and test cases are also updated. The output of the > > > > test cases as follows, > > > > [ 501.485081] test_printf: loaded. > > > > [ 501.485768] test_printf: all 388 tests passed > > > > [ 501.488762] test_printf: unloaded. 
> > > > > > > > > > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > > > > index 14c9a6af1b23..3f26611adb34 100644 > > > > --- a/lib/vsprintf.c > > > > +++ b/lib/vsprintf.c > > > > @@ -1916,6 +1916,66 @@ char *format_flags(char *buf, char *end, > > > > unsigned long flags, > > > > return buf; > > > > } > > > > > > > > +struct page_flags_layout { > > > > + int width; > > > > + int shift; > > > > + int mask; > > > > + const struct printf_spec *spec; > > > > + const char *name; > > > > +}; > > > > + > > > > +static const struct page_flags_layout pfl[] = { > > > > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, > > > > + _dec_spec, "section"}, > > > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, > > > > + _dec_spec, "node"}, > > > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, > > > > + _dec_spec, "zone"}, > > > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > > > + _flag_spec, "lastcpupid"}, > > > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, > > > > + _flag_spec, "kasantag"}, > > > > +}; > > > > + > > > > +static > > > > +char *format_page_flags(char *buf, char *end, unsigned long flags) > > > > +{ > > > > + DECLARE_BITMAP(mask, ARRAY_SIZE(pfl)); > > > > + unsigned long last; > > > > + int i; > > > > + > > > > + if (flags & (BIT(NR_PAGEFLAGS) - 1)) { > > > > + if (buf < end) > > > > + *buf = '|'; > > > > + buf++; > > > > + } > > > > > > This is far from obvious. You print '|' here because you printed > > > something somewhere else. See below. > > > > > > > + > > > > + for (i = 0; i < ARRAY_SIZE(pfl); i++) > > > > + __assign_bit(i, mask, pfl[i].width); > > > > > > The bitmap looks like an overkill. If I get it correctly, it is a > > > tricky way to handle only flags defined by the used build > > > configuration. See below. 
> > > > > > > + last = find_last_bit(mask, ARRAY_SIZE(pfl)); > > > > + > > > > + for_each_set_bit(i, mask, ARRAY_SIZE(pfl)) { > > > > + /* Format: Flag Name + '=' (equals sign) + Number + '|' > > > > (separator) */ > > > > + buf = string(buf, end, pfl[i].name, *pfl[i].spec); > > > > + > > > > + if (buf < end) > > > > + *buf = '='; > > >
Re: [PATCH v4 3/3] vsprintf: dump full information of page flags in pGp
On Tue, Feb 9, 2021 at 9:53 PM Petr Mladek wrote: > > On Tue 2021-02-09 18:56:13, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > To be compitable with the existed format of pGp, the new introduced ones > > also use '|' as the separator, then the user tools parsing pGp won't > > need to make change, suggested by Matthew. The new information is > > tracked onto the end of the existed one. > > > > On example of the output in mm/slub.c as follows, > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 8838.835456] Slab 0x2828b78a objects=33 used=3 > > fp=0xd04efc88 > > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > > > The documentation and test cases are also updated. The output of the > > test cases as follows, > > [ 501.485081] test_printf: loaded. > > [ 501.485768] test_printf: all 388 tests passed > > [ 501.488762] test_printf: unloaded. 
> > > > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > > index 14c9a6af1b23..3f26611adb34 100644 > > --- a/lib/vsprintf.c > > +++ b/lib/vsprintf.c > > @@ -1916,6 +1916,66 @@ char *format_flags(char *buf, char *end, unsigned > > long flags, > > return buf; > > } > > > > +struct page_flags_layout { > > + int width; > > + int shift; > > + int mask; > > + const struct printf_spec *spec; > > + const char *name; > > +}; > > + > > +static const struct page_flags_layout pfl[] = { > > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, > > + _dec_spec, "section"}, > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, > > + _dec_spec, "node"}, > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, > > + _dec_spec, "zone"}, > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > + _flag_spec, "lastcpupid"}, > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, > > + _flag_spec, "kasantag"}, > > +}; > > + > > +static > > +char *format_page_flags(char *buf, char *end, unsigned long flags) > > +{ > > + DECLARE_BITMAP(mask, ARRAY_SIZE(pfl)); > > + unsigned long last; > > + int i; > > + > > + if (flags & (BIT(NR_PAGEFLAGS) - 1)) { > > + if (buf < end) > > + *buf = '|'; > > + buf++; > > + } > > This is far from obvious. You print '|' here because you printed > something somewhere else. See below. > > > + > > + for (i = 0; i < ARRAY_SIZE(pfl); i++) > > + __assign_bit(i, mask, pfl[i].width); > > The bitmap looks like an overkill. If I get it correctly, it is a > tricky way to handle only flags defined by the used build > configuration. See below. 
> > > + last = find_last_bit(mask, ARRAY_SIZE(pfl)); > > + > > + for_each_set_bit(i, mask, ARRAY_SIZE(pfl)) { > > + /* Format: Flag Name + '=' (equals sign) + Number + '|' > > (separator) */ > > + buf = string(buf, end, pfl[i].name, *pfl[i].spec); > > + > > + if (buf < end) > > + *buf = '='; > > + buf++; > > + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, > > + *pfl[i].spec); > > + > > + /* No separator for the last entry */ > > + if (i != last) { > > + if (buf < end) > > + *buf = '|'; > > + buf++; > > + } > > + } > > + > > + return buf; > > +} > > + > > static noinline_for_stack > > char *flags_string(char *buf, char *end, void *flags_ptr, > > struct printf_spec spec, const char *fmt) > > @@ -1929,10 +1989,10 @@ char *flags_string(char *buf, char *end, void > > *flags_ptr, > > switch (fmt[1]) { > > case 'p': > > flags = *(unsigned long *)flags_ptr; > > - /* Remove zone id */ > > - flags &= (1UL << NR_PAGEFLAGS)
[PATCH v4 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there're so many flavors. Let's interpret them in pGp as well. To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The new information is tracked onto the end of the existed one. One example of the output in mm/slub.c as follows, - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The output of the test cases as follows, [ 501.485081] test_printf: loaded. [ 501.485768] test_printf: all 388 tests passed [ 501.488762] test_printf: unloaded. 
Signed-off-by: Yafang Shao Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 60 + lib/vsprintf.c| 66 +-- 3 files changed, 114 insertions(+), 14 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 6d26c5c6ac48..93c3e48ff30d 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -538,7 +538,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGp referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1f %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 7ac87f18a10f..148773dfe97a 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -569,24 +569,68 @@ netdev_features(void) { } +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long page_flags = 0; + unsigned long size = 0; + + flags &= BIT(NR_PAGEFLAGS) - 1; + if (flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, "%s|", name); + size = strlen(cmp_buf); + } + +#ifdef SECTION_IN_PAGE_FLAGS + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "section=%#x|", sec); + size = strlen(cmp_buf); +#endif + + page_flags |= ((node & NODES_MASK) << NODES_PGSHIFT) | + ((zone & ZONES_MASK) << ZONES_PGSHIFT); + snprintf(cmp_buf + size, BUF_SIZE - size, "node=%d|zone=%d", node, zone); + size = strlen(cmp_buf); + +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|lastcpupid=%#x", last_cpupid); + size = strlen(cmp_buf); +#endif + +#if 
defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|kasantag=%#x", tag); + size = strlen(cmp_buf); +#endif + + test(cmp_buf, "%pGp", _flags); +} + static void __init flags(void) { unsigned long flags; - gfp_t gfp; char *cmp_buffer; + gfp_t gfp; + + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); + if (!cmp_buffer) + return; flags = 0; - test("", "%pGp", ); + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); - /* Page flags should filter the zone id */ flags = 1UL << NR_PAGEFLAGS; - test("", "%pGp", ); + page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru | 1UL << PG_active | 1UL << PG_swapbacked; - test("uptodate|dirty|lru|active|swapbacked", "%pGp", ); - + page_flags_test(1, 1, 1, 0x1f, 1, flags, + "uptodate|dirty|lru|active|swapbacked", + cmp_buffer); flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_DENYWRITE; @@ -601,10 +645,6 @@ flags(void) gfp = __GFP_ATOMIC; t
[PATCH v4 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Signed-off-by: Yafang Shao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Matthew Wilcox --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 87ff086e68a4..2514c37ab4e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -612,7 +612,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -638,7 +638,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -695,7 +695,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -788,7 +788,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3854,7 +3854,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.17.1
[PATCH v4 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 34dcc09e2ec9..87ff086e68a4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -638,8 +638,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.17.1
[PATCH v4 0/3] mm, vsprintf: dump full information of page flags in pGp
The existed pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there are so many flavors. We'd better interpret them in printf. To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The new added information is tracked onto the end of the existed one, e.g. [ 8838.835456] Slab 0x2828b78a objects=33 used=3 fp=0xd04efc88 flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) The documentation and test cases are also updated. The result of the test cases as follows, [ 501.485081] test_printf: loaded. [ 501.485768] test_printf: all 388 tests passed [ 501.488762] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. v4: - extend %pGp instead of introducing new format, per Matthew v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 60 + lib/vsprintf.c| 66 +-- mm/slub.c | 13 ++--- 4 files changed, 121 insertions(+), 20 deletions(-) -- 2.17.1
Re: [PATCH v3 0/3] mm, vsprintf: introduce new format to dump full information of page flags
On Mon, Feb 8, 2021 at 8:20 PM Matthew Wilcox wrote: > > On Mon, Feb 08, 2021 at 06:14:36PM +0800, Yafang Shao wrote: > > To avoid breaking some tools which parsing pGp via debugfs or affecting > > the printf buffer, other new formats are introduced, so the user can choose > > what and in which order they want, suggested by Andy. These new introduced > > format as follows, > > pGpb: print other information first and then the names of page flags > > pGpl: print the names of page flags first and then the other info > > This is overengineering things. We already print in little-endian order, > and the other information should be tacked onto the end. Just extend > %pGp. Andy's suggestion to add another flag was a bad one. > All right. I will do it in the next version. -- Thanks Yafang
[PATCH v3 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Christoph Lameter Signed-off-by: Yafang Shao --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 34dcc09e2ec9..87ff086e68a4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -638,8 +638,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.17.1
[PATCH v3 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Cc: David Hildenbrand Signed-off-by: Yafang Shao --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 87ff086e68a4..2514c37ab4e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -612,7 +612,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -638,7 +638,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -695,7 +695,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -788,7 +788,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3854,7 +3854,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.17.1
[PATCH v3 0/3] mm, vsprintf: introduce new format to dump full information of page flags
The existed pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there are so many flavors. We'd better interpret them in printf. To avoid breaking some tools which parsing pGp via debugfs or affecting the printf buffer, other new formats are introduced, so the user can choose what and in which order they want, suggested by Andy. These new introduced format as follows, pGpb: print other information first and then the names of page flags pGpl: print the names of page flags first and then the other info The difference between them looks like the difference between big-endian and little-endian, that's why they are named like that. The examples of the output as follows, %pGpb 0x17c0010200(node=0|zone=2|lastcpupid=0x1f|slab|head) %pGpl 0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The doc and test cases are also updated. Below is the output of the test cases, [ 4299.847655] test_printf: loaded. [ 4299.848301] test_printf: all 404 tests passed [ 4299.850371] test_printf: unloaded. This patchset also includes some code cleanup in mm/slub.c. 
v3: - coding improvement, per Joe and Andy - the possible impact on debugfs and the fix of it, per Joe and Matthew - introduce new format instead of changing pGp, per Andy v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: introduce new format to dump full information of page flags Documentation/core-api/printk-formats.rst | 2 + lib/test_printf.c | 126 +++--- lib/vsprintf.c| 115 +++- mm/slub.c | 13 +-- 4 files changed, 233 insertions(+), 23 deletions(-) -- 2.17.1
[PATCH v3 3/3] vsprintf: introduce new format to dump full information of page flags
The existed pGp shows the names of page flags only, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there are so many flavors. We'd better interpret them in printf. To avoid breaking some tools which parsing pGp via debugfs or affecting the printf buffer, other new formats are introduced, so the user can choose what and in which order they want, suggested by Andy. These new introduced format as follows, pGpb: print other information first and then the names of page flags pGpl: print the names of page flags first and then the other info The difference between them looks like the difference between big-endian and little-endian, that's why they are named like that. The examples of the output as follows, %pGpb 0x17c0010200(node=0|zone=2|lastcpupid=0x1f|slab|head) %pGpl 0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) To be compatible with the existed format of pGp, the new introduced ones also use '|' as the separator, then the user tools parsing pGp won't need to make change, suggested by Matthew. The doc and test cases are also updated. Below is the output of the test cases, [ 4299.847655] test_printf: loaded. [ 4299.848301] test_printf: all 404 tests passed [ 4299.850371] test_printf: unloaded. 
Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Cc: Matthew Wilcox Signed-off-by: Yafang Shao --- Documentation/core-api/printk-formats.rst | 2 + lib/test_printf.c | 126 +++--- lib/vsprintf.c| 115 +++- 3 files changed, 226 insertions(+), 17 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 6d26c5c6ac48..56f8e0fc3963 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -539,6 +539,8 @@ Flags bitfields such as page flags, gfp_flags :: %pGpreferenced|uptodate|lru|active|private + %pGpb node=0|zone=2|referenced|uptodate|lru|active|private + %pGpl referenced|uptodate|lru|active|private|node=0|zone=2 %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 7ac87f18a10f..af2945bb730a 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -569,24 +569,130 @@ netdev_features(void) { } +static void __init +page_flags_build_info(char *cmp_buf, int prefix, int sec, int node, int zone, + int last_cpupid, int kasan_tag, unsigned long *flags) +{ + unsigned long size = prefix; + +#ifdef SECTION_IN_PAGE_FLAGS + *flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "section=%#x|", sec); + size = strlen(cmp_buf); +#endif + + *flags |= ((node & NODES_MASK) << NODES_PGSHIFT) | + ((zone & ZONES_MASK) << ZONES_PGSHIFT); + snprintf(cmp_buf + size, BUF_SIZE - size, "node=%d|zone=%d", node, zone); + size = strlen(cmp_buf); + +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS + *flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, "|lastcpupid=%#x", last_cpupid); + size = strlen(cmp_buf); +#endif + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + *flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - 
size, "|kasantag=%#x", tag); + size = strlen(cmp_buf); +#endif +} + static void __init -flags(void) +page_flags_build_names(char *cmp_buf, int prefix, const char *expect, + unsigned long flags, unsigned long *page_flags) { + *page_flags |= flags; + snprintf(cmp_buf + prefix, BUF_SIZE - prefix, "%s", expect); +} + +static void __init +__page_flags_test(const char *expect, unsigned long flags) +{ + test(expect, "%pGp", ); +} + +static void __init +page_flags_test_be(char *cmp_buf, int sec, int node, int zone, + int last_cpupid, int kasan_tag, const char *name, + unsigned long flags) +{ + unsigned long page_flags = 0; + int size; + + page_flags_build_info(cmp_buf, 0, sec, node, zone, last_cpupid, + kasan_tag, _flags); + + if (*name) { + size = strlen(cmp_buf); + if (size < BUF_SIZE - 2) { + *(cmp_buf + size) = '|'; + *(cmp_buf + size + 1) = '\0'; + } + page_flags_build_names(cmp_buf, strlen(cmp_buf), name, flags, _flags); + } + + test(cmp_buf, "%pGpb", _flags); +} + +static void __in
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Tue, Feb 2, 2021 at 12:16 AM Andy Shevchenko wrote: > > On Mon, Feb 01, 2021 at 09:49:59PM +0800, Yafang Shao wrote: > > On Mon, Feb 1, 2021 at 9:27 PM Andy Shevchenko > > wrote: > > > On Mon, Feb 01, 2021 at 07:56:10PM +0800, Yafang Shao wrote: > > ... > > > > > - Before the patch, > > > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > > > > > - After the patch, > > > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > > > 0x1f,slab|head) > > > > > > > > The Documentation and test cases are also updated. > > > > > > Thanks for an update, my comments below. > > > > > > ... > > > > > > > - %pGpreferenced|uptodate|lru|active|private > > > > + %pGpNode 0,Zone 2,referenced|uptodate|lru|active|private > > > > > > Since of the nature of printf() buffer, I wonder if these should be at > > > the end. > > > I.o.w. the question is is the added material more important to user to > > > see than > > > the existed one? > > > > > > > The existing one should be more important than the added one. > > But the order of output will not match with the value for page->flags. > > E.g. > > flags=0x17c0010200(slab|head,Node 0,Zone 2,Lastcpupid 0x1f) > > It may be strange to compare the value with the string. > > More I'm looking at it, more I'm thinking it should have different specifiers > for each group of desired flags to be printed. > > So, you leave %pGp as is and then add another letter to add more details, so > user will choose what and in which order they want. > > For example, let's assume %pGp == %pGpf and P is a new specifier for what you > are initially adding here: > > %pGpfP => referenced|uptodate|lru|active|private,Node 0,Zone 2 > %pGpPf => Node 0,Zone 2,referenced|uptodate|lru|active|private > > and so on. Thanks for your suggestion. I will think about it. -- Thanks Yafang
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 1, 2021 at 10:15 PM Matthew Wilcox wrote: > > On Mon, Feb 01, 2021 at 07:56:10PM +0800, Yafang Shao wrote: > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > 0x1f,slab|head) > > I would suggest it will be easier to parse as: > > flags=0x17c0010200(slab|head|node=0|zone=2|lastcpupid=0x1f) > > That should alleviate the concerns about debugfs format change -- we've > never guaranteed that flag names won't change, and they now look enough > like flags that parsers shouldn't fall over them. Good suggestion! I will do it as you suggested in the next version. -- Thanks Yafang
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 1, 2021 at 9:34 PM Andy Shevchenko wrote: > > On Mon, Feb 01, 2021 at 02:23:33PM +0100, David Hildenbrand wrote: > > On 01.02.21 12:56, Yafang Shao wrote: > > > Currently the pGp only shows the names of page flags, rather than > > > the full information including section, node, zone, last cpupid and > > > kasan tag. While it is not easy to parse these information manually > > > because there're so many flavors. Let's interpret them in pGp as well. > > > > > > - Before the patch, > > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > > > - After the patch, > > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > > 0x1f,slab|head) > > > > For debugging purposes, it might be helpful to have the actual zone name > > (and to know if the value is sane). You could obtain it (without other > > modifications) via > > > > const char *zname = "Invalid"; > > > > if (zone < MAX_NR_ZONES) > > zname = first_online_pgdat()->node_zones[zone].name; > > > > > > Similarly, it might also be helpful to indicate if a node is > > online/offline/invalid/. > > > > const char *nstate = "Invalid"; > > > > if (node_online(nid)) > > nstate = "Online"; > > else if (node_possible(nid)) > > nstate = "Offline"; > > > > > > Printing that in addition to the raw value could be helpful. Just some > > thoughts. > > printf() buffer is not a black hole, esp. when you get it messed with critical > messages (Oops). I suggest to reduce a burden as much as possible. If you wish > to get this, make it caller-configurable, i.e. adding another letter to the > specifier. > I think David's suggestion will help us to identify issues. You mean that we should use another one like "%pGpd" - means pGp debug - to implement it ? -- Thanks Yafang
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 1, 2021 at 9:27 PM Andy Shevchenko wrote: > > On Mon, Feb 01, 2021 at 07:56:10PM +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > 0x1f,slab|head) > > > > The Documentation and test cases are also updated. > > Thanks for an update, my comments below. > > ... > > > - %pGpreferenced|uptodate|lru|active|private > > + %pGpNode 0,Zone 2,referenced|uptodate|lru|active|private > > Since of the nature of printf() buffer, I wonder if these should be at the > end. > I.o.w. the question is is the added material more important to user to see > than > the existed one? > The existing one should be more important than the added one. But the order of output will not match with the value for page->flags. E.g. flags=0x17c0010200(slab|head,Node 0,Zone 2,Lastcpupid 0x1f) It may be strange to compare the value with the string. > ... > > > +static void __init > > +page_flags_test(int section, int node, int zone, int last_cpupid, > > + int kasan_tag, int flags, const char *name, char *cmp_buf) > > +{ > > + unsigned long page_flags = 0; > > + unsigned long size = 0; > > + > > +#ifdef SECTION_IN_PAGE_FLAGS > > + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; > > + snprintf(cmp_buf, BUF_SIZE, "Section %#x,", sec); > > I would keep it in the same form as latter ones, i.e. > > snprintf(cmp_buf + size, BUF_SIZE - size, "Section %#x,", sec); > > In this case it will be easier if at some point we might need to reshuffle. > Good suggestion. 
> > + size = strlen(cmp_buf); > > +#endif > > + > > + page_flags |= (node & NODES_MASK) << NODES_PGSHIFT; > > + snprintf(cmp_buf + size, BUF_SIZE - size, "Node %d", node); > > + size = strlen(cmp_buf); > > + > > + page_flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; > > + snprintf(cmp_buf + size, BUF_SIZE - size, ",Zone %d", zone); > > + size = strlen(cmp_buf); > > + > > +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS > > + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; > > + snprintf(cmp_buf + size, BUF_SIZE - size, ",Lastcpupid %#x", > > last_cpupid); > > + size = strlen(cmp_buf); > > +#endif > > + > > +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) > > + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; > > + snprintf(cmp_buf + size, BUF_SIZE - size, ",Kasantag %#x", tag); > > + size = strlen(cmp_buf); > > +#endif > > + > > + test(cmp_buf, "%pGp", _flags); > > + > > + if (flags) { > > If below will be always for flags set, I would rewrite this condition as > > if (!flags) > return; > > but it's up to you. > > > + page_flags |= flags; > > + snprintf(cmp_buf + size, BUF_SIZE - size, ",%s", name); > > + test(cmp_buf, "%pGp", _flags); > > + } > > +} > > ... > > > - flags = 0; > > > - flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru > > - | 1UL << PG_active | 1UL << PG_swapbacked; > > I would leave this untouched and reuse below... > > > + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); > > + if (!cmp_buffer) > > + return; > > ...as > > > + page_flags_test(0, 0, 0, 0, 0, 0, NULL, cmp_buffer); > > flags = 0; > page_flags_test(0, 0, 0, 0, 0, flags, NULL, cmp_buffer); > > > + page_flags_test(1, 1, 1, 0x1, 1, > > + (1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru > > + | 1UL << PG_active | 1UL << PG_swapbacked), > > + "uptodate|dirty|lru|active|swapbacked", > > +
Re: [PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
On Mon, Feb 1, 2021 at 9:05 PM Joe Perches wrote: > > On Mon, 2021-02-01 at 19:56 +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6343.396602] Slab 0x4382e02b objects=33 used=3 > > fp=0x9ae06ffc flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 > > fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid > > 0x1f,slab|head) > > While debugfs is not an ABI, this format is exported in debugfs to > userspace via mm/page_owner.c read_page_owner/print_page_owner. > Right, the page_owner will be affected by this change. > Does changing the output format matter to anyone? > The user tools which parse the page_owner may be affected. If we don't want to affect the userspace tools, I think we can make a little change in page_owner as follows, unsigned long masked_flags = page->flags & (BIT(NR_PAGEFLAGS) - 1); snprintf("..., %#lx(%pGp)\n", page->flags, _flags); > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > [] > > +static > > +char *format_page_flags(char *buf, char *end, unsigned long page_flags) > > +{ > > + unsigned long flags = page_flags & ((1UL << NR_PAGEFLAGS) - 1); > > + int size = ARRAY_SIZE(pfl); > > There's no real value in used-once variables. 
> > > + bool separator = false; > > + int i; > > + > > + for (i = 0; i < size; i++) { > > Use ARRAY_SIZE here instead > Sure > for (i = 0; i < ARRAY_SIZE(pfl); i++) { > > > + if (pfl[i].width == 0) > > + continue; > > + > > + if (separator) { > > + if (buf < end) > > + *buf = ','; > > + buf++; > > + } > > + > > + > > + buf = string(buf, end, pfl[i].name, *pfl[i].spec); > > + > > + buf = number(buf, end, (page_flags >> pfl[i].shift) & > > pfl[i].mask, > > + *pfl[i].spec); > > + separator = true; > > + } > > Style question: > Might this array be more intelligible with pointers instead of indexes? Good suggestion! I will change it in the next version. > Something like: > > struct page_flags_layout *p; > > for (p = pfl; p < pfl + ARRAY_SIZE(pfl); p++) { > if (p->width == 0) > continue; > > if (p > pfl) { > if (buf < end) > *buf = ','; > buf++; > } It doesn't work, because there's a 'continue' above, meaning that the p may be greater than pfl without doing anything. > > buf = string(buf, end, p->name, *p->spec); > buf = number(buf, end, (page_flags >> p->shift) & p->mask, > *p->spec); > } > > > + > > + if (flags) { > > Maybe: > > if (page_flags & (BIT(NR_PAGEFLAGS) - 1)) { > Sure. > > + if (buf < end) > > + *buf = ','; > > + buf++; > > + } > > + > > + return buf; > > +} > > + > > -- Thanks Yafang
[PATCH v2 3/3] vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there're so many flavors. Let's interpret them in pGp as well. - Before the patch, [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) - After the patch, [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid 0x1f,slab|head) The Documentation and test cases are also updated. Cc: David Hildenbrand Cc: Joe Perches Cc: Miaohe Lin Cc: Vlastimil Babka Cc: Andy Shevchenko Signed-off-by: Yafang Shao --- Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 65 ++- lib/vsprintf.c| 58 +++- 3 files changed, 109 insertions(+), 16 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 6d26c5c6ac48..1374cdd04f0f 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -538,7 +538,7 @@ Flags bitfields such as page flags, gfp_flags :: - %pGpreferenced|uptodate|lru|active|private + %pGpNode 0,Zone 2,referenced|uptodate|lru|active|private %pGgGFP_USER|GFP_DMA32|GFP_NOWARN %pGvread|exec|mayread|maywrite|mayexec|denywrite diff --git a/lib/test_printf.c b/lib/test_printf.c index 7ac87f18a10f..4c5e064cbe2e 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -569,6 +569,48 @@ netdev_features(void) { } +static void __init +page_flags_test(int section, int node, int zone, int last_cpupid, + int kasan_tag, int flags, const char *name, char *cmp_buf) +{ + unsigned long page_flags = 0; + unsigned long size = 0; + +#ifdef SECTION_IN_PAGE_FLAGS + page_flags |= (sec & SECTIONS_MASK) << SECTIONS_PGSHIFT; + snprintf(cmp_buf, BUF_SIZE, "Section %#x,", sec); + size = strlen(cmp_buf); +#endif + + page_flags |= (node & NODES_MASK) << NODES_PGSHIFT; + snprintf(cmp_buf + 
size, BUF_SIZE - size, "Node %d", node); + size = strlen(cmp_buf); + + page_flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, ",Zone %d", zone); + size = strlen(cmp_buf); + +#ifndef LAST_CPUPID_NOT_IN_PAGE_FLAGS + page_flags |= (last_cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, ",Lastcpupid %#x", last_cpupid); + size = strlen(cmp_buf); +#endif + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + page_flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + snprintf(cmp_buf + size, BUF_SIZE - size, ",Kasantag %#x", tag); + size = strlen(cmp_buf); +#endif + + test(cmp_buf, "%pGp", _flags); + + if (flags) { + page_flags |= flags; + snprintf(cmp_buf + size, BUF_SIZE - size, ",%s", name); + test(cmp_buf, "%pGp", _flags); + } +} + static void __init flags(void) { @@ -576,17 +618,16 @@ flags(void) gfp_t gfp; char *cmp_buffer; - flags = 0; - test("", "%pGp", ); - - /* Page flags should filter the zone id */ - flags = 1UL << NR_PAGEFLAGS; - test("", "%pGp", ); - - flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru - | 1UL << PG_active | 1UL << PG_swapbacked; - test("uptodate|dirty|lru|active|swapbacked", "%pGp", ); + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); + if (!cmp_buffer) + return; + page_flags_test(0, 0, 0, 0, 0, 0, NULL, cmp_buffer); + page_flags_test(1, 1, 1, 0x1, 1, + (1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru +| 1UL << PG_active | 1UL << PG_swapbacked), + "uptodate|dirty|lru|active|swapbacked", + cmp_buffer); flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_DENYWRITE; @@ -601,10 +642,6 @@ flags(void) gfp = __GFP_ATOMIC; test("__GFP_ATOMIC", "%pGg", ); - cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); - if (!cmp_buffer) - return; - /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); diff --git a/lib/vsprintf.c 
b/lib/vsprintf.c i
[PATCH v2 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's remove the prefix completely. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6343.396602] Slab 0x4382e02b objects=33 used=3 fp=0x9ae06ffc flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Suggested-by: Vlastimil Babka Cc: David Hildenbrand Signed-off-by: Yafang Shao --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 87ff086e68a4..2514c37ab4e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -612,7 +612,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -638,7 +638,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -695,7 +695,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -788,7 +788,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3854,7 +3854,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.17.1
[PATCH v2 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Signed-off-by: Yafang Shao --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 34dcc09e2ec9..87ff086e68a4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -638,8 +638,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, &page->flags); } -- 2.17.1
[PATCH v2 0/3] mm, vsprintf: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there're so many flavors. Let's interpret them in pGp as well. This patchset also includes some code cleanup in mm/slub.c. Below is the example of the output in mm/slub.c. - Before the patchset [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 - After the patchset [ 6871.296131] Slab 0xc0e19a37 objects=33 used=3 fp=0xc4902159 flags=0x17c0010200(Node 0,Zone 2,Lastcpupid 0x1f,slab|head) The documentation and test cases of pGp are also updated. Below is the result of the test_printf, [ 5091.307308] test_printf: loaded. [ 5091.308285] test_printf: all 388 tests passed [ 5091.309105] test_printf: unloaded. v2: - various coding improvement, per Joe, Miaohe, Vlastimil and Andy - remove the prefix completely in patch #2, per Vlastimil - Update the test cases, per Andy Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO vsprintf: dump full information of page flags in pGp Documentation/core-api/printk-formats.rst | 2 +- lib/test_printf.c | 65 ++- lib/vsprintf.c| 58 +++- mm/slub.c | 13 ++--- 4 files changed, 116 insertions(+), 22 deletions(-) -- 2.17.1
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 10:50 PM Andy Shevchenko wrote: > > On Thu, Jan 28, 2021 at 09:18:24PM +0800, Yafang Shao wrote: > > On Thu, Jan 28, 2021 at 8:11 PM Andy Shevchenko > > wrote: > > ... > > > Thanks for the explanation. > > I will change it as you suggested. > > You are welcome! > > Just note, that kasprintf() should work for this as well as for the rest > printf() cases. That's why it's very important to return proper buffer > pointer. > > Also read `man snprintf(3)` RETURN VALUE section to understand it. > Thanks for the information. I will take a look at it. -- Thanks Yafang
Re: [PATCH 0/3] mm, printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 8:12 PM Andy Shevchenko wrote: > > On Thu, Jan 28, 2021 at 10:19:44AM +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > This patchset also includes some code cleanup in mm/slub.c. > > > > Below is the example of the output in mm/slub.c. > > - Before the patchset > > [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 > > fp=0x8cd1579c flags=0x17c0010200 > > > > - After the patchset > > [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid > > 0x1f,slab|head) > > > Please, add a corresponding test cases to test_printf.c. W/o test cases NAK. > Sure. Will add the test cases in the next version. -- Thanks Yafang
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 8:11 PM Andy Shevchenko wrote: > > On Thu, Jan 28, 2021 at 10:19:47AM +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid > > 0x1f,slab|head) > > > + int i; > > + > > + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf > > < end; i++) { > > 'buf < end' is redundant. > Thanks for pointing this out. > > + if (pfl[i].width == 0) > > + continue; > > + > > + buf = string(buf, end, pfl[i].name, default_str_spec); > > > + if (buf >= end) > > + break; > > Can you rather use usual patter, i.e. > > if (buf < end) { > ...do something... > } > buf++; // or whatever increase should be done > > Moreover, number() and string() IIRC have the proper checks embedded into > them. > I will take a look at the detail in these two functions. > > + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, > > + default_flag_spec); > > > + if (buf >= end) > > + break; > > + *buf = ','; > > + buf++; > > Here is a very standard pattern can be used, see code around > > if (buf < end) > *buf = ','; > buf++; > > > + } > Thanks for the explanation. I will change it as you suggested. -- Thanks Yafang
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 6:42 PM Vlastimil Babka wrote: > > On 1/28/21 3:19 AM, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid > > 0x1f,slab|head) > > node, zone could be decimal? Make sense to me. Will change it. > > > > > Cc: David Hildenbrand > > Signed-off-by: Yafang Shao > > --- > > lib/vsprintf.c | 42 +- > > 1 file changed, 41 insertions(+), 1 deletion(-) > > > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > > index 3b53c73580c5..bd809f4f1b82 100644 > > --- a/lib/vsprintf.c > > +++ b/lib/vsprintf.c > > @@ -1916,6 +1916,46 @@ char *format_flags(char *buf, char *end, unsigned > > long flags, > > return buf; > > } > > > > +struct page_flags_layout { > > + int width; > > + int shift; > > + int mask; > > + char *name; > > +}; > > + > > +struct page_flags_layout pfl[] = { > > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, "Section "}, > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, "Node "}, > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, "Zone "}, > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > "Lastcpupid "}, > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, "Kasantag "}, > > +}; > > + > > +static > > +char *format_layout(char *buf, char *end, unsigned long flags) > > +{ > > + int i; > > + > > + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf > > < end; i++) { > > + if (pfl[i].width == 0) > > + continue; > > + > > + buf = string(buf, end, pfl[i].name, default_str_spec); > > 
+ > > + if (buf >= end) > > + break; > > + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, > > + default_flag_spec); > > + > > + if (buf >= end) > > + break; > > + *buf = ','; > > + buf++; > > + } > > + > > + return buf; > > +} > > + > > static noinline_for_stack > > char *flags_string(char *buf, char *end, void *flags_ptr, > > struct printf_spec spec, const char *fmt) > > @@ -1929,7 +1969,7 @@ char *flags_string(char *buf, char *end, void > > *flags_ptr, > > switch (fmt[1]) { > > case 'p': > > flags = *(unsigned long *)flags_ptr; > > - /* Remove zone id */ > > + buf = format_layout(buf, end, flags & ~((1UL << NR_PAGEFLAGS) > > - 1)); > > flags &= (1UL << NR_PAGEFLAGS) - 1; > > names = pageflag_names; > > break; > > > -- Thanks Yafang
Re: [PATCH 2/3] mm, slub: don't combine pr_err with INFO
On Thu, Jan 28, 2021 at 6:35 PM Vlastimil Babka wrote: > > On 1/28/21 3:19 AM, Yafang Shao wrote: > > It is strange to combine "pr_err" with "INFO", so let's clean them up. > > This patch is motivated by David's comment[1]. > > > > - before the patch > > [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 > > fp=0x60d32ca8 flags=0x17c0010200(slab|head) > > > > - after the patch > > [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(slab|head) > > > > [1]. > > https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t > > > > Cc: David Hildenbrand > > Signed-off-by: Yafang Shao > > These are usually printed as part of slab_bug() with its prominent banner. In > that sense it's additional details, thus INFO. The details itself are not > error, > thus ERR makes little sense imho. How about removing the prefix completely, or > just replacing with an ident to make it visually part of the BUG report. > Thanks for the explanation. I will remove the prefix completely in the next version. 
> > --- > > mm/slub.c | 10 +- > > 1 file changed, 5 insertions(+), 5 deletions(-) > > > > diff --git a/mm/slub.c b/mm/slub.c > > index 4b9ab267afbc..18b4474c8fa2 100644 > > --- a/mm/slub.c > > +++ b/mm/slub.c > > @@ -615,7 +615,7 @@ static void print_track(const char *s, struct track *t, > > unsigned long pr_time) > > if (!t->addr) > > return; > > > > - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", > > + pr_err("ERR: %s in %pS age=%lu cpu=%u pid=%d\n", > > s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); > > #ifdef CONFIG_STACKTRACE > > { > > @@ -641,7 +641,7 @@ void print_tracking(struct kmem_cache *s, void *object) > > > > static void print_page_info(struct page *page) > > { > > - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p > > flags=%#lx(%pGp)\n", > > + pr_err("ERR: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", > > page, page->objects, page->inuse, page->freelist, > > page->flags, >flags); > > > > @@ -698,7 +698,7 @@ static void print_trailer(struct kmem_cache *s, struct > > page *page, u8 *p) > > > > print_page_info(page); > > > > - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", > > + pr_err("ERR: Object 0x%p @offset=%tu fp=0x%p\n\n", > > p, p - addr, get_freepointer(s, p)); > > > > if (s->flags & SLAB_RED_ZONE) > > @@ -791,7 +791,7 @@ static int check_bytes_and_report(struct kmem_cache *s, > > struct page *page, > > end--; > > > > slab_bug(s, "%s overwritten", what); > > - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of > > 0x%x\n", > > + pr_err("ERR: 0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of > > 0x%x\n", > > fault, end - 1, fault - addr, > > fault[0], value); > > print_trailer(s, page, object); > > @@ -3855,7 +3855,7 @@ static void list_slab_objects(struct kmem_cache *s, > > struct page *page, > > for_each_object(p, s, addr, page->objects) { > > > > if (!test_bit(__obj_to_index(s, addr, p), map)) { > > - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - > > addr); > > + pr_err("ERR: Object 0x%p @offset=%tu\n", p, p - addr); > > print_tracking(s, p); > > } > > } > > > -- Thanks Yafang
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 10:52 AM Miaohe Lin wrote: > > Hi: > On 2021/1/28 10:19, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > > > > - Before the patch, > > [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(slab|head) > > > > - After the patch, > > [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 > > fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid > > 0x1f,slab|head) > > > > Thanks. This really helps! > > > Cc: David Hildenbrand > > Signed-off-by: Yafang Shao > > --- > > lib/vsprintf.c | 42 +- > > 1 file changed, 41 insertions(+), 1 deletion(-) > > > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > > index 3b53c73580c5..bd809f4f1b82 100644 > > --- a/lib/vsprintf.c > > +++ b/lib/vsprintf.c > > @@ -1916,6 +1916,46 @@ char *format_flags(char *buf, char *end, unsigned > > long flags, > > return buf; > > } > > > > +struct page_flags_layout { > > + int width; > > + int shift; > > + int mask; > > + char *name; > > Should we add const for name ? > Good suggestion. > > +}; > > + > > +struct page_flags_layout pfl[] = { > > Should we add static const for pfl[] as we won't change its value and use it > outside this file ? > Sure. 
> > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, "Section "}, > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, "Node "}, > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, "Zone "}, > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > "Lastcpupid "}, > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, "Kasantag "}, > > +}; > > + > > +static > > +char *format_layout(char *buf, char *end, unsigned long flags) > > +{ > > + int i; > > + > > + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf > > < end; i++) { > > I think we can use ARRAY_SIZE here. > Sure. > > + if (pfl[i].width == 0) > > + continue; > > + > > + buf = string(buf, end, pfl[i].name, default_str_spec); > > + > > + if (buf >= end) > > + break; > > + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, > > + default_flag_spec); > > + > > + if (buf >= end) > > + break; > > + *buf = ','; > > + buf++; > > + } > > + > > + return buf; > > +} > > + > > static noinline_for_stack > > char *flags_string(char *buf, char *end, void *flags_ptr, > > struct printf_spec spec, const char *fmt) > > @@ -1929,7 +1969,7 @@ char *flags_string(char *buf, char *end, void > > *flags_ptr, > > switch (fmt[1]) { > > case 'p': > > flags = *(unsigned long *)flags_ptr; > > - /* Remove zone id */ > > + buf = format_layout(buf, end, flags & ~((1UL << NR_PAGEFLAGS) > > - 1)); > > flags &= (1UL << NR_PAGEFLAGS) - 1; > > names = pageflag_names; > > break; > > > Many thanks. -- Thanks Yafang
Re: [PATCH 3/3] printk: dump full information of page flags in pGp
On Thu, Jan 28, 2021 at 10:35 AM Joe Perches wrote: > > On Thu, 2021-01-28 at 10:19 +0800, Yafang Shao wrote: > > Currently the pGp only shows the names of page flags, rather than > > the full information including section, node, zone, last cpupid and > > kasan tag. While it is not easy to parse these information manually > > because there're so many flavors. Let's interpret them in pGp as well. > [] > > diff --git a/lib/vsprintf.c b/lib/vsprintf.c > [] > > @@ -1916,6 +1916,46 @@ char *format_flags(char *buf, char *end, unsigned > > long flags, > > return buf; > > } > > > > +struct page_flags_layout { > > + int width; > > + int shift; > > + int mask; > > + char *name; > > +}; > > + > > +struct page_flags_layout pfl[] = { > > static const struct page_flags_layout pfl[] = { Sure. > > > + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, "Section "}, > > + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, "Node "}, > > + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, "Zone "}, > > + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, > > "Lastcpupid "}, > > + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, "Kasantag "}, > > +}; > > + > > +static > > +char *format_layout(char *buf, char *end, unsigned long flags) > > poor name. perhaps format_page_flags > Thanks for the suggestion. > > +{ > > + int i; > > + > > + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf > > < end; i++) { > > for (i = 0; i < ARRAY_SIZE(pfl) && buf < end; i++) { > Sure. > > > @@ -1929,7 +1969,7 @@ char *flags_string(char *buf, char *end, void > > *flags_ptr, > > switch (fmt[1]) { > > case 'p': > > flags = *(unsigned long *)flags_ptr; > > - /* Remove zone id */ > > + buf = format_layout(buf, end, flags & ~((1UL << NR_PAGEFLAGS) > > - 1)); > > flags &= (1UL << NR_PAGEFLAGS) - 1; > > Perhaps store the bitshift into a temp and use the temp twice > > foo = BIT(NR_PAGEFLAGS) - 1; > > buf = format_layout(buf, end, flags & ~foo); > flags &= foo; > > Thanks for the suggestion. 
I will change them all. -- Thanks Yafang
[PATCH 0/3] mm, printk: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. However, it is not easy to parse this information manually because there are so many flavors. Let's interpret them in pGp as well. This patchset also includes some code cleanup in mm/slub.c. Below is the example of the output in mm/slub.c. - Before the patchset [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 - After the patchset [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid 0x1f,slab|head) Yafang Shao (3): mm, slub: use pGp to print page flags mm, slub: don't combine pr_err with INFO printk: dump full information of page flags in pGp lib/vsprintf.c | 42 +- mm/slub.c | 13 +++-- 2 files changed, 48 insertions(+), 7 deletions(-) -- 2.17.1
[PATCH 3/3] printk: dump full information of page flags in pGp
Currently the pGp only shows the names of page flags, rather than the full information including section, node, zone, last cpupid and kasan tag. While it is not easy to parse these information manually because there're so many flavors. Let's interpret them in pGp as well. - Before the patch, [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 fp=0x6d0779d1 flags=0x17c0010200(slab|head) - After the patch, [ 6315.235783] ERR: Slab 0x6d1133b9 objects=33 used=3 fp=0x6d0779d1 flags=0x17c0010200(Node 0x0,Zone 0x2,Lastcpupid 0x1f,slab|head) Cc: David Hildenbrand Signed-off-by: Yafang Shao --- lib/vsprintf.c | 42 +- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3b53c73580c5..bd809f4f1b82 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1916,6 +1916,46 @@ char *format_flags(char *buf, char *end, unsigned long flags, return buf; } +struct page_flags_layout { + int width; + int shift; + int mask; + char *name; +}; + +struct page_flags_layout pfl[] = { + {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, "Section "}, + {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, "Node "}, + {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, "Zone "}, + {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, "Lastcpupid "}, + {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, "Kasantag "}, +}; + +static +char *format_layout(char *buf, char *end, unsigned long flags) +{ + int i; + + for (i = 0; i < sizeof(pfl) / sizeof(struct page_flags_layout) && buf < end; i++) { + if (pfl[i].width == 0) + continue; + + buf = string(buf, end, pfl[i].name, default_str_spec); + + if (buf >= end) + break; + buf = number(buf, end, (flags >> pfl[i].shift) & pfl[i].mask, +default_flag_spec); + + if (buf >= end) + break; + *buf = ','; + buf++; + } + + return buf; +} + static noinline_for_stack char *flags_string(char *buf, char *end, void *flags_ptr, struct printf_spec spec, const char *fmt) @@ -1929,7 +1969,7 @@ char *flags_string(char *buf, char *end, void 
*flags_ptr, switch (fmt[1]) { case 'p': flags = *(unsigned long *)flags_ptr; - /* Remove zone id */ + buf = format_layout(buf, end, flags & ~((1UL << NR_PAGEFLAGS) - 1)); flags &= (1UL << NR_PAGEFLAGS) - 1; names = pageflag_names; break; -- 2.17.1
[PATCH 1/3] mm, slub: use pGp to print page flags
As pGp has been already introduced in printk, we'd better use it to make the output human readable. Before this change, the output is, [ 6155.716018] INFO: Slab 0x4027dd4f objects=33 used=3 fp=0x8cd1579c flags=0x17c0010200 While after this change, the output is, [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) Reviewed-by: David Hildenbrand Signed-off-by: Yafang Shao --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 69742ab9a21d..4b9ab267afbc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -641,8 +641,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, >flags); } -- 2.17.1
[PATCH 2/3] mm, slub: don't combine pr_err with INFO
It is strange to combine "pr_err" with "INFO", so let's clean them up. This patch is motivated by David's comment[1]. - before the patch [ 8846.517809] INFO: Slab 0xf42a2c60 objects=33 used=3 fp=0x60d32ca8 flags=0x17c0010200(slab|head) - after the patch [ 6312.639698] ERR: Slab 0x6d1133b9 objects=33 used=3 fp=0x6d0779d1 flags=0x17c0010200(slab|head) [1]. https://lore.kernel.org/linux-mm/b9c0f2b6-e9b0-0c36-ebdd-2bc684c5a...@redhat.com/#t Cc: David Hildenbrand Signed-off-by: Yafang Shao --- mm/slub.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 4b9ab267afbc..18b4474c8fa2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -615,7 +615,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("ERR: %s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -641,7 +641,7 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + pr_err("ERR: Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", page, page->objects, page->inuse, page->freelist, page->flags, >flags); @@ -698,7 +698,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("ERR: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -791,7 +791,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("ERR: 0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -3855,7 +3855,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("ERR: Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } -- 2.17.1
Re: [xfs] db962cd266: Assertion_failed
On Sat, Jan 2, 2021 at 5:53 AM Dave Chinner wrote: > > On Fri, Jan 01, 2021 at 05:10:49PM +0800, Yafang Shao wrote: > > On Thu, Dec 31, 2020 at 10:46 AM kernel test robot > > wrote: > . > > > [ 552.905799] XFS: Assertion failed: !current->journal_info, file: > > > fs/xfs/xfs_trans.h, line: 280 > > > [ 553.104459] xfs_trans_reserve+0x225/0x320 [xfs] > > > [ 553.110556] xfs_trans_roll+0x6e/0xe0 [xfs] > > > [ 553.116134] xfs_defer_trans_roll+0x104/0x2a0 [xfs] > > > [ 553.122489] ? xfs_extent_free_create_intent+0x62/0xc0 [xfs] > > > [ 553.129780] xfs_defer_finish_noroll+0xb8/0x620 [xfs] > > > [ 553.136299] xfs_defer_finish+0x11/0xa0 [xfs] > > > [ 553.142017] xfs_itruncate_extents_flags+0x141/0x440 [xfs] > > > [ 553.149053] xfs_setattr_size+0x3da/0x480 [xfs] > > > [ 553.154939] ? setattr_prepare+0x6a/0x1e0 > > > [ 553.160250] xfs_vn_setattr+0x70/0x120 [xfs] > > > [ 553.165833] notify_change+0x364/0x500 > > > [ 553.170820] ? do_truncate+0x76/0xe0 > > > [ 553.175673] do_truncate+0x76/0xe0 > > > [ 553.180184] path_openat+0xe6c/0x10a0 > > > [ 553.184981] do_filp_open+0x91/0x100 > > > [ 553.189707] ? __check_object_size+0x136/0x160 > > > [ 553.195493] do_sys_openat2+0x20d/0x2e0 > > > [ 553.200481] do_sys_open+0x44/0x80 > > > [ 553.204926] do_syscall_64+0x33/0x40 > > > [ 553.209588] entry_SYSCALL_64_after_hwframe+0x44/0xa9 > > > > Thanks for the report. > > > > At a first glance, it seems we should make a similar change as we did > > in xfs_trans_context_clear(). > > > > static inline void > > xfs_trans_context_set(struct xfs_trans *tp) > > { > > /* > > * We have already handed over the context via xfs_trans_context_swap(). > > */ > > if (current->journal_info) > > return; > > current->journal_info = tp; > > tp->t_pflags = memalloc_nofs_save(); > > } > > Ah, no. > > Remember how I said "split out the wrapping of transaction > context setup in xfs_trans_reserve() from > the lifting of the context setting into xfs_trans_alloc()"? 
> > Well, you did the former and dropped the latter out of the patch > set. > I misunderstood what you mean. > Now when a transaction rolls after xfs_trans_context_swap(), it > calls xfs_trans_reserve() and tries to do transaction context setup > work inside a transaction context that already exists. IOWs, you > need to put the patch that lifts of the context setting up into > xfs_trans_alloc() back into the patchset before adding the > current->journal functionality patch. > Sure. > Also, you need to test XFS code with CONFIG_XFS_DEBUG=y so that > asserts are actually built into the code and exercised, because this > ASSERT should have fired on the first rolling transaction that the > kernel executes... > I really forgot to enable CONFIG_XFS_DEBUG... -_-b -- Thanks Yafang
Re: [xfs] db962cd266: Assertion_failed
On Thu, Dec 31, 2020 at 10:46 AM kernel test robot wrote: > > > Greeting, > > FYI, we noticed the following commit (built with gcc-9): > > commit: db962cd266c4d3230436aec2899186510797d49f ("[PATCH v14 4/4] xfs: use > current->journal_info to avoid transaction reservation recursion") > url: > https://github.com/0day-ci/linux/commits/Yafang-Shao/xfs-avoid-transaction-reservation-recursion/20201222-092315 > base: https://git.kernel.org/cgit/fs/xfs/xfs-linux.git for-next > > in testcase: filebench > version: filebench-x86_64-22620e6-1_20201112 > with following parameters: > > disk: 1HDD > fs: btrfs > test: fivestreamwrite.f > cpufreq_governor: performance > ucode: 0x42e > > > > on test machine: 48 threads Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz with > 112G memory > > caused below changes (please refer to attached dmesg/kmsg for entire > log/backtrace): > > > If you fix the issue, kindly add following tag > Reported-by: kernel test robot > > > [ 552.503501] > [ 552.525993] /usr/bin/wget -q --timeout=1800 --tries=1 > --local-encoding=UTF-8 > http://internal-lkp-server:80/~lkp/pkg/linux/x86_64-rhel-8.3/gcc-9/5c8fe583cce542aa0b84adc939ce85293de36e5e/modules.cgz > -N -P > /opt/rootfs/tmp/pkg/linux/x86_64-rhel-8.3/gcc-9/5c8fe583cce542aa0b84adc939ce85293de36e5e > [ 552.525995] > [ 552.884581] > /opt/rootfs/tmp/pkg/linux/x86_64-rhel-8.3/gcc-9/5c8fe583cce542aa0b84adc939ce85293de36e5e/modules.cgz > isn't modified > [ 552.884583] > [ 552.905799] XFS: Assertion failed: !current->journal_info, file: > fs/xfs/xfs_trans.h, line: 280 > [ 552.907594] /usr/bin/wget -q --timeout=1800 --tries=1 > --local-encoding=UTF-8 > http://internal-lkp-server:80/~lkp/osimage/ucode/intel-ucode-20201117.cgz -N > -P /opt/rootfs/tmp/osimage/ucode > [ 552.916568] > [ 552.916574] [ cut here ] > [ 552.939361] /opt/rootfs/tmp/osimage/ucode/intel-ucode-20201117.cgz isn't > modified > [ 552.940036] kernel BUG at fs/xfs/xfs_message.c:110! 
> [ 552.946338] > [ 552.955784] invalid opcode: [#1] SMP PTI > [ 552.971010] CPU: 46 PID: 3793 Comm: kexec-lkp Not tainted > 5.10.0-rc5-00044-gdb962cd266c4 #1 > [ 552.981331] Hardware name: Intel Corporation S2600WP/S2600WP, BIOS > SE5C600.86B.02.02.0002.122320131210 12/23/2013 > [ 552.993907] RIP: 0010:assfail+0x23/0x28 [xfs] > [ 552.999797] Code: 67 fc ff ff 0f 0b c3 0f 1f 44 00 00 41 89 c8 48 89 d1 48 > 89 f2 48 c7 c6 30 58 be c0 e8 82 f9 ff ff 80 3d b1 80 0a 00 00 74 02 <0f> 0b > 0f 0b c3 48 8d 45 10 48 89 e2 4c 89 e6 48 89 1c 24 48 89 44 > [ 553.022798] RSP: 0018:c90006a139c8 EFLAGS: 00010202 > [ 553.029624] RAX: RBX: 889c3edea700 RCX: > > [ 553.038646] RDX: ffc0 RSI: 000a RDI: > c0bd7bab > [ 553.047600] RBP: c90006a13a14 R08: R09: > > [ 553.056536] R10: 000a R11: f000 R12: > > [ 553.065546] R13: R14: 889c3ede91c8 R15: > 888f44608000 > [ 553.074455] FS: 77fc9580() GS:889bea38() > knlGS: > [ 553.084494] CS: 0010 DS: ES: CR0: 80050033 > [ 553.091837] CR2: 555a1420 CR3: 001c30fbe002 CR4: > 001706e0 > [ 553.100720] Call Trace: > [ 553.104459] xfs_trans_reserve+0x225/0x320 [xfs] > [ 553.110556] xfs_trans_roll+0x6e/0xe0 [xfs] > [ 553.116134] xfs_defer_trans_roll+0x104/0x2a0 [xfs] > [ 553.122489] ? xfs_extent_free_create_intent+0x62/0xc0 [xfs] > [ 553.129780] xfs_defer_finish_noroll+0xb8/0x620 [xfs] > [ 553.136299] xfs_defer_finish+0x11/0xa0 [xfs] > [ 553.142017] xfs_itruncate_extents_flags+0x141/0x440 [xfs] > [ 553.149053] xfs_setattr_size+0x3da/0x480 [xfs] > [ 553.154939] ? setattr_prepare+0x6a/0x1e0 > [ 553.160250] xfs_vn_setattr+0x70/0x120 [xfs] > [ 553.165833] notify_change+0x364/0x500 > [ 553.170820] ? do_truncate+0x76/0xe0 > [ 553.175673] do_truncate+0x76/0xe0 > [ 553.180184] path_openat+0xe6c/0x10a0 > [ 553.184981] do_filp_open+0x91/0x100 > [ 553.189707] ? 
__check_object_size+0x136/0x160 > [ 553.195493] do_sys_openat2+0x20d/0x2e0 > [ 553.200481] do_sys_open+0x44/0x80 > [ 553.204926] do_syscall_64+0x33/0x40 > [ 553.209588] entry_SYSCALL_64_after_hwframe+0x44/0xa9 > [ 553.215867] RIP: 0033:0x77ef11ae > [ 553.220579] Code: 25 00 00 41 00 3d 00 00 41 00 74 48 48 8d 05 59 65 0d 00 > 8b 00 85 c0 75 69 89 f2 b8 01 01 00 00 48 89 fe bf 9c ff ff ff 0f 05 <48> 3d > 00 f0 ff ff 0f 87 a6 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 > [ 553.24287
Re: [PATCH 3/6] sched: make struct sched_statistics independent of fair sched class
On Tue, Dec 1, 2020 at 8:41 PM Mel Gorman wrote: > > On Tue, Dec 01, 2020 at 07:54:13PM +0800, Yafang Shao wrote: > > If we want to use schedstats facility, we should move out of > > struct sched_statistics from the struct sched_entity or add it into other > > sctructs of sched entity as well. Obviously the latter one is bad because > > that requires more spaces. So we should move it into a common struct which > > can be used by all sched classes. > > > > The struct sched_statistics is the schedular statistics of a task_struct > > or a task_group. So we can move it into struct task_struct and > > struct task_group to achieve the goal. > > > > Below is the detailed explaination of the change in the structs. > > > > - Before this patch > > > > struct task_struct {|-> struct sched_entity { > > ... | ... > > struct sched_entity *se; ---| struct sched_statistics statistics; > > struct sched_rt_entity *rt; ... > > ... ... > > }; }; > > > > struct task_group { |--> se[0]->statistics : schedstats of CPU0 > > ... | > > #ifdef CONFIG_FAIR_GROUP_SCHED | > > struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 > > | > > #endif | > > |--> se[N]->statistics : schedstats of CPUn > > > > #ifdef CONFIG_FAIR_GROUP_SCHED > > struct sched_rt_entity **rt_se; (N/A) > > #endif > > ... > > }; > > > > The '**se' in task_group is allocated in the fair sched class, which is > > hard to be reused by other sched class. > > > > - After this patch > > > > struct task_struct { > > ... > > struct sched_statistics statistics; > > ... > > struct sched_entity *se; > > struct sched_rt_entity *rt; > > ... > > }; > > > > struct task_group {|---> stats[0] : of CPU0 > > ...| > > struct sched_statistics **stats; --|---> stats[1] : of CPU1 > > ...| > >|---> stats[n] : of CPUn > > #ifdef CONFIG_FAIR_GROUP_SCHED > > struct sched_entity **se; > > #endif > > #ifdef CONFIG_RT_GROUP_SCHED > > struct sched_rt_entity **rt_se; > > #endif > > ... 
> > }; > > > > After the patch it is clearly that both of se or rt_se can easily get the > > sched_statistics by a task_struct or a task_group. > > > > Signed-off-by: Yafang Shao > > I didn't see anything wrong as such, it's mostly a mechanical > conversion. The one slight caveat is the potential change in cache > location for the statistics but it's not necessarily negative. The stats > potentially move to a different cache line but it's less obvious whether > that even matters given the location is very similar. > > There is increased overhead now when schedstats are *enabled* because > _schedstat_from_sched_entity() has to be called but it appears that it is > protected by a schedstat_enabled() check. So ok, schedstats when enabled > are now a bit more expensive but they were expensive in the first place > so does it matter? > > I'd have been happied if there was a comparison with schedstats enabled > just in case the overhead is too high but it could also do with a second > set of eyeballs. > Sure, I will do the comparison. Thanks for the review again. > It's somewhat tentative but > > Acked-by: Mel Gorman > > -- > Mel Gorman > SUSE Labs -- Thanks Yafang
Re: [PATCH 4/6] sched: make schedstats helpers independent of fair sched class
On Tue, Dec 1, 2020 at 8:46 PM Mel Gorman wrote: > > On Tue, Dec 01, 2020 at 07:54:14PM +0800, Yafang Shao wrote: > > The original prototype of the schedstats helpers are > > > > update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) > > > > The cfs_rq in these helpers is used to get the rq_clock, and the se is > > used to get the struct sched_statistics and the struct task_struct. In > > order to make these helpers available by all sched classes, we can pass > > the rq, sched_statistics and task_struct directly. > > > > Then the new helpers are > > > > update_stats_wait_*(struct rq *rq, struct task_struct *p, > > struct sched_statistics *stats) > > > > which are independent of fair sched class. > > > > To avoid vmlinux growing too large or introducing ovehead when > > !schedstat_enabled(), some new helpers after schedstat_enabled() are also > > introduced, Suggested by Mel. These helpers are in sched/stats.c, > > > > __update_stats_wait_*(struct rq *rq, struct task_struct *p, > > struct sched_statistics *stats) > > > > Cc: Mel Gorman > > Signed-off-by: Yafang Shao > > Think it's ok, it's mostly code shuffling. I'd have been happier if > there was evidence showing a before/after comparison of the sched stats > for something simple like "perf bench sched pipe" and a clear statement > of no functional change as well as a comparison of the vmlinux files but > I think it's ok so; > Thanks for the review, I will do the comparison as you suggested. > Acked-by: Mel Gorman > Thanks! > I didn't look at the rt.c parts > > -- > Mel Gorman > SUSE Labs -- Thanks Yafang
[PATCH 5/6] sched, rt: support sched_stat_runtime tracepoint for RT sched class
The runtime of an RT task is already tracked there, so we only need to place the sched_stat_runtime tracepoint there. One difference between a fair task and an RT task is that there is no vruntime in an RT task. To reuse the sched_stat_runtime tracepoint, '0' is passed as vruntime for an RT task. The output of this tracepoint for an RT task is as follows, stress-9748[039] d.h. 113.519352: sched_stat_runtime: comm=stress pid=9748 runtime=997573 [ns] vruntime=0 [ns] stress-9748[039] d.h. 113.520352: sched_stat_runtime: comm=stress pid=9748 runtime=997627 [ns] vruntime=0 [ns] stress-9748[039] d.h. 113.521352: sched_stat_runtime: comm=stress pid=9748 runtime=998203 [ns] vruntime=0 [ns] Signed-off-by: Yafang Shao --- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2d543a270dfe..da989653b0a2 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1011,6 +1011,8 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->stats.exec_max, max(curr->stats.exec_max, delta_exec)); + trace_sched_stat_runtime(curr, delta_exec, 0); + curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- 2.18.4
[PATCH 6/6] sched, rt: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. This patch enable it for RT sched class as well. After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for RT sched class. The schedstat usage in RT sched class is similar with fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end The user can get the schedstats information in the same way in fair sched class. For example, fairRT task show /proc/[pid]/sched /proc/[pid]/sched group show cpu.stat in cgroup cpu.stat in cgroup The output of a RT task's schedstats as follows, $ cat /proc/10461/sched ... stats.sum_sleep_runtime : 37966.502936 stats.wait_start : 0.00 stats.sleep_start: 0.00 stats.block_start:279182.986012 stats.sleep_max : 9.001121 stats.block_max : 9.292692 stats.exec_max : 0.090009 stats.slice_max : 0.00 stats.wait_max : 0.005305 stats.wait_sum : 0.352352 stats.wait_count : 236173 stats.iowait_sum : 37875.625128 stats.iowait_count : 235933 stats.nr_migrations_cold :0 stats.nr_failed_migrations_affine:0 stats.nr_failed_migrations_running :0 stats.nr_failed_migrations_hot :0 stats.nr_forced_migrations :0 stats.nr_wakeups : 236172 stats.nr_wakeups_sync:0 stats.nr_wakeups_migrate :2 stats.nr_wakeups_local : 235865 stats.nr_wakeups_remote : 307 stats.nr_wakeups_affine :0 stats.nr_wakeups_affine_attempts :0 stats.nr_wakeups_passive :0 stats.nr_wakeups_idle:0 ... The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can be used to trace RT tasks as well. The output of these tracepoints for a RT tasks as follows, - blocked & iowait kworker/48:1-442 [048] d... 
539.830872: sched_stat_iowait: comm=stress pid=10461 delay=158242 [ns] kworker/48:1-442 [048] d... 539.830872: sched_stat_blocked: comm=stress pid=10461 delay=158242 [ns] - wait stress-10460 [001] dN.. 813.965304: sched_stat_wait: comm=stress pid=10462 delay=7536 [ns] stress-10462 [001] d.h. 813.966300: sched_stat_runtime: comm=stress pid=10462 runtime=993812 [ns] vruntime=0 [ns] [...] stress-10462 [001] d.h. 814.065300: sched_stat_runtime: comm=stress pid=10462 runtime=994484 [ns] vruntime=0 [ns] [ totally 100 times of sched_stat_runtime for pid 10462] [ The delay of pid 10462 is the sum of above runtime ] stress-10462 [001] dN.. 814.065307: sched_stat_wait: comm=stress pid=10460 delay=11089 [ns] stress-10460 [001] d.h. 814.066299: sched_stat_runtime: comm=stress pid=10460 runtime=991934 [ns] vruntime=0 [ns] - sleep sleep-15582 [041] dN.. 1732.814348: sched_stat_sleep: comm=sleep.sh pid=15474 delay=1001223130 [ns] sleep-15584 [041] dN.. 1733.815908: sched_stat_sleep: comm=sleep.sh pid=15474 delay=1001238954 [ns] [ In sleep.sh, it sleeps 1 sec each time. ] Signed-off-by: Yafang Shao --- kernel/sched/rt.c | 134 +- 1 file changed, 133 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index da989653b0a2..f764c2b9070d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1271,6 +1271,121 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr rt_se->on_list = 0; } +static inline void +__schedstat_from_sched_rt_entity(struct sched_rt_entity
[PATCH 4/6] sched: make schedstats helpers independent of fair sched class
The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing ovehead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) Cc: Mel Gorman Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 140 +++ kernel/sched/stats.c | 104 kernel/sched/stats.h | 32 ++ 3 files changed, 157 insertions(+), 119 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14d8df308d44..b869a83fac29 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -917,69 +917,44 @@ static void update_curr_fair(struct rq *rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; - u64 wait_start, prev_wait_start; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - __schedstat_from_sched_entity(se, ); - - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(stats->wait_start); + if (entity_is_task(se)) + p = task_of(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + __schedstat_from_sched_entity(se, ); - __schedstat_set(stats->wait_start, wait_start); + 
__update_stats_wait_start(rq_of(cfs_rq), p, stats); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 delta; if (!schedstat_enabled()) return; - __schedstat_from_sched_entity(se, ); - - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start); - if (entity_is_task(se)) { + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. -*/ - __schedstat_set(stats->wait_start, delta); - - return; - } - - trace_sched_stat_wait(p, delta); - } + __schedstat_from_sched_entity(se, ); - __schedstat_set(stats->wait_max, - max(schedstat_val(stats->wait_max), delta)); - __schedstat_inc(stats->wait_count); - __schedstat_add(stats->wait_sum, delta); - __schedstat_set(stats->wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; @@ -989,67 +964,14 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) __schedstat_from_sched_entity(se, ); - sleep_start = schedstat_val(stats->sleep_start); - block_start = schedstat_val(stats->block_start); - - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(stats->sleep_max))) - __schedstat_set(stats->sleep_max, delta); - - __schedstat_set(stats->sleep_start, 0); - __schedstat_add(stats->sum_sleep_runtime, delta); - - if (p) { - 
account_scheduler_latency(p, delta >> 10, 1); - trace_sched_stat_sleep(p, delta)
[PATCH 2/6] sched, fair: use __schedstat_set() in set_next_entity()
schedstat_enabled() has already been checked, so we can use __schedstat_set() directly. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ff1daa3d9bb..7e7c03cede94 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4414,7 +4414,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, + __schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -- 2.18.4
[PATCH 3/6] sched: make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility, we should move struct sched_statistics out of struct sched_entity or add it into the other structs of sched entities as well. Obviously the latter one is bad because that requires more space. So we should move it into a common struct which can be used by all sched classes. The struct sched_statistics is the scheduler statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. Below is the detailed explanation of the change in the structs. - Before this patch struct task_struct {|-> struct sched_entity { ... | ... struct sched_entity *se; ---| struct sched_statistics statistics; struct sched_rt_entity *rt; ... ... ... }; }; struct task_group { |--> se[0]->statistics : schedstats of CPU0 ... | #ifdef CONFIG_FAIR_GROUP_SCHED | struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 | #endif | |--> se[N]->statistics : schedstats of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_rt_entity **rt_se; (N/A) #endif ... }; The '**se' in task_group is allocated in the fair sched class, which is hard to reuse in other sched classes. - After this patch struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; After the patch it is clear that both se and rt_se can easily get the sched_statistics from a task_struct or a task_group.
Signed-off-by: Yafang Shao --- include/linux/sched.h| 3 +- kernel/sched/core.c | 24 --- kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 81 +++--- kernel/sched/fair.c | 142 ++- kernel/sched/rt.c| 4 +- kernel/sched/sched.h | 3 + kernel/sched/stats.h | 46 + kernel/sched/stop_task.c | 4 +- 9 files changed, 206 insertions(+), 105 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..f8e969be4bee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -462,8 +462,6 @@ struct sched_entity { u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -681,6 +679,7 @@ struct task_struct { unsigned intrt_priority; const struct sched_class*sched_class; + struct sched_statistics stats; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd76628778f7..081b4f1f2cb4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2427,11 +2427,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->stats.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -2443,14 +2443,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + 
__schedstat_inc(p->stats.nr_wakeups_sync); } /* @@ -3075,7 +3075,7 @@ static void __sched_fork(unsigned long clone_flags, struct task
[PATCH 1/6] sched: don't include stats.h in sched.h
This patch is a preparation of the followup patches. In the followup patches some common helpers will be defined in stats.h, and these common helpers require some definitions in sched.h, so let's move stats.h out of sched.h. The source files which require stats.h include it specifically. Signed-off-by: Yafang Shao --- kernel/sched/core.c | 1 + kernel/sched/deadline.c | 1 + kernel/sched/debug.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/idle.c | 1 + kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 6 +- kernel/sched/stats.c | 1 + kernel/sched/stats.h | 2 ++ kernel/sched/stop_task.c | 1 + 10 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..fd76628778f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #undef CREATE_TRACE_POINTS #include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..7a0124f81a4f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ *Fabio Checconi */ #include "sched.h" +#include "stats.h" #include "pelt.h" struct dl_bandwidth def_dl_bandwidth; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2357921580f9..9758aa1bba1e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -7,6 +7,7 @@ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ #include "sched.h" +#include "stats.h" static DEFINE_SPINLOCK(sched_debug_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8917d2d715ef..8ff1daa3d9bb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,7 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "stats.h" /* * Targeted preemption latency for CPU-bound tasks: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..95c02cbca04a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ *tasks which are handled in sched/fair.c ) */ 
#include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..af772ac0f32d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,7 +4,7 @@ * policies) */ #include "sched.h" - +#include "stats.h" #include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..871544bb9a38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,9 @@ /* * Scheduler internal types and methods: */ +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H + #include #include @@ -1538,7 +1541,6 @@ extern void flush_smp_call_function_from_idle(void); static inline void flush_smp_call_function_from_idle(void) { } #endif -#include "stats.h" #include "autogroup.h" #ifdef CONFIG_CGROUP_SCHED @@ -2633,3 +2635,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed..844bd9dbfbf0 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -3,6 +3,7 @@ * /proc/schedstat implementation */ #include "sched.h" +#include "stats.h" /* * Current schedstat API version. diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 33d0daf83842..c23b653ffc53 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,6 +2,8 @@ #ifdef CONFIG_SCHEDSTATS +#include "sched.h" + /* * Expects runqueue lock to be held for atomicity of update */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..a5d289049388 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "stats.h" #ifdef CONFIG_SMP static int -- 2.18.4
[PATCH 0/6] sched: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. This patch enable it for RT sched class as well. - Structure Before we make schedstats available for RT sched class, we should make struct sched_statistics independent of fair sched class first. The struct sched_statistics is the schedular statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. Below is the detailed explaination of the change in the structs. The struct before this patch: struct task_struct {|-> struct sched_entity { ... | ... struct sched_entity *se; ---| struct sched_statistics statistics; struct sched_rt_entity *rt; ... ... ... }; }; struct task_group { |--> se[0]->statistics : schedstats of CPU0 ... | #ifdef CONFIG_FAIR_GROUP_SCHED | struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 | #endif | |--> se[N]->statistics : schedstats of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_rt_entity **rt_se; (N/A) #endif ... }; The '**se' in task_group is allocated in the fair sched class, which is hard to be reused by other sched class. The struct after this patch: struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; After the patch it is clearly that both of se or rt_se can easily get the sched_statistics by a task_struct or a task_group. 
- Function Interface The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing overhead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) - Implementation After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for RT sched class. The schedstat usage in RT sched class is similar to fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end - Usage The user can get the schedstats information in the same way as in fair sched class. For example, fairRT task show /proc/[pid]/sched /proc/[pid]/sched group show cpu.stat in cgroup cpu.stat in cgroup The sched:sched_stat_{wait, sleep, iowait, blocked, runtime} tracepoints can be used to trace RT tasks as well. Detailed examples of the output of schedstats for RT tasks are in patch #6. Changes since RFC: - improvement of schedstats helpers, per Mel. 
- make struct schedstats independent of fair sched class Yafang Shao (6): sched: don't include stats.h in sched.h sched, fair: use __schedstat_set() in set_next_entity() sched: make struct sched_statistics independent of fair sched class sched: make schedstats helpers independent of fair sched class sched, rt: support sched_stat_runtime tracepoint for RT sched class sched, rt: support schedstats for RT sched class include/linux/sched.h| 3 +- kernel/sched/core.c | 25 +++-- kernel/sched/deadline.c | 5 +- kernel/sched/debug.c | 82 +++ kernel/sched/fair.c | 209 +++---
Re: [RFC PATCH v3 0/5] sched: support schedstats for RT sched class
On Sat, Nov 28, 2020 at 12:12 AM Yafang Shao wrote: > > We want to measure the latency of RT tasks in our production > environment with schedstats facility, but currently schedstats is only > supported for fair sched class. This patch enable it for RT sched class > as well. > > - Structure > > Before we make schedstats available for RT sched class, we should make > struct sched_statistics independent of fair sched class first. > > > The struct sched_statistics is the schedular statistics of a task_struct > or a task_group. So we can move it into struct task_struct and > struct task_group to achieve the goal. > > Below is the detailed explaination of the change in the structs. > > The struct before this patch: > > struct task_struct {|-> struct sched_entity { > ... | ... > struct sched_entity *se; ---| struct sched_statistics statistics; > struct sched_rt_entity *rt; ... > ... ... > }; }; > > struct task_group { |--> se[0]->statistics : schedstats of CPU0 > ... | > #ifdef CONFIG_FAIR_GROUP_SCHED | > struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 > | > #endif | > |--> se[N]->statistics : schedstats of CPUn > > #ifdef CONFIG_FAIR_GROUP_SCHED > struct sched_rt_entity **rt_se; (N/A) > #endif > ... > }; > > The '**se' in task_group is allocated in the fair sched class, which is > hard to be reused by other sched class. > > The struct after this patch: > struct task_struct { > ... > struct sched_statistics statistics; > ... > struct sched_entity *se; > struct sched_rt_entity *rt; > ... > }; > > struct task_group {|---> stats[0] : of CPU0 > ...| > struct sched_statistics **stats; --|---> stats[1] : of CPU1 > ...| >|---> stats[n] : of CPUn > #ifdef CONFIG_FAIR_GROUP_SCHED > struct sched_entity **se; > #endif > #ifdef CONFIG_RT_GROUP_SCHED > struct sched_rt_entity **rt_se; > #endif > ... > }; > > After the patch it is clearly that both of se or rt_se can easily get the > sched_statistics by a task_struct or a task_group. 
> > - Function Interface > > The original prototype of the schedstats helpers are > > update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) > > The cfs_rq in these helpers is used to get the rq_clock, and the se is > used to get the struct sched_statistics and the struct task_struct. In > order to make these helpers available by all sched classes, we can pass > the rq, sched_statistics and task_struct directly. > > Then the new helpers are > > update_stats_wait_*(struct rq *rq, struct task_struct *p, > struct sched_statistics *stats) > > which are independent of fair sched class. > > To avoid vmlinux growing too large or introducing ovehead when > !schedstat_enabled(), some new helpers after schedstat_enabled() are also > introduced, Suggested by Mel. These helpers are in sched/stats.c, > > __update_stats_wait_*(struct rq *rq, struct task_struct *p, > struct sched_statistics *stats) > > - Implementation > > After we make the struct sched_statistics and the helpers of it > independent of fair sched class, we can easily use the schedstats > facility for RT sched class. > > The schedstat usage in RT sched class is similar with fair sched class, > for example, > fairRT > enqueue update_stats_enqueue_fair update_stats_enqueue_rt > dequeue update_stats_dequeue_fair update_stats_dequeue_rt > put_prev_task update_stats_wait_start update_stats_wait_start > set_next_task update_stats_wait_end update_stats_wait_end > > - Usage > > The user can get the schedstats information in the same way in fair sched > class. For example, > fairRT > task show /proc/[pid]/sched /proc/[pid]/sched > group show cpu.stat in cgroup cpu.stat in cgroup > > The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can > be used to trace RT tasks as well. sched_stat_runtime can also be > supported in the future if it is helpful. > > Yafang Shao (5): > sched: don't include stats.h in sched.h > sched, fair: use __sched
[RFC PATCH v3 4/5] sched: make schedstats helpers independent of fair sched class
The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing ovehead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) Cc: Mel Gorman Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 140 +++ kernel/sched/stats.c | 104 kernel/sched/stats.h | 32 ++ 3 files changed, 157 insertions(+), 119 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14d8df308d44..b869a83fac29 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -917,69 +917,44 @@ static void update_curr_fair(struct rq *rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; - u64 wait_start, prev_wait_start; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - __schedstat_from_sched_entity(se, ); - - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(stats->wait_start); + if (entity_is_task(se)) + p = task_of(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + __schedstat_from_sched_entity(se, ); - __schedstat_set(stats->wait_start, wait_start); + 
__update_stats_wait_start(rq_of(cfs_rq), p, stats); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 delta; if (!schedstat_enabled()) return; - __schedstat_from_sched_entity(se, ); - - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start); - if (entity_is_task(se)) { + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. -*/ - __schedstat_set(stats->wait_start, delta); - - return; - } - - trace_sched_stat_wait(p, delta); - } + __schedstat_from_sched_entity(se, ); - __schedstat_set(stats->wait_max, - max(schedstat_val(stats->wait_max), delta)); - __schedstat_inc(stats->wait_count); - __schedstat_add(stats->wait_sum, delta); - __schedstat_set(stats->wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats = NULL; struct task_struct *p = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; @@ -989,67 +964,14 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) __schedstat_from_sched_entity(se, ); - sleep_start = schedstat_val(stats->sleep_start); - block_start = schedstat_val(stats->block_start); - - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(stats->sleep_max))) - __schedstat_set(stats->sleep_max, delta); - - __schedstat_set(stats->sleep_start, 0); - __schedstat_add(stats->sum_sleep_runtime, delta); - - if (p) { - 
account_scheduler_latency(p, delta >> 10, 1); - trace_sched_stat_sleep(p, delta)
[RFC PATCH v3 3/5] sched: make struct sched_statistics independent of fair sched class
If we want to use schedstats facility, we should move out of struct sched_statistics from the struct sched_entity or add it into other structs of sched entity as well. Obviously the latter one is bad because that requires more spaces. So we should move it into a common struct which can be used by all sched classes. The struct sched_statistics is the scheduler statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. Below is the detailed explanation of the change in the structs. - Before this patch struct task_struct {|-> struct sched_entity { ... | ... struct sched_entity *se; ---| struct sched_statistics statistics; struct sched_rt_entity *rt; ... ... ... }; }; struct task_group { |--> se[0]->statistics : schedstats of CPU0 ... | #ifdef CONFIG_FAIR_GROUP_SCHED | struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 | #endif | |--> se[N]->statistics : schedstats of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_rt_entity **rt_se; (N/A) #endif ... }; The '**se' in task_group is allocated in the fair sched class, which is hard to be reused by other sched class. - After this patch struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; After the patch it is clear that both of se or rt_se can easily get the sched_statistics by a task_struct or a task_group. 
Signed-off-by: Yafang Shao --- include/linux/sched.h| 3 +- kernel/sched/core.c | 24 --- kernel/sched/deadline.c | 4 +- kernel/sched/debug.c | 81 +++--- kernel/sched/fair.c | 142 ++- kernel/sched/rt.c| 4 +- kernel/sched/sched.h | 3 + kernel/sched/stats.h | 46 + kernel/sched/stop_task.c | 4 +- 9 files changed, 206 insertions(+), 105 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..f8e969be4bee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -462,8 +462,6 @@ struct sched_entity { u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -681,6 +679,7 @@ struct task_struct { unsigned intrt_priority; const struct sched_class*sched_class; + struct sched_statistics stats; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd76628778f7..081b4f1f2cb4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2427,11 +2427,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->stats.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -2443,14 +2443,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + 
__schedstat_inc(p->stats.nr_wakeups_sync); } /* @@ -3075,7 +3075,7 @@ static void __sched_fork(unsigned long clone_flags, struct task
[RFC PATCH v3 2/5] sched, fair: use __schedstat_set() in set_next_entity()
schedstat_enabled() has already been checked, so we can use __schedstat_set() directly. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ff1daa3d9bb..7e7c03cede94 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4414,7 +4414,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, + __schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -- 2.18.4
[RFC PATCH v3 0/5] sched: support schedstats for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. This patch enable it for RT sched class as well. - Structure Before we make schedstats available for RT sched class, we should make struct sched_statistics independent of fair sched class first. The struct sched_statistics is the schedular statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. Below is the detailed explaination of the change in the structs. The struct before this patch: struct task_struct {|-> struct sched_entity { ... | ... struct sched_entity *se; ---| struct sched_statistics statistics; struct sched_rt_entity *rt; ... ... ... }; }; struct task_group { |--> se[0]->statistics : schedstats of CPU0 ... | #ifdef CONFIG_FAIR_GROUP_SCHED | struct sched_entity **se; --|--> se[1]->statistics : schedstats of CPU1 | #endif | |--> se[N]->statistics : schedstats of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_rt_entity **rt_se; (N/A) #endif ... }; The '**se' in task_group is allocated in the fair sched class, which is hard to be reused by other sched class. The struct after this patch: struct task_struct { ... struct sched_statistics statistics; ... struct sched_entity *se; struct sched_rt_entity *rt; ... }; struct task_group {|---> stats[0] : of CPU0 ...| struct sched_statistics **stats; --|---> stats[1] : of CPU1 ...| |---> stats[n] : of CPUn #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity **se; #endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; #endif ... }; After the patch it is clearly that both of se or rt_se can easily get the sched_statistics by a task_struct or a task_group. 
- Function Interface The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing overhead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) - Implementation After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for RT sched class. The schedstat usage in RT sched class is similar to fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end - Usage The user can get the schedstats information in the same way as in fair sched class. For example, fairRT task show /proc/[pid]/sched /proc/[pid]/sched group show cpu.stat in cgroup cpu.stat in cgroup The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can be used to trace RT tasks as well. sched_stat_runtime can also be supported in the future if it is helpful. 
Yafang Shao (5): sched: don't include stats.h in sched.h sched, fair: use __schedstat_set() in set_next_entity() sched: make struct sched_statistics independent of fair sched class sched: make schedstats helpers independent of fair sched class sched, rt: support schedstats for RT sched class include/linux/sched.h| 3 +- kernel/sched/core.c | 25 +++-- kernel/sched/deadline.c | 5 +- kernel/sched/debug.c | 82 +++ kernel/sched/fair.c | 209 +++ kernel/sched/idle.c | 1 + kernel/sched/rt.c| 178 - kernel/sched/sched.h | 13 ++- kernel/sched/stats.c | 105
[RFC PATCH v3 1/5] sched: don't include stats.h in sched.h
This patch is a preparation of the followup patches. In the followup patches some common helpers will be defined in stats.h, and these common helpers require some definitions in sched.h, so let's move stats.h out of sched.h. The source files which require stats.h include it specifically. Signed-off-by: Yafang Shao --- kernel/sched/core.c | 1 + kernel/sched/deadline.c | 1 + kernel/sched/debug.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/idle.c | 1 + kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 6 +- kernel/sched/stats.c | 1 + kernel/sched/stats.h | 2 ++ kernel/sched/stop_task.c | 1 + 10 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..fd76628778f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #undef CREATE_TRACE_POINTS #include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..7a0124f81a4f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ *Fabio Checconi */ #include "sched.h" +#include "stats.h" #include "pelt.h" struct dl_bandwidth def_dl_bandwidth; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2357921580f9..9758aa1bba1e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -7,6 +7,7 @@ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ #include "sched.h" +#include "stats.h" static DEFINE_SPINLOCK(sched_debug_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8917d2d715ef..8ff1daa3d9bb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,7 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "stats.h" /* * Targeted preemption latency for CPU-bound tasks: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..95c02cbca04a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ *tasks which are handled in sched/fair.c ) */ 
#include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..af772ac0f32d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,7 +4,7 @@ * policies) */ #include "sched.h" - +#include "stats.h" #include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..871544bb9a38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,9 @@ /* * Scheduler internal types and methods: */ +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H + #include #include @@ -1538,7 +1541,6 @@ extern void flush_smp_call_function_from_idle(void); static inline void flush_smp_call_function_from_idle(void) { } #endif -#include "stats.h" #include "autogroup.h" #ifdef CONFIG_CGROUP_SCHED @@ -2633,3 +2635,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed..844bd9dbfbf0 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -3,6 +3,7 @@ * /proc/schedstat implementation */ #include "sched.h" +#include "stats.h" /* * Current schedstat API version. diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 33d0daf83842..c23b653ffc53 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,6 +2,8 @@ #ifdef CONFIG_SCHEDSTATS +#include "sched.h" + /* * Expects runqueue lock to be held for atomicity of update */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..a5d289049388 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "stats.h" #ifdef CONFIG_SMP static int -- 2.18.4
Re: [RFC PATCH v2 3/5] sched: make schedstats helper independent of cfs_rq
On Tue, Nov 24, 2020 at 7:40 PM Mel Gorman wrote: > > On Mon, Nov 23, 2020 at 08:58:06PM +0800, Yafang Shao wrote: > > The 'cfs_rq' in these helpers > > update_stats_{wait_start, wait_end, enqueue_sleeper} is only used to get > > the rq_clock, so we can pass the rq directly. Then these helpers can be > > used by all sched class after being moved into stats.h. > > > > After that change, the size of vmlinux is increased around 824Bytes. > > w/o this patch, with this patch > > Size of vmlinux: 7844383278444656 > > > > Signed-off-by: Yafang Shao > > The inline helpers are quite large. When I was suggesting that the overhead > was minimal, what I expected what that the inline functions would be a > schedstat_enabled() followed by a real function call. It would introduce > a small additional overhead when schedstats are enabled but avoid vmlinux > growing too large > > e.g. > > static inline void > update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) > { > if (!schedstat_enabled()) > return; > > __update_stats_wait_start(cfs_rq, se); > } > > where __update_stats_wait_start then lives in kernel/sched/stats.c > Good idea! Now I understand what you mean. Thanks for the detailed explanation. I will update it in the next version. -- Thanks Yafang
[RFC PATCH v2 1/5] sched: don't include stats.h in sched.h
This patch is a preparation of the followup patches. In the followup patches some common helpers will be defined in stats.h, and these common helpers require some definitions in sched.h, so let's move stats.h out of sched.h. The source files which require stats.h include it specifically. Signed-off-by: Yafang Shao --- kernel/sched/core.c | 1 + kernel/sched/deadline.c | 1 + kernel/sched/debug.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/idle.c | 1 + kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 6 +- kernel/sched/stats.c | 1 + kernel/sched/stats.h | 2 ++ kernel/sched/stop_task.c | 1 + 10 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..fd76628778f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #undef CREATE_TRACE_POINTS #include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..7a0124f81a4f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ *Fabio Checconi */ #include "sched.h" +#include "stats.h" #include "pelt.h" struct dl_bandwidth def_dl_bandwidth; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2357921580f9..9758aa1bba1e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -7,6 +7,7 @@ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ #include "sched.h" +#include "stats.h" static DEFINE_SPINLOCK(sched_debug_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8917d2d715ef..8ff1daa3d9bb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,7 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "stats.h" /* * Targeted preemption latency for CPU-bound tasks: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..95c02cbca04a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ *tasks which are handled in sched/fair.c ) */ 
#include "sched.h" +#include "stats.h" #include diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..af772ac0f32d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,7 +4,7 @@ * policies) */ #include "sched.h" - +#include "stats.h" #include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..871544bb9a38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,9 @@ /* * Scheduler internal types and methods: */ +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H + #include #include @@ -1538,7 +1541,6 @@ extern void flush_smp_call_function_from_idle(void); static inline void flush_smp_call_function_from_idle(void) { } #endif -#include "stats.h" #include "autogroup.h" #ifdef CONFIG_CGROUP_SCHED @@ -2633,3 +2635,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed..844bd9dbfbf0 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -3,6 +3,7 @@ * /proc/schedstat implementation */ #include "sched.h" +#include "stats.h" /* * Current schedstat API version. diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 33d0daf83842..c23b653ffc53 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,6 +2,8 @@ #ifdef CONFIG_SCHEDSTATS +#include "sched.h" + /* * Expects runqueue lock to be held for atomicity of update */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..a5d289049388 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "stats.h" #ifdef CONFIG_SMP static int -- 2.18.4
[RFC PATCH v2 4/5] sched: define update_stats_curr_start() as a common helper
update_stats_curr_start() is used to update the exec_start when we are starting a new run period, which is used by all sched class. So we'd better define it as a common helper. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 14 +- kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 12 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 946b60f586e4..13e803369ced 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -915,18 +915,6 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } -/* - * We are picking a new current task - update its stats: - */ -static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* -* We are starting a new run period: -*/ - se->exec_start = rq_clock_task(rq_of(cfs_rq)); -} - /** * Scheduling class queueing methods: */ @@ -4255,7 +4243,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); } - update_stats_curr_start(cfs_rq, se); + update_stats_curr_start(rq_of(cfs_rq), se); cfs_rq->curr = se; /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index af772ac0f32d..3422dd85cfb4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1574,7 +1574,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { - p->se.exec_start = rq_clock_task(rq); + update_stats_curr_start(rq, >se); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a4576ccf3d7..3948112dc31c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2644,4 +2644,16 @@ static inline struct task_struct *task_of(struct sched_entity *se) return container_of(se, struct task_struct, se); } +/* + * We are picking a new current task - update its stats: + */ +static inline void 
+update_stats_curr_start(struct rq *rq, struct sched_entity *se) +{ + /* +* We are starting a new run period: +*/ + se->exec_start = rq_clock_task(rq); +} + #endif /* _KERNEL_SCHED_SCHED_H */ -- 2.18.4
[RFC PATCH v2 0/5] sched: support schedstat for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstat facility, but currently schedstat is only supported for fair sched class. This patchset enables it for RT sched class as well. The schedstat statistics are defined in struct sched_entity, which is a member of struct task_struct, so we can reuse it for RT sched class. The schedstat usage in RT sched class is similar to fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end show/proc/[pid]/sched /proc/[pid]/sched The sched:sched_stats_* tracepoints can be used to trace RT tasks as well after this patchset. PATCH #1 ~ #4 are the preparation for PATCH #5. - v2: keep the schedstats functions inline, per Mel. Yafang Shao (5): sched: don't include stats.h in sched.h sched: define task_of() as a common helper sched: make schedstats helper independent of cfs_rq sched: define update_stats_curr_start() as a common helper sched, rt: support schedstat for RT sched class kernel/sched/core.c | 1 + kernel/sched/deadline.c | 1 + kernel/sched/debug.c | 1 + kernel/sched/fair.c | 174 ++- kernel/sched/idle.c | 1 + kernel/sched/rt.c| 94 - kernel/sched/sched.h | 30 ++- kernel/sched/stats.c | 1 + kernel/sched/stats.h | 146 kernel/sched/stop_task.c | 1 + 10 files changed, 280 insertions(+), 170 deletions(-) -- 2.18.4
[RFC PATCH v2 2/5] sched: define task_of() as a common helper
task_of() is used to get task_struct from sched_entity. As sched_entity in struct task_struct can be used by all sched class, we'd better move this macro into sched.h, then it can be used by all sched class. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 11 --- kernel/sched/sched.h | 8 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ff1daa3d9bb..59e454cae3be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -259,12 +259,6 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} - /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) @@ -446,11 +440,6 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 871544bb9a38..9a4576ccf3d7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2636,4 +2636,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + SCHED_WARN_ON(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + #endif /* _KERNEL_SCHED_SCHED_H */ -- 2.18.4
[RFC PATCH v2 3/5] sched: make schedstats helper independent of cfs_rq
The 'cfs_rq' in these helpers update_stats_{wait_start, wait_end, enqueue_sleeper} is only used to get the rq_clock, so we can pass the rq directly. Then these helpers can be used by all sched class after being moved into stats.h. After that change, the size of vmlinux is increased around 824Bytes. w/o this patch, with this patch Size of vmlinux:7844383278444656 Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 148 ++- kernel/sched/stats.h | 144 + 2 files changed, 149 insertions(+), 143 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 59e454cae3be..946b60f586e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -869,124 +869,6 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(>curr->se)); } -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - u64 wait_start, prev_wait_start; - - if (!schedstat_enabled()) - return; - - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; - - __schedstat_set(se->statistics.wait_start, wait_start); -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct task_struct *p; - u64 delta; - - if (!schedstat_enabled()) - return; - - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se)) { - p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. 
-*/ - __schedstat_set(se->statistics.wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); -} - -static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct task_struct *tsk = NULL; - u64 sleep_start, block_start; - - if (!schedstat_enabled()) - return; - - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); - - if (entity_is_task(se)) - tsk = task_of(se); - - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); - - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); - - __schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - __schedstat_inc(se->statistics.iowait_count); - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* -* Blocking time is in units of nanosecs, so shift by -* 20 to get a milliseconds-range estimation of the -* amount of time that the task spent sleeping: -*/ -
[RFC PATCH v2 5/5] sched, rt: support schedstat for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstat facility, but currently schedstat is only supported for fair sched class. This patch enable it for RT sched class as well. The schedstat statistics are define in struct sched_entity, which is a member of struct task_struct, so we can resue it for RT sched class. The schedstat usage in RT sched class is similar with fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end show/proc/[pid]/sched /proc/[pid]/sched The sched:sched_stats_* tracepoints can be used to trace RT tasks as well. Signed-off-by: Yafang Shao --- kernel/sched/rt.c| 90 kernel/sched/sched.h | 4 ++ 2 files changed, 94 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3422dd85cfb4..f2eff92275f0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1246,6 +1246,75 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } +#ifdef CONFIG_SCHEDSTATS + +static inline bool +rt_se_is_waiting(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + return rt_se != rt_rq->curr; +} + +static inline void +rt_rq_curr_set(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + rt_rq->curr = rt_se; +} + +#else + +static inline bool +rt_se_is_waiting(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + return false; +} + +static inline void +rt_rq_curr_set(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ +} + +#endif + +static inline void +update_stats_enqueue_rt(struct rq *rq, struct sched_entity *se, + struct sched_rt_entity *rt_se, int flags) +{ + struct rt_rq *rt_rq = >rt; + + if (!schedstat_enabled()) + return; + + if (rt_se_is_waiting(rt_rq, rt_se)) + update_stats_wait_start(rq, se); + + if (flags & ENQUEUE_WAKEUP) + 
update_stats_enqueue_sleeper(rq, se); +} + +static inline void +update_stats_dequeue_rt(struct rq *rq, struct sched_entity *se, + struct sched_rt_entity *rt_se, int flags) +{ + struct rt_rq *rt_rq = >rt; + + if (!schedstat_enabled()) + return; + + if (rt_se_is_waiting(rt_rq, rt_se)) + update_stats_wait_end(rq, se); + + if ((flags & DEQUEUE_SLEEP) && rt_entity_is_task(rt_se)) { + struct task_struct *tsk = rt_task_of(rt_se); + + if (tsk->state & TASK_INTERRUPTIBLE) + __schedstat_set(se->statistics.sleep_start, + rq_clock(rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + __schedstat_set(se->statistics.block_start, + rq_clock(rq)); + } +} + /* * Change rt_se->run_list location unless SAVE && !MOVE * @@ -1275,6 +1344,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag struct rt_prio_array *array = _rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); struct list_head *queue = array->queue + rt_se_prio(rt_se); + struct task_struct *task = rt_task_of(rt_se); /* * Don't enqueue the group if its throttled, or when empty. 
@@ -1288,6 +1358,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag return; } + update_stats_enqueue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, flags); + if (move_entity(flags)) { WARN_ON_ONCE(rt_se->on_list); if (flags & ENQUEUE_HEAD) @@ -1307,7 +1379,9 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = _rq->active; + struct task_struct *task = rt_task_of(rt_se); + update_stats_dequeue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, flags); if (move_entity(flags)) { WARN_ON_ONCE(!rt_se->on_list); __delist_rt_entity(rt_se, array); @@ -1374,6 +1448,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; + check_schedstat_required(); enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1574,6 +1649,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_
Re: [RFC PATCH 4/4] sched, rt: support schedstat for RT sched class
On Fri, Nov 20, 2020 at 10:39 AM jun qian wrote: > > Yafang Shao 于2020年11月19日周四 上午11:55写道: > > > > We want to measure the latency of RT tasks in our production > > environment with schedstat facility, but currently schedstat is only > > supported for fair sched class. This patch enable it for RT sched class > > as well. > > > > The schedstat statistics are define in struct sched_entity, which is a > > member of struct task_struct, so we can resue it for RT sched class. > > > > The schedstat usage in RT sched class is similar with fair sched class, > > for example, > > fairRT > > enqueue update_stats_enqueue_fair update_stats_enqueue_rt > > dequeue update_stats_dequeue_fair update_stats_dequeue_rt > > put_prev_task update_stats_wait_start update_stats_wait_start > > set_next_task update_stats_wait_end update_stats_wait_end > > show/proc/[pid]/sched /proc/[pid]/sched > > > > The sched:sched_stats_* tracepoints can be used to trace RT tasks as > > well. > > > > Signed-off-by: Yafang Shao > > --- > > kernel/sched/rt.c| 61 > > kernel/sched/sched.h | 2 ++ > > 2 files changed, 63 insertions(+) > > > > diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c > > index b9ec886702a1..a318236b7166 100644 > > --- a/kernel/sched/rt.c > > +++ b/kernel/sched/rt.c > > @@ -1246,6 +1246,46 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, > > struct rt_rq *rt_rq) > > dec_rt_group(rt_se, rt_rq); > > } > > > > Does the deadline schedule class should be considered also? > deadline sched class can be supported as well per my understanding, I think we can do it later. This patchset only aims to support RT sched class. 
> thanks > > > +static inline void > > +update_stats_enqueue_rt(struct rq *rq, struct sched_entity *se, > > + struct sched_rt_entity *rt_se, int flags) > > +{ > > + struct rt_rq *rt_rq = >rt; > > + > > + if (!schedstat_enabled()) > > + return; > > + > > + if (rt_se != rt_rq->curr) > > + update_stats_wait_start(rq, se); > > + > > + if (flags & ENQUEUE_WAKEUP) > > + update_stats_enqueue_sleeper(rq, se); > > +} > > + > > +static inline void > > +update_stats_dequeue_rt(struct rq *rq, struct sched_entity *se, > > + struct sched_rt_entity *rt_se, int flags) > > +{ > > + struct rt_rq *rt_rq = >rt; > > + > > + if (!schedstat_enabled()) > > + return; > > + > > + if (rt_se != rt_rq->curr) > > + update_stats_wait_end(rq, se); > > + > > + if ((flags & DEQUEUE_SLEEP) && rt_entity_is_task(rt_se)) { > > + struct task_struct *tsk = rt_task_of(rt_se); > > + > > + if (tsk->state & TASK_INTERRUPTIBLE) > > + __schedstat_set(se->statistics.sleep_start, > > + rq_clock(rq)); > > + if (tsk->state & TASK_UNINTERRUPTIBLE) > > + __schedstat_set(se->statistics.block_start, > > + rq_clock(rq)); > > + } > > +} > > + > > /* > > * Change rt_se->run_list location unless SAVE && !MOVE > > * > > @@ -1275,6 +1315,7 @@ static void __enqueue_rt_entity(struct > > sched_rt_entity *rt_se, unsigned int flag > > struct rt_prio_array *array = _rq->active; > > struct rt_rq *group_rq = group_rt_rq(rt_se); > > struct list_head *queue = array->queue + rt_se_prio(rt_se); > > + struct task_struct *task = rt_task_of(rt_se); > > > > /* > > * Don't enqueue the group if its throttled, or when empty. 
> > @@ -1288,6 +1329,8 @@ static void __enqueue_rt_entity(struct > > sched_rt_entity *rt_se, unsigned int flag > > return; > > } > > > > + update_stats_enqueue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, > > flags); > > + > > if (move_entity(flags)) { > > WARN_ON_ONCE(rt_se->on_list); > > if (flags & ENQUEUE_HEAD) > > @@ -1307,7 +1350,9 @@ static void __dequeue_rt_entity(struct > > sched_rt_entity *rt_se, unsigned int flag > > { > >
Re: [RFC PATCH 2/4] sched: make schedstats helpers not depend on cfs_rq
On Thu, Nov 19, 2020 at 3:45 PM Mel Gorman wrote: > > On Thu, Nov 19, 2020 at 11:52:28AM +0800, Yafang Shao wrote: > > The 'cfs_rq' in these helpers is only used to get the rq_clock, so we > > can pass the rq_clock directly. After that, these helpers can be used by > > all sched class. > > > > Signed-off-by: Yafang Shao > > This introduces overhead in the general case even when schedstats is > disabled. Previously, update_stats_wait_start was a static inline so > function call overhead was avoided and schedstat_enabled() meant the > overhead was negligible. As it's now a function call, the cost of the > function entry/exit will be unconditionally hit regardless of intrest > in schedstat. > > Regardless of the merit of adding schedstats for RT, the overhead of > schedstats when stats are disabled should remain the same with the > static branch check done in an inline function. > Thanks for the explanation. I will make them inline in the next version. -- Thanks Yafang
[RFC PATCH 2/4] sched: make schedstats helpers not depend on cfs_rq
The 'cfs_rq' in these helpers is only used to get the rq_clock, so we can pass the rq_clock directly. After that, these helpers can be used by all sched class. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 148 ++- kernel/sched/stats.c | 134 +++ kernel/sched/stats.h | 11 3 files changed, 150 insertions(+), 143 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9d73e8e5ebec..aba21191283d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -868,124 +868,6 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(>curr->se)); } -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - u64 wait_start, prev_wait_start; - - if (!schedstat_enabled()) - return; - - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; - - __schedstat_set(se->statistics.wait_start, wait_start); -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct task_struct *p; - u64 delta; - - if (!schedstat_enabled()) - return; - - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se)) { - p = task_of(se); - if (task_on_rq_migrating(p)) { - /* -* Preserve migrating task's wait time so wait_start -* time stamp can be adjusted to accumulate wait time -* prior to migration. 
-*/ - __schedstat_set(se->statistics.wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); -} - -static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct task_struct *tsk = NULL; - u64 sleep_start, block_start; - - if (!schedstat_enabled()) - return; - - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); - - if (entity_is_task(se)) - tsk = task_of(se); - - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); - - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); - - __schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - __schedstat_inc(se->statistics.iowait_count); - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* -* Blocking time is in units of nanosecs, so shift by -* 20 to get a milliseconds-range estimation of the -* amount of time that the task spent sleeping: -*/ - if (unlikely(prof_on == SLEEP_PROFILING)) { - 
profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), -
[RFC PATCH 3/4] sched: define update_stats_curr_start() as a common helper
update_stats_curr_start() is used to update the exec_start when we are starting a new run period, which is used by all sched class. So we'd better define it as a common helper. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 14 +- kernel/sched/rt.c| 2 +- kernel/sched/sched.h | 12 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aba21191283d..b762cc3e165c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -914,18 +914,6 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } -/* - * We are picking a new current task - update its stats: - */ -static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* -* We are starting a new run period: -*/ - se->exec_start = rq_clock_task(rq_of(cfs_rq)); -} - /** * Scheduling class queueing methods: */ @@ -4254,7 +4242,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); } - update_stats_curr_start(cfs_rq, se); + update_stats_curr_start(rq_of(cfs_rq), se); cfs_rq->curr = se; /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..b9ec886702a1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1574,7 +1574,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { - p->se.exec_start = rq_clock_task(rq); + update_stats_curr_start(rq, >se); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e29f2375c4f5..28986736ced9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2642,3 +2642,15 @@ static inline struct task_struct *task_of(struct sched_entity *se) return container_of(se, struct task_struct, se); } +/* + * We are picking a new current task - update its stats: + */ +static inline void 
+update_stats_curr_start(struct rq *rq, struct sched_entity *se) +{ + /* +* We are starting a new run period: +*/ + se->exec_start = rq_clock_task(rq); +} + -- 2.18.4
[RFC PATCH 4/4] sched, rt: support schedstat for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstat facility, but currently schedstat is only supported for fair sched class. This patch enable it for RT sched class as well. The schedstat statistics are define in struct sched_entity, which is a member of struct task_struct, so we can resue it for RT sched class. The schedstat usage in RT sched class is similar with fair sched class, for example, fairRT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start set_next_task update_stats_wait_end update_stats_wait_end show/proc/[pid]/sched /proc/[pid]/sched The sched:sched_stats_* tracepoints can be used to trace RT tasks as well. Signed-off-by: Yafang Shao --- kernel/sched/rt.c| 61 kernel/sched/sched.h | 2 ++ 2 files changed, 63 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b9ec886702a1..a318236b7166 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1246,6 +1246,46 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } +static inline void +update_stats_enqueue_rt(struct rq *rq, struct sched_entity *se, + struct sched_rt_entity *rt_se, int flags) +{ + struct rt_rq *rt_rq = >rt; + + if (!schedstat_enabled()) + return; + + if (rt_se != rt_rq->curr) + update_stats_wait_start(rq, se); + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper(rq, se); +} + +static inline void +update_stats_dequeue_rt(struct rq *rq, struct sched_entity *se, + struct sched_rt_entity *rt_se, int flags) +{ + struct rt_rq *rt_rq = >rt; + + if (!schedstat_enabled()) + return; + + if (rt_se != rt_rq->curr) + update_stats_wait_end(rq, se); + + if ((flags & DEQUEUE_SLEEP) && rt_entity_is_task(rt_se)) { + struct task_struct *tsk = rt_task_of(rt_se); + + if (tsk->state & TASK_INTERRUPTIBLE) + __schedstat_set(se->statistics.sleep_start, + rq_clock(rq)); + if 
(tsk->state & TASK_UNINTERRUPTIBLE) + __schedstat_set(se->statistics.block_start, + rq_clock(rq)); + } +} + /* * Change rt_se->run_list location unless SAVE && !MOVE * @@ -1275,6 +1315,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag struct rt_prio_array *array = _rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); struct list_head *queue = array->queue + rt_se_prio(rt_se); + struct task_struct *task = rt_task_of(rt_se); /* * Don't enqueue the group if its throttled, or when empty. @@ -1288,6 +1329,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag return; } + update_stats_enqueue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, flags); + if (move_entity(flags)) { WARN_ON_ONCE(rt_se->on_list); if (flags & ENQUEUE_HEAD) @@ -1307,7 +1350,9 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = _rq->active; + struct task_struct *task = rt_task_of(rt_se); + update_stats_dequeue_rt(rq_of_rt_rq(rt_rq), >se, rt_se, flags); if (move_entity(flags)) { WARN_ON_ONCE(!rt_se->on_list); __delist_rt_entity(rt_se, array); @@ -1374,6 +1419,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; + check_schedstat_required(); enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1574,6 +1620,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { + struct sched_rt_entity *rt_se = >rt; + struct rt_rq *rt_rq = >rt; + + if (on_rt_rq(>rt)) + update_stats_wait_end(rq, >se); + update_stats_curr_start(rq, >se); /* The running task is never eligible for pushing */ @@ -1591,6 +1643,8 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); 
rt_queue_push_tasks(rq); +
[RFC PATCH 1/4] sched: define task_of() as a common helper
task_of() is used to get task_struct from sched_entity. As sched_entity in struct task_struct can be used by all sched class, we'd better move this macro into sched.h, then it can be used by all sched class. Signed-off-by: Yafang Shao --- kernel/sched/fair.c | 11 --- kernel/sched/sched.h | 9 + 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8917d2d715ef..9d73e8e5ebec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -258,12 +258,6 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} - /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) @@ -445,11 +439,6 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..e29f2375c4f5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2633,3 +2633,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + SCHED_WARN_ON(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + -- 2.18.4
[RFC PATCH 0/4] sched: support schedstat for RT sched class
We want to measure the latency of RT tasks in our production environment with schedstat facility, but currently schedstat is only supported for fair sched class. This patchset enables it for RT sched class as well. The schedstat statistics are defined in struct sched_entity, which is a member of struct task_struct, so we can reuse it for RT sched class. The schedstat usage in RT sched class is similar to fair sched class, for example, | fair | RT enqueue | update_stats_enqueue_fair | update_stats_enqueue_rt dequeue | update_stats_dequeue_fair | update_stats_dequeue_rt put_prev_task | update_stats_wait_start | update_stats_wait_start set_next_task | update_stats_wait_end | update_stats_wait_end show | /proc/[pid]/sched | /proc/[pid]/sched The sched:sched_stats_* tracepoints can be used to trace RT tasks as well after that patchset. PATCH #1 ~ #3 are the preparation of PATCH #4. Yafang Shao (4): sched: define task_of() as a common helper sched: make schedstats helpers not depend on cfs_rq sched: define update_stats_curr_start() as a common helper sched, rt: support schedstat for RT sched class kernel/sched/fair.c | 173 ++- kernel/sched/rt.c| 63 +++- kernel/sched/sched.h | 23 ++ kernel/sched/stats.c | 134 + kernel/sched/stats.h | 11 +++ 5 files changed, 236 insertions(+), 168 deletions(-) -- 2.18.4
Re: [PATCH 1/1] Sched/fair: Improve the accuracy of sched_stat_wait statistics
On Wed, Oct 14, 2020 at 9:19 PM Peter Zijlstra wrote: > > On Fri, Oct 09, 2020 at 05:25:30PM +0800, qianjun.ker...@gmail.com wrote: > > From: jun qian > > > > When the sched_schedstat changes from 0 to 1, some sched se maybe > > already in the runqueue, the se->statistics.wait_start will be 0. > > So it will let the (rq_of(cfs_rq)) - se->statistics.wait_start) > > wrong. We need to avoid this scenario. > > > > Signed-off-by: jun qian > > Signed-off-by: Yafang Shao > > This SoB chain isn't valid. Did Yafang's tag need to a reviewed-by or > something? > This patch improves the behavior when sched_schedstat is changed from 0 to 1, so it looks good to me. Reviewed-by: Yafang Shao > > --- > > kernel/sched/fair.c | 9 + > > 1 file changed, 9 insertions(+) > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > > index 1a68a05..6f8ca0c 100644 > > --- a/kernel/sched/fair.c > > +++ b/kernel/sched/fair.c > > @@ -906,6 +906,15 @@ static void update_curr_fair(struct rq *rq) > > if (!schedstat_enabled()) > > return; > > > > + /* > > + * When the sched_schedstat changes from 0 to 1, some sched se > > + * maybe already in the runqueue, the se->statistics.wait_start > > + * will be 0.So it will let the delta wrong. We need to avoid this > > + * scenario. > > + */ > > + if (unlikely(!schedstat_val(se->statistics.wait_start))) > > + return; > > + > > delta = rq_clock(rq_of(cfs_rq)) - > > schedstat_val(se->statistics.wait_start); > > > > if (entity_is_task(se)) { > > -- > > 1.8.3.1 > > -- Thanks Yafang
Re: [PATCH] tracing: add tgid into common field
On Tue, Oct 13, 2020 at 9:05 PM Steven Rostedt wrote: > > On Tue, 13 Oct 2020 13:54:54 +0800 > Yafang Shao wrote: > > > --- a/include/linux/trace_events.h > > +++ b/include/linux/trace_events.h > > @@ -67,6 +67,7 @@ struct trace_entry { > > unsigned char flags; > > unsigned char preempt_count; > > int pid; > > + int tgid; > > }; > > > > Sorry, this adds another 4 bytes to *every* event, if you want it to or > not. I'd rather have some ways to remove fields from this header, and not > add more to it. > > I can't take this patch. > Right, that is an overhead. I will think about some ways to avoid adding it. -- Thanks Yafang
[PATCH] tracing: add tgid into common field
Sometimes we want to trace a specific multi-threaded process, which may create threads dynamically. Currently it is not easy to trace all its threads, because we can only filter these threads out by using common_pid. This patch adds the tgid into the common field as well, with which we can easily filter this multi-threaded process out. E.g. $ cd /sys/kernel/debug/tracing $ echo 'common_tgid == 4054' > events/sched/sched_wakeup/filter $ cat trace_pipe python-4057[005] d... 48003.898560: sched_wakeup: comm=python pid=4054 prio=120 target_cpu=002 python-4054[002] dNs. 48003.932906: sched_wakeup: comm=kworker/2:2 pid=130 prio=120 target_cpu=002 python-4054[002] dNH. 48003.932907: sched_wakeup: comm=cat pid=4084 prio=120 target_cpu=004 python-4055[003] d... 48004.816596: sched_wakeup: comm=python pid=4054 prio=120 target_cpu=002 With record-tgid set into trace_options, we can show the tgid, $ echo record-tgid > trace_options $ cat trace_pipe python-4054( 4054) [002] d... 48166.611771: sched_wakeup: comm=python pid=4055 prio=120 target_cpu=004 python-4057( 4054) [005] d... 48166.611776: sched_wakeup: comm=python pid=4054 prio=120 target_cpu=002 python-4055( 4054) [004] d... 48166.611848: sched_wakeup: comm=python pid=4054 prio=120 target_cpu=002 After that change, tgid_map is only used by saved_tgid, which may be used by some user tools, so I just keep it as-is. 
Signed-off-by: Yafang Shao --- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_events.c | 1 + kernel/trace/trace_output.c | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 5c6943354049..3725c05f0b01 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -67,6 +67,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + int tgid; }; #define TRACE_EVENT_TYPE_MAX \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3e5de717df2..c2423efaac2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2445,6 +2445,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, entry->preempt_count= pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->type = type; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a85effb2373b..9a5adcecf245 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -182,6 +182,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(int, tgid); return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 000e9dc224c6..e04dd45267c7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -591,7 +591,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { - unsigned int tgid = trace_find_tgid(entry->pid); + unsigned int tgid = entry->tgid; if (!tgid) trace_seq_printf(s, "(---) "); -- 2.17.1
Re: [PATCH v3 2/2] mmap_lock: add tracepoints around lock acquisition
On Sat, Oct 10, 2020 at 6:05 AM Axel Rasmussen wrote: > > The goal of these tracepoints is to be able to debug lock contention > issues. This lock is acquired on most (all?) mmap / munmap / page fault > operations, so a multi-threaded process which does a lot of these can > experience significant contention. > > We trace just before we start acquisition, when the acquisition returns > (whether it succeeded or not), and when the lock is released (or > downgraded). The events are broken out by lock type (read / write). > > The events are also broken out by memcg path. For container-based > workloads, users often think of several processes in a memcg as a single > logical "task", so collecting statistics at this level is useful. > > The end goal is to get latency information. This isn't directly included > in the trace events. Instead, users are expected to compute the time > between "start locking" and "acquire returned", using e.g. synthetic > events or BPF. The benefit we get from this is simpler code. > > Because we use tracepoint_enabled() to decide whether or not to trace, > this patch has effectively no overhead unless tracepoints are enabled at > runtime. If tracepoints are enabled, there is a performance impact, but > how much depends on exactly what e.g. the BPF program does. 
> > Signed-off-by: Axel Rasmussen Acked-by: Yafang Shao > --- > include/linux/mmap_lock.h| 93 ++-- > include/trace/events/mmap_lock.h | 70 > mm/Makefile | 2 +- > mm/mmap_lock.c | 87 ++ > 4 files changed, 246 insertions(+), 6 deletions(-) > create mode 100644 include/trace/events/mmap_lock.h > create mode 100644 mm/mmap_lock.c > > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > index 0707671851a8..6586b42b4faa 100644 > --- a/include/linux/mmap_lock.h > +++ b/include/linux/mmap_lock.h > @@ -1,11 +1,63 @@ > #ifndef _LINUX_MMAP_LOCK_H > #define _LINUX_MMAP_LOCK_H > > +#include > +#include > #include > +#include > +#include > +#include > > #define MMAP_LOCK_INITIALIZER(name) \ > .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), > > +DECLARE_TRACEPOINT(mmap_lock_start_locking); > +DECLARE_TRACEPOINT(mmap_lock_acquire_returned); > +DECLARE_TRACEPOINT(mmap_lock_released); > + > +#ifdef CONFIG_TRACING > + > +void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); > +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, > + bool success); > +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); > + > +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, > + bool write) > +{ > + if (tracepoint_enabled(mmap_lock_start_locking)) > + __mmap_lock_do_trace_start_locking(mm, write); > +} > + > +static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, > + bool write, bool > success) > +{ > + if (tracepoint_enabled(mmap_lock_acquire_returned)) > + __mmap_lock_do_trace_acquire_returned(mm, write, success); > +} > + > +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool > write) > +{ > + if (tracepoint_enabled(mmap_lock_released)) > + __mmap_lock_do_trace_released(mm, write); > +} > + > +#else /* !CONFIG_TRACING */ > + > +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, > + bool write) > +{ > +} > +static inline void 
__mmap_lock_trace_acquire_returned(struct mm_struct *mm, > + bool write, bool > success) > +{ > +} > +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool > write) > +{ > +} > + > +#endif /* CONFIG_TRACING */ > + > static inline void mmap_init_lock(struct mm_struct *mm) > { > init_rwsem(>mmap_lock); > @@ -13,58 +65,88 @@ static inline void mmap_init_lock(struct mm_struct *mm) > > static inline void mmap_write_lock(struct mm_struct *mm) > { > + __mmap_lock_trace_start_locking(mm, true); > down_write(>mmap_lock); > + __mmap_lock_trace_acquire_returned(mm, true, true); > } > > static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) > { > + __mmap_lock_trace_start_locking(mm, true); > down_write_nested(>mmap_lock, subclass); > +
Re: [PATCH] mmap_lock: add tracepoints around lock acquisition
On Thu, Sep 24, 2020 at 12:09 AM Steven Rostedt wrote: > > On Wed, 23 Sep 2020 18:04:17 +0800 > Yafang Shao wrote: > > > > What you can do, and what we have done is the following: > > > > > > (see include/linux/page_ref.h) > > > > > > > > > #ifdef CONFIG_TRACING > > > extern struct tracepoint __tracepoint_mmap_lock_start_locking; > > > extern struct tracepoint __tracepoint_mmap_lock_acquire_returned; > > > > > > #define mmap_lock_tracepoint_active(t) > > > static_key_false(&(__tracepoint_mmap_lock_##t).key) > > > > > > #else > > > #define mmap_lock_tracepoint_active(t) false > > > #endif > > > > > > static inline void mmap_write_lock(struct mm_struct *mm) > > > { > > > if (mmap_lock_tracepoint_active(start_locking)) > > > mmap_lock_start_trace_wrapper(); > > > down_write(>mmap_lock); > > > if (mmap_lock_tracepoint_active(acquire_returned)) > > > mmap_lock_acquire_trace_wrapper(); > > > } > > > > > > > > > -- Steve > > > > > > Great! > > > > Thanks Steve. > > If the above becomes useful, I may just add helper functions into a > header file that you can include. Perhaps call it: tracepoint_active() > and you need to pass in as an argument the name of the tracepoint. > Yes, please. The new helper would be useful. -- Thanks Yafang
Re: [PATCH] mmap_lock: add tracepoints around lock acquisition
On Wed, Sep 23, 2020 at 12:51 AM Steven Rostedt wrote: > > On Tue, 22 Sep 2020 12:09:19 +0800 > Yafang Shao wrote: > > > > > Are there any methods to avoid un-inlining these wrappers ? > > > > > > > > For example, > > > > // include/linux/mmap_lock.h > > > > > > > > void mmap_lock_start_trace_wrapper(); > > > > void mmap_lock_acquire_trace_wrapper(); > > > > > > > > static inline void mmap_write_lock(struct mm_struct *mm) > > > > { > > > > mmap_lock_start_trace_wrapper(); > > > > down_write(>mmap_lock); > > > > mmap_lock_acquire_trace_wrapper(); > > > > } > > > > > > > > // mm/mmap_lock.c > > > > void mmap_lock_start_trace_wrapper() > > > > { > > > > trace_mmap_lock_start(); > > > > } > > > > > > > > void mmap_lock_start_trace_wrapper() > > > > { > > > > trace_mmap_lock_acquired(); > > > > } > > > > > > We can do something like that, but I don't think it would end up being > > > better. > > > > > > At the end of the day, because the trace stuff cannot be in the > > > header, we have to add an extra function call one way or the other. > > > This would just move the call one step further down the call stack. > > > So, I don't think it would affect performance in the > > > CONFIG_MMAP_LOCK_STATS + tracepoints not enabled at runtime case. > > > > > > > Right, it seems we have to add an extra function call. > > > > > Also the wrappers aren't quite so simple as this, they need some > > > parameters to work. (the struct mm_struct, whether it was a read or a > > > write lock, and whether or not the lock operation succeeded), so it > > > would mean adding more inlined code, which I think adds up to be a > > > nontrivial amount since these wrappers are called so often in the > > > kernel. > > > > > > If you feel strongly, let me know and I can send a version as you > > > describe and we can compare the two. > > > > > > > These tracepoints will be less useful if we have to turn on the config > > to enable it. > > I don't mind implementing it that way if we can't optimize it. 
> > > > Maybe Steven can give some suggestions, Steven ? > > > > > What you can do, and what we have done is the following: > > (see include/linux/page_ref.h) > > > #ifdef CONFIG_TRACING > extern struct tracepoint __tracepoint_mmap_lock_start_locking; > extern struct tracepoint __tracepoint_mmap_lock_acquire_returned; > > #define mmap_lock_tracepoint_active(t) > static_key_false(&(__tracepoint_mmap_lock_##t).key) > > #else > #define mmap_lock_tracepoint_active(t) false > #endif > > static inline void mmap_write_lock(struct mm_struct *mm) > { > if (mmap_lock_tracepoint_active(start_locking)) > mmap_lock_start_trace_wrapper(); > down_write(>mmap_lock); > if (mmap_lock_tracepoint_active(acquire_returned)) > mmap_lock_acquire_trace_wrapper(); > } > > > -- Steve Great! Thanks Steve. -- Thanks Yafang
Re: [PATCH] mm/memcontrol: Add the drop_cache interface for cgroup v2
On Tue, Sep 22, 2020 at 3:27 PM Michal Hocko wrote: > > On Tue 22-09-20 12:20:52, Yafang Shao wrote: > > On Mon, Sep 21, 2020 at 7:36 PM Michal Hocko wrote: > > > > > > On Mon 21-09-20 19:23:01, Yafang Shao wrote: > > > > On Mon, Sep 21, 2020 at 7:05 PM Michal Hocko wrote: > > > > > > > > > > On Mon 21-09-20 18:55:40, Yafang Shao wrote: > > > > > > On Mon, Sep 21, 2020 at 4:12 PM Michal Hocko > > > > > > wrote: > > > > > > > > > > > > > > On Mon 21-09-20 16:02:55, zangchun...@bytedance.com wrote: > > > > > > > > From: Chunxin Zang > > > > > > > > > > > > > > > > In the cgroup v1, we have 'force_mepty' interface. This is very > > > > > > > > useful for userspace to actively release memory. But the cgroup > > > > > > > > v2 does not. > > > > > > > > > > > > > > > > This patch reuse cgroup v1's function, but have a new name for > > > > > > > > the interface. Because I think 'drop_cache' may be is easier to > > > > > > > > understand :) > > > > > > > > > > > > > > This should really explain a usecase. Global drop_caches is a > > > > > > > terrible > > > > > > > interface and it has caused many problems in the past. People have > > > > > > > learned to use it as a remedy to any problem they might see and > > > > > > > cause > > > > > > > other problems without realizing that. This is the reason why we > > > > > > > even > > > > > > > log each attempt to drop caches. > > > > > > > > > > > > > > I would rather not repeat the same mistake on the memcg level > > > > > > > unless > > > > > > > there is a very strong reason for it. > > > > > > > > > > > > > > > > > > > I think we'd better add these comments above the function > > > > > > mem_cgroup_force_empty() to explain why we don't want to expose this > > > > > > interface in cgroup2, otherwise people will continue to send this > > > > > > proposal without any strong reason. > > > > > > > > > > I do not mind people sending this proposal. 
"V1 used to have an > > > > > interface, we need it in v2 as well" is not really viable without > > > > > providing more reasoning on the specific usecase. > > > > > > > > > > _Any_ patch should have a proper justification. This is nothing really > > > > > new to the process and I am wondering why this is coming as a > > > > > surprise. > > > > > > > > > > > > > Container users always want to drop cache in a specific container, > > > > because they used to use drop_caches to fix memory pressure issues. > > > > > > This is exactly the kind of problems we have seen in the past. There > > > should be zero reason to addre potential reclaim problems by dropping > > > page cache on the floor. There is a huge cargo cult about this > > > procedure and I have seen numerous reports when people complained about > > > performance afterwards just to learn that the dropped page cache was one > > > of the resons for that. > > > > > > > Although drop_caches can cause some unexpected issues, it could also > > > > fix some issues. > > > > > > "Some issues" is way too general. We really want to learn about those > > > issues and address them properly. > > > > > > > One use case in our production environment is that some of our tasks > > become very latency sensitive from 7am to 10am, so before these tasks > > become active we will use drop_caches to drop page caches generated by > > other tasks at night to avoid these tasks triggering direct reclaim. > > > > The best way to do it is to fix the latency in direct reclaim, but it > > will take great effort. > > What is the latency triggered by the memory reclaim? It should be mostly > a clean page cache right as drop_caches only drops clean pages. Or is > this more about [id]cache? Do you have any profiles where is the time > spent? > Yes, we have analyzed the issues in the direct reclaim, but that is not the point. The point is that each case may take us several
Re: [PATCH] mm/memcontrol: Add the drop_cache interface for cgroup v2
On Mon, Sep 21, 2020 at 7:36 PM Michal Hocko wrote: > > On Mon 21-09-20 19:23:01, Yafang Shao wrote: > > On Mon, Sep 21, 2020 at 7:05 PM Michal Hocko wrote: > > > > > > On Mon 21-09-20 18:55:40, Yafang Shao wrote: > > > > On Mon, Sep 21, 2020 at 4:12 PM Michal Hocko wrote: > > > > > > > > > > On Mon 21-09-20 16:02:55, zangchun...@bytedance.com wrote: > > > > > > From: Chunxin Zang > > > > > > > > > > > > In the cgroup v1, we have 'force_mepty' interface. This is very > > > > > > useful for userspace to actively release memory. But the cgroup > > > > > > v2 does not. > > > > > > > > > > > > This patch reuse cgroup v1's function, but have a new name for > > > > > > the interface. Because I think 'drop_cache' may be is easier to > > > > > > understand :) > > > > > > > > > > This should really explain a usecase. Global drop_caches is a terrible > > > > > interface and it has caused many problems in the past. People have > > > > > learned to use it as a remedy to any problem they might see and cause > > > > > other problems without realizing that. This is the reason why we even > > > > > log each attempt to drop caches. > > > > > > > > > > I would rather not repeat the same mistake on the memcg level unless > > > > > there is a very strong reason for it. > > > > > > > > > > > > > I think we'd better add these comments above the function > > > > mem_cgroup_force_empty() to explain why we don't want to expose this > > > > interface in cgroup2, otherwise people will continue to send this > > > > proposal without any strong reason. > > > > > > I do not mind people sending this proposal. "V1 used to have an > > > interface, we need it in v2 as well" is not really viable without > > > providing more reasoning on the specific usecase. > > > > > > _Any_ patch should have a proper justification. This is nothing really > > > new to the process and I am wondering why this is coming as a surprise. 
> > > > > > > Container users always want to drop cache in a specific container, > > because they used to use drop_caches to fix memory pressure issues. > > This is exactly the kind of problems we have seen in the past. There > should be zero reason to addre potential reclaim problems by dropping > page cache on the floor. There is a huge cargo cult about this > procedure and I have seen numerous reports when people complained about > performance afterwards just to learn that the dropped page cache was one > of the resons for that. > > > Although drop_caches can cause some unexpected issues, it could also > > fix some issues. > > "Some issues" is way too general. We really want to learn about those > issues and address them properly. > One use case in our production environment is that some of our tasks become very latency sensitive from 7am to 10am, so before these tasks become active we will use drop_caches to drop page caches generated by other tasks at night to avoid these tasks triggering direct reclaim. The best way to do it is to fix the latency in direct reclaim, but it will take great effort. while drop_caches give us an easier way to achieve the same goal. IOW, drop_caches give the users an option to achieve their goal before they find a better solution. > > So container users want to use it in containers as > > well. > > If this feature is not implemented in cgroup, they will ask you why > > but there is no explanation in the kernel. > > There is no usecase that would really require it so far. > > > Regarding the memory.high, it is not perfect as well, because you have > > to set it to 0 to drop_caches, and the processes in the containers > > have to reclaim pages as well because they reach the memory.high, but > > memory.force_empty won't make other processes to reclaim. 
> > Since 536d3bf261a2 ("mm: memcontrol: avoid workload stalls when lowering > memory.high") the limit is set after the reclaim so the race window when > somebody would be pushed to high limit reclaim is reduced. But I do > agree this is just a workaround. > > > That doesn't mean I agree to add this interface, while I really mean > > that if we discard one feature we'd better explain why. > > We need to understand why somebody wants an interface because once it is > added it will have to be maintained for ever. > -- > Michal Hocko > SUSE Labs -- Thanks Yafang
Re: [PATCH] mmap_lock: add tracepoints around lock acquisition
On Tue, Sep 22, 2020 at 12:53 AM Axel Rasmussen wrote: > > On Sun, Sep 20, 2020 at 9:58 PM Yafang Shao wrote: > > > > On Fri, Sep 18, 2020 at 2:13 AM Axel Rasmussen > > wrote: > > > > > > The goal of these tracepoints is to be able to debug lock contention > > > issues. This lock is acquired on most (all?) mmap / munmap / page fault > > > operations, so a multi-threaded process which does a lot of these can > > > experience significant contention. > > > > > > We trace just before we start acquisition, when the acquisition returns > > > (whether it succeeded or not), and when the lock is released (or > > > downgraded). The events are broken out by lock type (read / write). > > > > > > The events are also broken out by memcg path. For container-based > > > workloads, users often think of several processes in a memcg as a single > > > logical "task", so collecting statistics at this level is useful. > > > > > > These events *do not* include latency bucket information, which means > > > for a proper latency histogram users will need to use BPF instead of > > > event histograms. The benefit we get from this is simpler code. > > > > > > This patch is a no-op if the Kconfig option is not enabled. If it is, > > > tracepoints are still disabled by default (configurable at runtime); > > > the only fixed cost here is un-inlining a few functions. As best as > > > I've been able to measure, the overhead this introduces is a small > > > fraction of 1%. Actually hooking up the tracepoints to BPF introduces > > > additional overhead, depending on exactly what the BPF program is > > > collecting. > > > > Are there any methods to avoid un-inlining these wrappers ? 
> > > > For example, > > // include/linux/mmap_lock.h > > > > void mmap_lock_start_trace_wrapper(); > > void mmap_lock_acquire_trace_wrapper(); > > > > static inline void mmap_write_lock(struct mm_struct *mm) > > { > > mmap_lock_start_trace_wrapper(); > > down_write(>mmap_lock); > > mmap_lock_acquire_trace_wrapper(); > > } > > > > // mm/mmap_lock.c > > void mmap_lock_start_trace_wrapper() > > { > > trace_mmap_lock_start(); > > } > > > > void mmap_lock_start_trace_wrapper() > > { > > trace_mmap_lock_acquired(); > > } > > We can do something like that, but I don't think it would end up being better. > > At the end of the day, because the trace stuff cannot be in the > header, we have to add an extra function call one way or the other. > This would just move the call one step further down the call stack. > So, I don't think it would affect performance in the > CONFIG_MMAP_LOCK_STATS + tracepoints not enabled at runtime case. > Right, it seems we have to add an extra function call. > Also the wrappers aren't quite so simple as this, they need some > parameters to work. (the struct mm_struct, whether it was a read or a > write lock, and whether or not the lock operation succeeded), so it > would mean adding more inlined code, which I think adds up to be a > nontrivial amount since these wrappers are called so often in the > kernel. > > If you feel strongly, let me know and I can send a version as you > describe and we can compare the two. > These tracepoints will be less useful if we have to turn on the config to enable it. I don't mind implementing it that way if we can't optimize it. Maybe Steven can give some suggestions, Steven ? 
> > > > > > > > > --- > > > include/linux/mmap_lock.h| 28 +++- > > > include/trace/events/mmap_lock.h | 73 ++ > > > mm/Kconfig | 17 +++ > > > mm/Makefile | 1 + > > > mm/mmap_lock.c | 224 +++ > > > 5 files changed, 342 insertions(+), 1 deletion(-) > > > create mode 100644 include/trace/events/mmap_lock.h > > > create mode 100644 mm/mmap_lock.c > > > > > > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > > > index 0707671851a8..d12aa2ff6c05 100644 > > > --- a/include/linux/mmap_lock.h > > > +++ b/include/linux/mmap_lock.h > > > @@ -1,11 +1,35 @@ > > > #ifndef _LINUX_MMAP_LOCK_H > > > #define _LINUX_MMAP_LOCK_H > > > > > > +#include > > > +#include > > > #include > > > +#include > > > +#include > > > > > > #define MMAP_LOCK_INITIALIZER(name) \ > > >
Re: [PATCH] mm/memcontrol: Add the drop_cache interface for cgroup v2
On Mon, Sep 21, 2020 at 7:05 PM Michal Hocko wrote: > > On Mon 21-09-20 18:55:40, Yafang Shao wrote: > > On Mon, Sep 21, 2020 at 4:12 PM Michal Hocko wrote: > > > > > > On Mon 21-09-20 16:02:55, zangchun...@bytedance.com wrote: > > > > From: Chunxin Zang > > > > > > > > In the cgroup v1, we have 'force_mepty' interface. This is very > > > > useful for userspace to actively release memory. But the cgroup > > > > v2 does not. > > > > > > > > This patch reuse cgroup v1's function, but have a new name for > > > > the interface. Because I think 'drop_cache' may be is easier to > > > > understand :) > > > > > > This should really explain a usecase. Global drop_caches is a terrible > > > interface and it has caused many problems in the past. People have > > > learned to use it as a remedy to any problem they might see and cause > > > other problems without realizing that. This is the reason why we even > > > log each attempt to drop caches. > > > > > > I would rather not repeat the same mistake on the memcg level unless > > > there is a very strong reason for it. > > > > > > > I think we'd better add these comments above the function > > mem_cgroup_force_empty() to explain why we don't want to expose this > > interface in cgroup2, otherwise people will continue to send this > > proposal without any strong reason. > > I do not mind people sending this proposal. "V1 used to have an > interface, we need it in v2 as well" is not really viable without > providing more reasoning on the specific usecase. > > _Any_ patch should have a proper justification. This is nothing really > new to the process and I am wondering why this is coming as a surprise. > Container users always want to drop cache in a specific container, because they used to use drop_caches to fix memory pressure issues. Although drop_caches can cause some unexpected issues, it could also fix some issues. So container users want to use it in containers as well. 
If this feature is not implemented in cgroup, they will ask you why but there is no explanation in the kernel. Regarding the memory.high, it is not perfect as well, because you have to set it to 0 to drop_caches, and the processes in the containers have to reclaim pages as well because they reach the memory.high, but memory.force_empty won't make other processes to reclaim. That doesn't mean I agree to add this interface, while I really mean that if we discard one feature we'd better explain why. -- Thanks Yafang
Re: [PATCH] mm/memcontrol: Add the drop_cache interface for cgroup v2
On Mon, Sep 21, 2020 at 4:12 PM Michal Hocko wrote: > > On Mon 21-09-20 16:02:55, zangchun...@bytedance.com wrote: > > From: Chunxin Zang > > > > In the cgroup v1, we have 'force_mepty' interface. This is very > > useful for userspace to actively release memory. But the cgroup > > v2 does not. > > > > This patch reuse cgroup v1's function, but have a new name for > > the interface. Because I think 'drop_cache' may be is easier to > > understand :) > > This should really explain a usecase. Global drop_caches is a terrible > interface and it has caused many problems in the past. People have > learned to use it as a remedy to any problem they might see and cause > other problems without realizing that. This is the reason why we even > log each attempt to drop caches. > > I would rather not repeat the same mistake on the memcg level unless > there is a very strong reason for it. > I think we'd better add these comments above the function mem_cgroup_force_empty() to explain why we don't want to expose this interface in cgroup2, otherwise people will continue to send this proposal without any strong reason. > > Signed-off-by: Chunxin Zang > > --- > > Documentation/admin-guide/cgroup-v2.rst | 11 +++ > > mm/memcontrol.c | 5 + > > 2 files changed, 16 insertions(+) > > > > diff --git a/Documentation/admin-guide/cgroup-v2.rst > > b/Documentation/admin-guide/cgroup-v2.rst > > index ce3e05e41724..fbff959c8116 100644 > > --- a/Documentation/admin-guide/cgroup-v2.rst > > +++ b/Documentation/admin-guide/cgroup-v2.rst > > @@ -1181,6 +1181,17 @@ PAGE_SIZE multiple when read back. > > high limit is used and monitored properly, this limit's > > utility is limited to providing the final safety net. > > > > + memory.drop_cache > > +A write-only single value file which exists on non-root > > +cgroups. > > + > > +Provide a mechanism for users to actively trigger memory > > +reclaim. The cgroup will be reclaimed and as many pages > > +reclaimed as possible. 
> > + > > +It will break the low boundary, because it tries to reclaim the > > +memory many times, until the memory drops to a certain level. > > + > >memory.oom.group > > A read-write single value file which exists on non-root > > cgroups. The default value is "0". > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > > index 0b38b6ad547d..98646484efff 100644 > > --- a/mm/memcontrol.c > > +++ b/mm/memcontrol.c > > @@ -6226,6 +6226,11 @@ static struct cftype memory_files[] = { > > .write = memory_max_write, > > }, > > { > > + .name = "drop_cache", > > + .flags = CFTYPE_NOT_ON_ROOT, > > + .write = mem_cgroup_force_empty_write, > > + }, > > + { > > .name = "events", > > .flags = CFTYPE_NOT_ON_ROOT, > > .file_offset = offsetof(struct mem_cgroup, events_file), > > -- > > 2.11.0 > > -- > Michal Hocko > SUSE Labs > -- Thanks Yafang
Re: [PATCH] mmap_lock: add tracepoints around lock acquisition
On Fri, Sep 18, 2020 at 2:13 AM Axel Rasmussen wrote: > > The goal of these tracepoints is to be able to debug lock contention > issues. This lock is acquired on most (all?) mmap / munmap / page fault > operations, so a multi-threaded process which does a lot of these can > experience significant contention. > > We trace just before we start acquisition, when the acquisition returns > (whether it succeeded or not), and when the lock is released (or > downgraded). The events are broken out by lock type (read / write). > > The events are also broken out by memcg path. For container-based > workloads, users often think of several processes in a memcg as a single > logical "task", so collecting statistics at this level is useful. > > These events *do not* include latency bucket information, which means > for a proper latency histogram users will need to use BPF instead of > event histograms. The benefit we get from this is simpler code. > > This patch is a no-op if the Kconfig option is not enabled. If it is, > tracepoints are still disabled by default (configurable at runtime); > the only fixed cost here is un-inlining a few functions. As best as > I've been able to measure, the overhead this introduces is a small > fraction of 1%. Actually hooking up the tracepoints to BPF introduces > additional overhead, depending on exactly what the BPF program is > collecting. Are there any methods to avoid un-inlining these wrappers ? 
For example, // include/linux/mmap_lock.h void mmap_lock_start_trace_wrapper(); void mmap_lock_acquire_trace_wrapper(); static inline void mmap_write_lock(struct mm_struct *mm) { mmap_lock_start_trace_wrapper(); down_write(&mm->mmap_lock); mmap_lock_acquire_trace_wrapper(); } // mm/mmap_lock.c void mmap_lock_start_trace_wrapper() { trace_mmap_lock_start(); } void mmap_lock_acquire_trace_wrapper() { trace_mmap_lock_acquired(); } > --- > include/linux/mmap_lock.h| 28 +++- > include/trace/events/mmap_lock.h | 73 ++ > mm/Kconfig | 17 +++ > mm/Makefile | 1 + > mm/mmap_lock.c | 224 +++ > 5 files changed, 342 insertions(+), 1 deletion(-) > create mode 100644 include/trace/events/mmap_lock.h > create mode 100644 mm/mmap_lock.c > > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > index 0707671851a8..d12aa2ff6c05 100644 > --- a/include/linux/mmap_lock.h > +++ b/include/linux/mmap_lock.h > @@ -1,11 +1,35 @@ > #ifndef _LINUX_MMAP_LOCK_H > #define _LINUX_MMAP_LOCK_H > > +#include > +#include > #include > +#include > +#include > > #define MMAP_LOCK_INITIALIZER(name) \ > .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), > > +#ifdef CONFIG_MMAP_LOCK_STATS > + > +void mmap_init_lock(struct mm_struct *mm); > +void mmap_write_lock(struct mm_struct *mm); > +void mmap_write_lock_nested(struct mm_struct *mm, int subclass); > +int mmap_write_lock_killable(struct mm_struct *mm); > +bool mmap_write_trylock(struct mm_struct *mm); > +void mmap_write_unlock(struct mm_struct *mm); > +void mmap_write_downgrade(struct mm_struct *mm); > +void mmap_read_lock(struct mm_struct *mm); > +int mmap_read_lock_killable(struct mm_struct *mm); > +bool mmap_read_trylock(struct mm_struct *mm); > +void mmap_read_unlock(struct mm_struct *mm); > +bool mmap_read_trylock_non_owner(struct mm_struct *mm); > +void mmap_read_unlock_non_owner(struct mm_struct *mm); > +void mmap_assert_locked(struct mm_struct *mm); > +void mmap_assert_write_locked(struct mm_struct *mm); > + > +#else /* 
!CONFIG_MMAP_LOCK_STATS */ > + > static inline void mmap_init_lock(struct mm_struct *mm) > { > init_rwsem(>mmap_lock); > @@ -63,7 +87,7 @@ static inline void mmap_read_unlock(struct mm_struct *mm) > > static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) > { > - if (down_read_trylock(>mmap_lock)) { > + if (mmap_read_trylock(mm)) { > rwsem_release(>mmap_lock.dep_map, _RET_IP_); > return true; > } > @@ -87,4 +111,6 @@ static inline void mmap_assert_write_locked(struct > mm_struct *mm) > VM_BUG_ON_MM(!rwsem_is_locked(>mmap_lock), mm); > } > > +#endif /* CONFIG_MMAP_LOCK_STATS */ > + > #endif /* _LINUX_MMAP_LOCK_H */ > diff --git a/include/trace/events/mmap_lock.h > b/include/trace/events/mmap_lock.h > new file mode 100644 > index ..549c662e6ed8 > --- /dev/null > +++ b/include/trace/events/mmap_lock.h > @@ -0,0 +1,73 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#undef TRACE_SYSTEM > +#define TRACE_SYSTEM mmap_lock > + > +#if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) > +#define _TRACE_MMAP_LOCK_H > + > +#include > +#include > + > +struct mm_struct; > + > +DECLARE_EVENT_CLASS( > + mmap_lock_template, > + > + TP_PROTO(struct mm_struct *mm, const char *memcg_path, u64 duration, > + bool write, bool success), > + > + TP_ARGS(mm,