[tip: sched/core] sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG

2021-04-16 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     b7cc6ec744b307db59568c654a8904a5928aa855
Gitweb:        https://git.kernel.org/tip/b7cc6ec744b307db59568c654a8904a5928aa855
Author:        Mel Gorman
AuthorDate:    Wed, 24 Mar 2021 13:39:16
Committer:     Peter Zijlstra
CommitterDate: Fri, 16 Apr 2021 17:06:33 +02:00

sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG

The ability to enable/disable NUMA balancing is not a debugging feature
and should not depend on CONFIG_SCHED_DEBUG. For example, machines within
an HPC cluster may disable NUMA balancing temporarily for some jobs and
re-enable it for other jobs without needing to reboot.

This patch removes the dependency on CONFIG_SCHED_DEBUG for the
kernel.numa_balancing sysctl. The other NUMA balancing related sysctls
are left as-is because if they need to be tuned then it is more likely
that NUMA balancing needs to be fixed instead.
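
As a usage note (not part of the patch): with the guard removed, the switch
can be flipped from user space through procfs at any time. A minimal C
sketch, assuming the usual /proc/sys mount point and sufficient privileges:

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  /* Write "0" or "1" to kernel.numa_balancing via procfs. */
  static int set_numa_balancing(int enable)
  {
          const char *path = "/proc/sys/kernel/numa_balancing";
          int fd = open(path, O_WRONLY);

          if (fd < 0) {
                  perror(path);
                  return -1;
          }
          if (write(fd, enable ? "1" : "0", 1) != 1) {
                  perror("write");
                  close(fd);
                  return -1;
          }
          return close(fd);
  }

  int main(void)
  {
          set_numa_balancing(0);  /* disable for a balancing-sensitive job */
          /* ... run the job ... */
          set_numa_balancing(1);  /* restore the default */
          return 0;
  }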

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210324133916.gq15...@suse.de
---
 kernel/sysctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09..8042098 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1753,6 +1753,9 @@ static struct ctl_table kern_table[] = {
.proc_handler   = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_NUMA_BALANCING
{
.procname   = "numa_balancing",
.data   = NULL, /* filled in by handler */
@@ -1763,7 +1766,6 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
 #endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
{
.procname   = "sched_rt_period_us",
.data   = &sysctl_sched_rt_period,


[tip: sched/core] sched/fair: Merge select_idle_core/cpu()

2021-02-17 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     9fe1f127b913318c631d0041ecf71486e38c2c2d
Gitweb:        https://git.kernel.org/tip/9fe1f127b913318c631d0041ecf71486e38c2c2d
Author:        Mel Gorman
AuthorDate:    Wed, 27 Jan 2021 13:52:03
Committer:     Ingo Molnar
CommitterDate: Wed, 17 Feb 2021 14:07:25 +01:00

sched/fair: Merge select_idle_core/cpu()

Both select_idle_core() and select_idle_cpu() do a loop over the same
cpumask. Observe that by clearing the already visited CPUs, we can
fold the iteration and iterate a core at a time.

All we need to do is remember any non-idle CPU we encountered while
scanning for an idle core. This way we'll only iterate every CPU once.
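
The folded scan can be illustrated with a small stand-alone model (an
illustration only: NR_CPUS, SMT_WIDTH and the faked idle state below are
assumptions and do not correspond to kernel code):

  #include <stdbool.h>
  #include <stdio.h>

  #define NR_CPUS   8
  #define SMT_WIDTH 2  /* assumed: two SMT siblings per core */

  /* Faked idle state for CPUs 0..7; only CPUs 2, 6 and 7 are idle. */
  static bool cpu_idle[NR_CPUS] = {
          false, false, true, false, false, false, true, true
  };

  int main(void)
  {
          int idle_cpu = -1;      /* best individual idle CPU seen so far */

          for (int core = 0; core < NR_CPUS; core += SMT_WIDTH) {
                  bool core_idle = true;

                  /* Visit every sibling of this core exactly once. */
                  for (int cpu = core; cpu < core + SMT_WIDTH; cpu++) {
                          if (!cpu_idle[cpu])
                                  core_idle = false;
                          else if (idle_cpu == -1)
                                  idle_cpu = cpu;  /* remember as a fallback */
                  }

                  if (core_idle) {
                          printf("picked fully idle core starting at CPU %d\n", core);
                          return 0;
                  }
          }

          if (idle_cpu != -1)
                  printf("no idle core; falling back to idle CPU %d\n", idle_cpu);
          else
                  printf("nothing idle found\n");
          return 0;
  }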

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20210127135203.19633-5-mgor...@techsingularity.net
---
 kernel/sched/fair.c |  99 +--
 1 file changed, 59 insertions(+), 40 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a0fc8a..c73d588 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6019,6 +6019,14 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return new_cpu;
 }
 
+static inline int __select_idle_cpu(int cpu)
+{
+   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+   return cpu;
+
+   return -1;
+}
+
 #ifdef CONFIG_SCHED_SMT
 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
 EXPORT_SYMBOL_GPL(sched_smt_present);
@@ -6077,48 +6085,51 @@ unlock:
  * there are no idle cores left in the system; tracked through
  * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
  */
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
 {
-   struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-   int core, cpu;
+   bool idle = true;
+   int cpu;
 
if (!static_branch_likely(&sched_smt_present))
-   return -1;
-
-   if (!test_idle_cores(target, false))
-   return -1;
-
-   cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+   return __select_idle_cpu(core);
 
-   for_each_cpu_wrap(core, cpus, target) {
-   bool idle = true;
-
-   for_each_cpu(cpu, cpu_smt_mask(core)) {
-   if (!available_idle_cpu(cpu)) {
-   idle = false;
-   break;
+   for_each_cpu(cpu, cpu_smt_mask(core)) {
+   if (!available_idle_cpu(cpu)) {
+   idle = false;
+   if (*idle_cpu == -1) {
+   if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
+   *idle_cpu = cpu;
+   break;
+   }
+   continue;
}
+   break;
}
-
-   if (idle)
-   return core;
-
-   cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
+   if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
+   *idle_cpu = cpu;
}
 
-   /*
-* Failed to find an idle core; stop looking for one.
-*/
-   set_idle_cores(target, 0);
+   if (idle)
+   return core;
 
+   cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
return -1;
 }
 
 #else /* CONFIG_SCHED_SMT */
 
-static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static inline void set_idle_cores(int cpu, int val)
 {
-   return -1;
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+   return def;
+}
+
+static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
+{
+   return __select_idle_cpu(core);
 }
 
 #endif /* CONFIG_SCHED_SMT */
@@ -6131,10 +6142,11 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 {
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+   int i, cpu, idle_cpu = -1, nr = INT_MAX;
+   bool smt = test_idle_cores(target, false);
+   int this = smp_processor_id();
struct sched_domain *this_sd;
u64 time;
-   int this = smp_processor_id();
-   int cpu, nr = INT_MAX;
 
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6142,7 +6154,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-   if 

[tip: sched/core] sched/fair: Remove select_idle_smt()

2021-02-17 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     6cd56ef1df399a004f90ecb682427f9964969fc9
Gitweb:        https://git.kernel.org/tip/6cd56ef1df399a004f90ecb682427f9964969fc9
Author:        Mel Gorman
AuthorDate:    Mon, 25 Jan 2021 08:59:08
Committer:     Ingo Molnar
CommitterDate: Wed, 17 Feb 2021 14:06:59 +01:00

sched/fair: Remove select_idle_smt()

In order to make the next patch more readable, and to quantify the
actual effectiveness of the select_idle_smt() pass, start by removing it.

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20210125085909.4600-4-mgor...@techsingularity.net
---
 kernel/sched/fair.c | 30 --
 1 file changed, 30 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4c18ef6..6a0fc8a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6114,27 +6114,6 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
return -1;
 }
 
-/*
- * Scan the local SMT mask for idle CPUs.
- */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
-   int cpu;
-
-   if (!static_branch_likely(&sched_smt_present))
-   return -1;
-
-   for_each_cpu(cpu, cpu_smt_mask(target)) {
-   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
-   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
-   continue;
-   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
-   return cpu;
-   }
-
-   return -1;
-}
-
 #else /* CONFIG_SCHED_SMT */
 
 static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
@@ -6142,11 +6121,6 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
 }
 
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
-   return -1;
-}
-
 #endif /* CONFIG_SCHED_SMT */
 
 /*
@@ -6336,10 +6310,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
 
-   i = select_idle_smt(p, sd, target);
-   if ((unsigned)i < nr_cpumask_bits)
-   return i;
-
return target;
 }
 


[tip: sched/core] sched/fair: Clear SMT siblings after determining the core is not idle

2020-12-11 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     13d5a5e9f9b8515da3c04305ae1bb03ab91be7a7
Gitweb:        https://git.kernel.org/tip/13d5a5e9f9b8515da3c04305ae1bb03ab91be7a7
Author:        Mel Gorman
AuthorDate:    Mon, 30 Nov 2020 14:40:20
Committer:     Ingo Molnar
CommitterDate: Fri, 11 Dec 2020 10:30:38 +01:00

sched/fair: Clear SMT siblings after determining the core is not idle

The clearing of SMT siblings from the SIS mask before checking for an idle
core is a small but unnecessary cost. Defer the clearing of the siblings
until the scan moves to the next potential target. The cost of this was
not measured as it is borderline noise but it should be self-evident.

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20201130144020.gs3...@techsingularity.net
---
 kernel/sched/fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f5dceda..efac224 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6086,10 +6086,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
break;
}
}
-   cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
 
if (idle)
return core;
+
+   cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
}
 
/*


[tip: sched/core] sched/fair: Clear SMT siblings after determining the core is not idle

2020-12-03 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     82b738de57d571cd366d89e75b5fd60f3060852b
Gitweb:        https://git.kernel.org/tip/82b738de57d571cd366d89e75b5fd60f3060852b
Author:        Mel Gorman
AuthorDate:    Mon, 30 Nov 2020 14:40:20
Committer:     Peter Zijlstra
CommitterDate: Thu, 03 Dec 2020 10:00:36 +01:00

sched/fair: Clear SMT siblings after determining the core is not idle

The clearing of SMT siblings from the SIS mask before checking for an idle
core is a small but unnecessary cost. Defer the clearing of the siblings
until the scan moves to the next potential target. The cost of this was
not measured as it is borderline noise but it should be self-evident.

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20201130144020.gs3...@techsingularity.net
---
 kernel/sched/fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f5dceda..efac224 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6086,10 +6086,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
break;
}
}
-   cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
 
if (idle)
return core;
+
+   cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
}
 
/*


[tip: sched/core] sched/numa: Allow a floating imbalance between NUMA nodes

2020-11-25 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     7d2b5dd0bcc48095651f1b85f751eef610b3e034
Gitweb:        https://git.kernel.org/tip/7d2b5dd0bcc48095651f1b85f751eef610b3e034
Author:        Mel Gorman
AuthorDate:    Fri, 20 Nov 2020 09:06:29
Committer:     Peter Zijlstra
CommitterDate: Tue, 24 Nov 2020 16:47:47 +01:00

sched/numa: Allow a floating imbalance between NUMA nodes

Currently, an imbalance is only allowed when a destination node
is almost completely idle. This solved one basic class of problems
and was the cautious approach.

This patch revisits the possibility that NUMA nodes can be imbalanced
until 25% of the CPUs are occupied. The reasoning behind 25% is somewhat
superficial -- it's half the cores when HT is enabled.  At higher
utilisations, balancing should continue as normal and keep things even
until scheduler domains are fully busy or over utilised.

Note that this is not expected to be a universal win. Any benchmark
that prefers spreading as wide as possible with limited communication
will favour the old behaviour as there is more memory bandwidth.
Workloads that communicate heavily in pairs such as netperf or tbench
benefit. For the tests I ran, the vast majority of workloads saw
a benefit so it seems to be a worthwhile trade-off.
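
To make the threshold concrete, here is a stand-alone sketch of the check
this patch adds (dst_running < dst_weight/4). The 40-CPU node used in the
example is an assumption for illustration, not taken from the patch:

  #include <stdbool.h>
  #include <stdio.h>

  /* Mirrors the patch: tolerate an imbalance while fewer than a quarter
   * of the destination node's CPUs have running tasks. */
  static bool imbalance_allowed(int dst_running, int dst_weight)
  {
          return dst_running < (dst_weight >> 2);
  }

  int main(void)
  {
          /* e.g. a 40-CPU node (20 cores with HT): allowed below 10 busy CPUs */
          for (int running = 8; running <= 11; running++)
                  printf("dst_weight=40 dst_running=%2d -> %s\n", running,
                         imbalance_allowed(running, 40) ? "allow imbalance" : "balance");
          return 0;
  }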

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20201120090630.3286-4-mgor...@techsingularity.net
---
 kernel/sched/fair.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2626c6b..377c77b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1559,7 +1559,8 @@ struct task_numa_env {
 static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
 static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int dst_running);
+static inline long adjust_numa_imbalance(int imbalance,
+   int dst_running, int dst_weight);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -1939,7 +1940,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
-   imbalance = adjust_numa_imbalance(imbalance, dst_running);
+   imbalance = adjust_numa_imbalance(imbalance, dst_running,
+   env->dst_stats.weight);
 
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -8995,16 +8997,14 @@ next_group:
 
 #define NUMA_IMBALANCE_MIN 2
 
-static inline long adjust_numa_imbalance(int imbalance, int dst_running)
+static inline long adjust_numa_imbalance(int imbalance,
+   int dst_running, int dst_weight)
 {
-   unsigned int imbalance_min;
-
/*
 * Allow a small imbalance based on a simple pair of communicating
-* tasks that remain local when the source domain is almost idle.
+* tasks that remain local when the destination is lightly loaded.
 */
-   imbalance_min = NUMA_IMBALANCE_MIN;
-   if (dst_running <= imbalance_min)
+   if (dst_running < (dst_weight >> 2) && imbalance <= NUMA_IMBALANCE_MIN)
return 0;
 
return imbalance;
@@ -9106,9 +9106,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
 
/* Consider allowing a small imbalance between NUMA groups */
-   if (env->sd->flags & SD_NUMA)
+   if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
-   busiest->sum_nr_running);
+   busiest->sum_nr_running, busiest->group_weight);
+   }
 
return;
}


[tip: sched/core] sched: Limit the amount of NUMA imbalance that can exist at fork time

2020-11-25 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     23e6082a522e32232f7377540b4d42d8304253b8
Gitweb:        https://git.kernel.org/tip/23e6082a522e32232f7377540b4d42d8304253b8
Author:        Mel Gorman
AuthorDate:    Fri, 20 Nov 2020 09:06:30
Committer:     Peter Zijlstra
CommitterDate: Tue, 24 Nov 2020 16:47:48 +01:00

sched: Limit the amount of NUMA imbalance that can exist at fork time

At fork time currently, a local node can be allowed to fill completely
and the periodic load balancer is left to fix the problem. This can be
problematic in cases where a task creates lots of threads that idle until
woken as part of a worker pool, causing a memory bandwidth problem.

However, a "real" workload suffers badly from this behaviour. The workload
in question is mostly NUMA aware but spawns large numbers of threads
that act as a worker pool that can be called from anywhere. These need
to spread early to get reasonable behaviour.

This patch limits how much a local node can fill before spilling over
to another node and it will not be a universal win. Specifically,
very short-lived workloads that fit within a NUMA node would prefer
the memory bandwidth.

As I cannot describe the "real" workload, the best proxy measure I found
for illustration was a page fault microbenchmark. It's not representative
of the workload but demonstrates the hazard of the current behaviour.

pft timings
                                 5.10.0-rc2             5.10.0-rc2
                          imbalancefloat-v2          forkspread-v2
Amean     elapsed-1       46.37 (   0.00%)       46.05 *   0.69%*
Amean     elapsed-4       12.43 (   0.00%)       12.49 *  -0.47%*
Amean     elapsed-7        7.61 (   0.00%)        7.55 *   0.81%*
Amean     elapsed-12       4.79 (   0.00%)        4.80 (  -0.17%)
Amean     elapsed-21       3.13 (   0.00%)        2.89 *   7.74%*
Amean     elapsed-30       3.65 (   0.00%)        2.27 *  37.62%*
Amean     elapsed-48       3.08 (   0.00%)        2.13 *  30.69%*
Amean     elapsed-79       2.00 (   0.00%)        1.90 *   4.95%*
Amean     elapsed-80       2.00 (   0.00%)        1.90 *   4.70%*

This is showing the time to fault regions belonging to threads. The target
machine has 80 logical CPUs and two nodes. Note the ~30% gain when the
machine is approximately at the point where one node becomes fully
utilised. The slower results are borderline noise.

Kernel building shows similar benefits around the same balance point.
Generally performance was either neutral or better in the tests conducted.
The main consideration with this patch is the point at which fork stops
spreading a task; some workloads may benefit from different balance
points, but it would be a risky tuning parameter.

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20201120090630.3286-5-mgor...@techsingularity.net
---
 kernel/sched/fair.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 377c77b..2e8aade 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8762,6 +8762,16 @@ static bool update_pick_idlest(struct sched_group *idlest,
 }
 
 /*
+ * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
+ * This is an approximation as the number of running tasks may not be
+ * related to the number of busy CPUs due to sched_setaffinity.
+ */
+static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+{
+   return (dst_running < (dst_weight >> 2));
+}
+
+/*
  * find_idlest_group() finds and returns the least busy CPU group within the
  * domain.
  *
@@ -8893,7 +8903,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 * a real need of migration, periodic load balance will
 * take care of it.
 */
-   if (local_sgs.idle_cpus)
+   if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
return NULL;
}
 
@@ -9000,11 +9010,14 @@ next_group:
 static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight)
 {
+   if (!allow_numa_imbalance(dst_running, dst_weight))
+   return imbalance;
+
/*
 * Allow a small imbalance based on a simple pair of communicating
 * tasks that remain local when the destination is lightly loaded.
 */
-   if (dst_running < (dst_weight >> 2) && imbalance <= NUMA_IMBALANCE_MIN)
+   if (imbalance <= NUMA_IMBALANCE_MIN)
return 0;
 
return imbalance;


[tip: sched/core] sched/numa: Rename nr_running and break out the magic number

2020-11-25 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     abeae76a47005aa3f07c9be12d8076365622e25c
Gitweb:        https://git.kernel.org/tip/abeae76a47005aa3f07c9be12d8076365622e25c
Author:        Mel Gorman
AuthorDate:    Fri, 20 Nov 2020 09:06:27
Committer:     Peter Zijlstra
CommitterDate: Tue, 24 Nov 2020 16:47:47 +01:00

sched/numa: Rename nr_running and break out the magic number

This is simply a preparation patch to make the following patches easier
to read. No functional change.

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20201120090630.3286-2-mgor...@techsingularity.net
---
 kernel/sched/fair.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6691e28..9d10abe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1559,7 +1559,7 @@ struct task_numa_env {
 static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
 static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int nr_running);
+static inline long adjust_numa_imbalance(int imbalance, int dst_running);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -8991,7 +8991,9 @@ next_group:
}
 }
 
-static inline long adjust_numa_imbalance(int imbalance, int nr_running)
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long adjust_numa_imbalance(int imbalance, int dst_running)
 {
unsigned int imbalance_min;
 
@@ -8999,8 +9001,8 @@ static inline long adjust_numa_imbalance(int imbalance, int nr_running)
 * Allow a small imbalance based on a simple pair of communicating
 * tasks that remain local when the source domain is almost idle.
 */
-   imbalance_min = 2;
-   if (nr_running <= imbalance_min)
+   imbalance_min = NUMA_IMBALANCE_MIN;
+   if (dst_running <= imbalance_min)
return 0;
 
return imbalance;


[tip: sched/core] sched: Avoid unnecessary calculation of load imbalance at clone time

2020-11-25 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     5c339005f854fa75aa46078ad640919425658b3e
Gitweb:        https://git.kernel.org/tip/5c339005f854fa75aa46078ad640919425658b3e
Author:        Mel Gorman
AuthorDate:    Fri, 20 Nov 2020 09:06:28
Committer:     Peter Zijlstra
CommitterDate: Tue, 24 Nov 2020 16:47:47 +01:00

sched: Avoid unnecessary calculation of load imbalance at clone time

In find_idlest_group(), the load imbalance is only relevant when the group
is either overloaded or fully busy but it is calculated unconditionally.
This patch moves the imbalance calculation to the context where it is required.
Technically, it is a micro-optimisation but really the benefit is avoiding
confusing one type of imbalance with another depending on the group_type
in the next patch.

No functional change.
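
For reference, the value now computed only on the overloaded/fully_busy
paths is (imbalance_pct - 100) percent of the NICE_0 load. A worked example
(the 1024 NICE_0 weight and the imbalance_pct of 117 are illustrative
assumptions, not taken from the patch):

  #include <stdio.h>

  int main(void)
  {
          long nice0_load = 1024;         /* assumed scale_load_down(NICE_0_LOAD) */
          int imbalance_pct = 117;        /* illustrative sched_domain setting */
          long imbalance = nice0_load * (imbalance_pct - 100) / 100;

          /* prints: allowed imbalance = 174 load units */
          printf("allowed imbalance = %ld load units\n", imbalance);
          return 0;
  }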

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20201120090630.3286-3-mgor...@techsingularity.net
---
 kernel/sched/fair.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9d10abe..2626c6b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8777,9 +8777,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
.group_type = group_overloaded,
};
 
-   imbalance = scale_load_down(NICE_0_LOAD) *
-   (sd->imbalance_pct-100) / 100;
-
do {
int local_group;
 
@@ -8833,6 +8830,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
switch (local_sgs.group_type) {
case group_overloaded:
case group_fully_busy:
+
+   /* Calculate allowed imbalance based on load */
+   imbalance = scale_load_down(NICE_0_LOAD) *
+   (sd->imbalance_pct-100) / 100;
+
/*
 * When comparing groups across NUMA domains, it's possible for
 * the local domain to be very lightly loaded relative to the


[tip: sched/core] sched/core: Offload wakee task activation if the wakee is descheduling

2020-05-24 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     2ebb17717550607bcd85fb8cf7d24ac870e9d762
Gitweb:        https://git.kernel.org/tip/2ebb17717550607bcd85fb8cf7d24ac870e9d762
Author:        Mel Gorman
AuthorDate:    Sun, 24 May 2020 21:29:56 +01:00
Committer:     Ingo Molnar
CommitterDate: Mon, 25 May 2020 07:04:10 +02:00

sched/core: Offload wakee task activation if the wakee is descheduling

The previous commit:

  c6e7bd7afaeb: ("sched/core: Optimize ttwu() spinning on p->on_cpu")

avoids spinning on p->on_rq when the task is descheduling, but only if the
wakee is on a CPU that does not share cache with the waker.

This patch offloads the activation of the wakee to the CPU that is about to
go idle if the task is the only one on the runqueue. This potentially allows
the waker task to continue making progress when the wakeup is not strictly
synchronous.
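
A sketch of the resulting queueing decision (simplified and stand-alone;
the helper name and parameters are illustrative and are not the exact ones
added by the patch):

  #include <stdbool.h>
  #include <stdio.h>

  /*
   * Decide whether a wakeup should be handed to the wakee's CPU via an
   * IPI (wake list) instead of being activated directly by the waker.
   */
  static bool queue_on_wakelist(bool same_cpu, bool shares_cache,
                                bool wakee_descheduling, int wakee_rq_nr_running)
  {
          if (same_cpu)
                  return false;   /* local wakeup: activate directly */

          if (!shares_cache)
                  return true;    /* different LLC: avoid touching the remote rq */

          /* Same LLC: offload only when the waker would otherwise spin while
           * the wakee finishes descheduling on an otherwise empty runqueue. */
          return wakee_descheduling && wakee_rq_nr_running <= 1;
  }

  int main(void)
  {
          /* wakee shares a cache with the waker and is descheduling alone */
          printf("offload: %d\n", queue_on_wakelist(false, true, true, 1));  /* 1 */
          /* wakeup targeting a busy runqueue on the same LLC */
          printf("offload: %d\n", queue_on_wakelist(false, true, false, 3)); /* 0 */
          return 0;
  }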

This is very obvious with netperf UDP_STREAM running on localhost. The
waker is sending packets as quickly as possible without waiting for any
reply. It frequently wakes the server for the processing of packets and
when netserver is using local memory, it quickly completes the processing
and goes back to idle. The waker often observes that netserver is on_rq
and spins excessively leading to a drop in throughput.

This is a comparison of 5.7-rc6 against "sched: Optimize ttwu() spinning
on p->on_cpu" and against this patch labeled vanilla, optttwu-v1r1 and
localwakelist-v1r2 respectively.

                                  5.7.0-rc6              5.7.0-rc6              5.7.0-rc6
                                    vanilla           optttwu-v1r1     localwakelist-v1r2
Hmean     send-64         251.49 (   0.00%)      258.05 *   2.61%*      305.59 *  21.51%*
Hmean     send-128        497.86 (   0.00%)      519.89 *   4.43%*      600.25 *  20.57%*
Hmean     send-256        944.90 (   0.00%)      997.45 *   5.56%*     1140.19 *  20.67%*
Hmean     send-1024      3779.03 (   0.00%)     3859.18 *   2.12%*     4518.19 *  19.56%*
Hmean     send-2048      7030.81 (   0.00%)     7315.99 *   4.06%*     8683.01 *  23.50%*
Hmean     send-3312     10847.44 (   0.00%)    11149.43 *   2.78%*    12896.71 *  18.89%*
Hmean     send-4096     13436.19 (   0.00%)    13614.09 (   1.32%)    15041.09 *  11.94%*
Hmean     send-8192     22624.49 (   0.00%)    23265.32 *   2.83%*    24534.96 *   8.44%*
Hmean     send-16384    34441.87 (   0.00%)    36457.15 *   5.85%*    35986.21 *   4.48%*

Note that this benefit is not universal to all wakeups, it only applies
to the case where the waker often spins on p->on_rq.

The impact can be seen from a "perf sched latency" report generated from
a single iteration of one packet size:

  ---------------------------------------------------------------------------------------------------------------
   Task            |  Runtime ms   | Switches | Average delay ms | Maximum delay ms | Maximum delay at           |
  ---------------------------------------------------------------------------------------------------------------
  vanilla
   netperf:4337    |  21709.193 ms |     2932 | avg:    0.002 ms | max:    0.041 ms | max at:    112.154512 s
   netserver:4338  |  14629.459 ms |  5146990 | avg:    0.001 ms | max: 1615.864 ms | max at:    140.134496 s

  localwakelist-v1r2
   netperf:4339    |  29789.717 ms |     2460 | avg:    0.002 ms | max:    0.059 ms | max at:    138.205389 s
   netserver:4340  |  18858.767 ms |  7279005 | avg:    0.001 ms | max:    0.362 ms | max at:    135.709683 s
  ---------------------------------------------------------------------------------------------------------------

Note that the average wakeup delay is quite small on both the vanilla
kernel and with the two patches applied. However, the vanilla kernel shows
significant outliers, with the maximum delay measured at 1615 milliseconds,
whereas with both patches applied the maximum is never worse than 0.362 ms
despite a much higher rate of context switching.

Similarly, a separate profile of cycles showed that 2.83% of all cycles
were spent in try_to_wake_up(), with almost half of those cycles spent
spinning on p->on_rq. With the two patches, the percentage of cycles
spent in try_to_wake_up() drops to 1.13%.

Signed-off-by: Mel Gorman 
Signed-off-by: Ingo Molnar 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Jirka Hladky 
Cc: Vincent Guittot 
Cc: valentin.schnei...@arm.com
Cc: Hillf Danton 
Cc: Rik van Riel 
Link: https://lore.kernel.org/r/20200524202956.27665-3-mgor...@techsingularity.net
---
 kernel/sched/core.c  | 39 +--
 kernel/sched/sched.h |  3 ++-
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 903c9ee..6130ab1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2312,7 +2312,13 @@ static void wake_csd_func(void *info)
sched_ttwu_pending();
 }