For different workloads the optimal "softness" of soft affinity can be
different. Introduce tunables sched_allowed and sched_preferred that can
be tuned via /proc. This allows choosing at what utilization difference
the scheduler will choose cpus_allowed over cpus_preferred in the first
level of search. Depending on the extent of data sharing, cache coherency
overhead of the system etc. the optimal point may vary.

Signed-off-by: subhra mazumdar <subhra.mazum...@oracle.com>
---
 include/linux/sched/sysctl.h |  2 ++
 kernel/sched/fair.c          | 19 ++++++++++++++++++-
 kernel/sched/sched.h         |  2 ++
 kernel/sysctl.c              | 14 ++++++++++++++
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 99ce6d7..0e75602 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -41,6 +41,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+extern __read_mostly unsigned int sysctl_sched_preferred;
+extern __read_mostly unsigned int sysctl_sched_allowed;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *length,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 53aa7f2..d222d78 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -85,6 +85,8 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost   = 500000UL;
+const_debug unsigned int sysctl_sched_preferred                = 1UL;
+const_debug unsigned int sysctl_sched_allowed          = 100UL;
 
 #ifdef CONFIG_SMP
 /*
@@ -6739,7 +6741,22 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
        int new_cpu = prev_cpu;
        int want_affine = 0;
        int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
-       struct cpumask *cpus = &p->cpus_preferred;
+       int cpux, cpuy;
+       struct cpumask *cpus;
+
+       if (!p->affinity_unequal) {
+               cpus = &p->cpus_allowed;
+       } else {
+               cpux = cpumask_any(&p->cpus_preferred);
+               cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+               cpumask_andnot(cpus, &p->cpus_allowed, &p->cpus_preferred);
+               cpuy = cpumask_any(cpus);
+               if (sysctl_sched_preferred * cpu_rq(cpux)->cfs.avg.util_avg >
+                   sysctl_sched_allowed * cpu_rq(cpuy)->cfs.avg.util_avg)
+                       cpus = &p->cpus_allowed;
+               else
+                       cpus = &p->cpus_preferred;
+       }
 
        if (sd_flag & SD_BALANCE_WAKE) {
                record_wakee(p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1a..f856bdb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1863,6 +1863,8 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 
 extern const_debug unsigned int sysctl_sched_nr_migrate;
 extern const_debug unsigned int sysctl_sched_migration_cost;
+extern const_debug unsigned int sysctl_sched_preferred;
+extern const_debug unsigned int sysctl_sched_allowed;
 
 #ifdef CONFIG_SCHED_HRTICK
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7d1008b..bdffb48 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -383,6 +383,20 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+       {
+               .procname       = "sched_preferred",
+               .data           = &sysctl_sched_preferred,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_allowed",
+               .data           = &sysctl_sched_allowed,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
 #ifdef CONFIG_SCHEDSTATS
        {
                .procname       = "sched_schedstats",
-- 
2.9.3

Reply via email to