Add the base migrate_disable() support (under protest). While migrate_disable() is (currently) required for PREEMPT_RT, it is also one of the biggest flaws in the system.
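To make the per-cpu problem concrete, here is a minimal sketch (not part of this patch; my_pcpu_stats, my_stats and my_stats_inc() are made-up illustrations) of the usage pattern that breaks once spinlocks become preemptible sleeping locks on PREEMPT_RT, and that migrate_disable() is meant to keep working:

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>

struct my_pcpu_stats {
	spinlock_t	lock;
	unsigned long	count;
};

static DEFINE_PER_CPU(struct my_pcpu_stats, my_stats) = {
	.lock = __SPIN_LOCK_UNLOCKED(my_stats.lock),
};

static void my_stats_inc(void)
{
	struct my_pcpu_stats *s;

	migrate_disable();		/* may be preempted, but not migrated */
	s = this_cpu_ptr(&my_stats);	/* per-CPU pointer stays valid for this CPU */
	spin_lock(&s->lock);		/* sleeping, preemptible lock on PREEMPT_RT */
	s->count++;
	spin_unlock(&s->lock);
	migrate_enable();
}

On !PREEMPT_RT the existing inline migrate_disable()/migrate_enable() in preempt.h already give this guarantee via preempt_disable(); the point of this patch is to provide it without disabling preemption.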
Notably, this is just the base implementation; it is broken vs sched_setaffinity() and hotplug, both of which are solved in additional patches kept separate for ease of review.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 include/linux/preempt.h |   60 ++++++++++++++++++++++++
 include/linux/sched.h   |    3 +
 kernel/sched/core.c     |  116 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |    2
 lib/smp_processor_id.c  |    5 ++
 5 files changed, 179 insertions(+), 7 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -322,6 +322,64 @@ static inline void preempt_notifier_init
 
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+/*
+ * Migrate-Disable and why it is (strongly) undesired.
+ *
+ * The premise of the Real-Time schedulers we have on Linux
+ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
+ * concurrently, provided there are sufficient runnable tasks, also known as
+ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
+ * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
+ *
+ * The correctness of various scheduling models depends on this, but it is
+ * broken by migrate_disable() that doesn't imply preempt_disable(). Where
+ * preempt_disable() implies an immediate priority ceiling, preemptible
+ * migrate_disable() allows nesting.
+ *
+ * The worst case is that all tasks preempt one another in a migrate_disable()
+ * region and stack on a single CPU. This then reduces the available bandwidth
+ * to a single CPU. And since Real-Time schedulability theory considers the
+ * Worst-Case only, all Real-Time analysis shall revert to single-CPU
+ * (instantly solving the SMP analysis problem).
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migrate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on arbitrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depend on
+ *       migration, so migrate_disable() is not only a problem for
+ *       work-conserving schedulers.
+ *
+ */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#else /* !(CONFIG_SMP && CONFIG_PREEMPT_RT) */
+
 /**
  * migrate_disable - Prevent migration of the current task
  *
@@ -352,4 +410,6 @@ static __always_inline void migrate_enab
 	preempt_enable();
 }
 
+#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
+
 #endif /* __LINUX_PREEMPT_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,9 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	int				migration_disabled;
+#endif
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1700,6 +1700,68 @@ void check_preempt_curr(struct rq *rq, s
 
 #ifdef CONFIG_SMP
 
+#ifdef CONFIG_PREEMPT_RT
+
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask,
+				  u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+	if (likely(!p->migration_disabled))
+		return;
+
+	if (p->cpus_ptr != &p->cpus_mask)
+		return;
+
+	/*
+	 * Violates locking rules! see comment in __do_set_cpus_allowed().
+	 */
+	__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+	if (current->migration_disabled++)
+		return;
+
+	barrier();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+	if (--current->migration_disabled)
+		return;
+
+	barrier();
+
+	if (current->cpus_ptr == &current->cpus_mask)
+		return;
+
+	__set_cpus_allowed_ptr(current, &current->cpus_mask, SCA_MIGRATE_ENABLE);
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return p->migration_disabled;
+}
+
+#else
+
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return false;
+}
+
+#endif
+
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1709,7 +1771,7 @@ static inline bool is_cpu_allowed(struct
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p))
+	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
 		return cpu_online(cpu);
 
 	return cpu_active(cpu);
@@ -1830,8 +1892,19 @@ static int migration_cpu_stop(void *data
  */
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
-	cpumask_copy(&p->cpus_mask, new_mask);
-	p->nr_cpus_allowed = cpumask_weight(new_mask);
+	if (flags & SCA_MIGRATE_DISABLE) {
+		p->cpus_ptr = new_mask;
+		p->nr_cpus_allowed = 1;
+		return;
+	}
+
+	if (flags & SCA_MIGRATE_ENABLE)
+		p->cpus_ptr = &p->cpus_mask;
+	else
+		cpumask_copy(&p->cpus_mask, new_mask);
+
+	if (p->cpus_ptr == &p->cpus_mask)
+		p->nr_cpus_allowed = cpumask_weight(p->cpus_ptr);
 }
 
 static void
@@ -1840,7 +1913,22 @@ __do_set_cpus_allowed(struct task_struct
 	struct rq *rq = task_rq(p);
 	bool queued, running;
 
-	lockdep_assert_held(&p->pi_lock);
+	/*
+	 * This here violates the locking rules for affinity, since we're only
+	 * supposed to change these variables while holding both rq->lock and
+	 * p->pi_lock.
+	 *
+	 * HOWEVER, it magically works, because ttwu() is the only code that
+	 * accesses these variables under p->pi_lock and only does so after
+	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+	 * before finish_task().
+	 *
+	 * XXX do further audits, this smells like something putrid.
+	 */
+	if (flags & SCA_MIGRATE_DISABLE)
+		SCHED_WARN_ON(!p->on_cpu);
+	else
+		lockdep_assert_held(&p->pi_lock);
 
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
@@ -1891,9 +1979,14 @@ static int __set_cpus_allowed_ptr(struct
 	rq = task_rq_lock(p, &rf);
 	update_rq_clock(rq);
 
-	if (p->flags & PF_KTHREAD) {
+	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
 		/*
-		 * Kernel threads are allowed on online && !active CPUs
+		 * Kernel threads are allowed on online && !active CPUs.
+		 *
+		 * Specifically, migration_disabled() tasks must not fail the
+		 * cpumask_any_and_distribute() pick below, esp. so on
+		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
+		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
 		 */
 		cpu_valid_mask = cpu_online_mask;
 	}
@@ -1907,7 +2000,7 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	if (cpumask_equal(&p->cpus_mask, new_mask))
+	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
 		goto out;
 
 	/*
@@ -1999,6 +2092,8 @@ void set_task_cpu(struct task_struct *p,
 	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
 	 */
 	WARN_ON_ONCE(!cpu_online(new_cpu));
+
+	WARN_ON_ONCE(is_migration_disabled(p));
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2329,6 +2424,12 @@ static int select_fallback_rq(int cpu, s
 			}
 			fallthrough;
 		case possible:
+			/*
+			 * XXX When called from select_task_rq() we only
+			 * hold p->pi_lock and again violate locking order.
+			 *
+			 * More yuck to audit.
+			 */
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			state = fail;
 			break;
@@ -4589,6 +4690,7 @@ static void __sched notrace __schedule(b
 		 */
 		++*switch_count;
 
+		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 		trace_sched_switch(preempt, prev, next);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1901,6 +1901,8 @@ extern void update_group_capacity(struct
 extern void trigger_load_balance(struct rq *rq);
 
 #define SCA_CHECK		0x01
+#define SCA_MIGRATE_DISABLE	0x02
+#define SCA_MIGRATE_ENABLE	0x04
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(c
 	if (current->nr_cpus_allowed == 1)
 		goto out;
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	if (current->migration_disabled)
+		goto out;
+#endif
+
 	/*
 	 * It is valid to assume CPU-locality during early bootup:
 	 */
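As a reviewer aid, a hedged sketch (not from the patch; my_outer() and my_inner() are invented names) of how I read the nesting behaviour the per-task counter gives: only the outermost migrate_enable() ever restores p->cpus_ptr to &p->cpus_mask, and the narrowing to cpumask_of(rq->cpu) only happens from migrate_disable_switch() in __schedule(), i.e. when the task is actually switched out inside the region:

#include <linux/preempt.h>

static void my_inner(void)
{
	migrate_disable();	/* ->migration_disabled: 1 -> 2, early return */
	/* still preemptible, still pinned to this CPU */
	migrate_enable();	/* 2 -> 1, early return, nothing to restore */
}

static void my_outer(void)
{
	migrate_disable();	/* ->migration_disabled: 0 -> 1 */
	my_inner();		/* nesting only moves the counter */
	/*
	 * If we were switched out anywhere in here, __schedule() ran
	 * migrate_disable_switch() and pointed cpus_ptr at this CPU only.
	 */
	migrate_enable();	/* 1 -> 0; SCA_MIGRATE_ENABLE undoes the pin if needed */
}

Deferring the actual affinity narrowing to the context switch keeps migrate_disable() itself a counter increment, which matters because neither migrate_disable() nor migrate_enable() is allowed to block.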