There's an accounting issue with SCHED_DEADLINE bandwidth and the creation of cpusets. If a SCHED_DEADLINE task already exists and a new root domain is created, the bandwidth accounting among the root domains gets corrupted.
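For context on the numbers that show up in the sched_debug output below: dl_bw->bw and dl_bw->total_bw are fixed-point bandwidth ratios (runtime/period scaled by 1 << 20), and total_bw is the sum that admission control weighs against bw times the number of CPUs in the root domain. Here's a minimal userspace sketch of both; to_ratio() follows the kernel's helper of the same name, but dl_admit() is a made-up, simplified stand-in for the real check:

#include <stdio.h>
#include <stdint.h>

/* Same idea as the kernel's to_ratio(): (runtime / period) << 20. */
static int64_t to_ratio(int64_t period, int64_t runtime)
{
	return (runtime << 20) / period;
}

/*
 * Very simplified admission test (not the kernel's exact code): a new
 * deadline task only fits if the root domain's reserved total plus the
 * task's own bandwidth stays under bw * nr_cpus.
 */
static int dl_admit(int64_t bw, int64_t total_bw, int cpus, int64_t new_bw)
{
	return total_bw + new_bw <= (int64_t)cpus * bw;
}

int main(void)
{
	/* defaults: sched_rt_runtime_us=950000, sched_rt_period_us=1000000 */
	int64_t bw = to_ratio(1000000, 950000);        /* 996147 */
	/* the burn task used below: runtime 2ms, period 20ms */
	int64_t task_bw = to_ratio(20000000, 2000000); /* 104857 */

	printf("dl_bw->bw  = %lld\n", (long long)bw);
	printf("task dl_bw = %lld\n", (long long)task_bw);

	/* sane accounting: the task is admitted */
	printf("admit (total_bw=0)    : %d\n", dl_admit(bw, 0, 8, task_bw));
	/* hypothetical stale reservations eat the capacity: wrongly rejected */
	printf("admit (total_bw=8*bw) : %d\n", dl_admit(bw, 8 * bw, 8, task_bw));
	return 0;
}

The two ratios it prints, 996147 and 104857, are exactly the values that show up in the reproducer output below.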
For the reproducer, I downloaded Juri's tests:

  https://github.com/jlelli/tests.git          (for his burn.c file)
  https://github.com/jlelli/schedtool-dl.git   (for his modified schedtool utility)

I have a kernel with my patches applied (appended below) that show the bandwidth:

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0

Note: the sched_rt runtime and period are 950000 and 1000000 respectively,
and the bw ratio is (95/100) << 20 == 996147.

This isn't the way I first discovered the issue, but it appears to be the
quickest way to reproduce it.

Make sure there are no other cpusets. As libvirt had created some, I had to
remove them first:

 # rmdir /sys/fs/cgroup/cpuset/libvirt/{qemu,}

 # burn &
 # schedtool -E -t 2000000:20000000 $!

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857

Note: (2/20) << 20 == 104857

 # echo 0 > /sys/fs/cgroup/cpuset/cpuset.sched_load_balance

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0

Notice that after turning off load balancing in the main cpuset, all the
totals went to zero. Let's see what happens when we kill the task.

 # killall burn

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857

They all went negative! Not good, but we can recover...
 # echo 1 > /sys/fs/cgroup/cpuset/cpuset.sched_load_balance

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0

Playing with this a bit more, it appears that setting sched_load_balance to 1
in the toplevel cpuset always resets the deadline bandwidth, whether or not it
should. At least that gives a way to recover when things stop working, but I
still believe this is a bug. Things can get even worse when adding a bunch of
cpusets, but the accounting always appears to be cleared if you kill all
SCHED_DEADLINE tasks and set the toplevel cpuset's sched_load_balance to 1.

I traced this with the below patch and have this:

  schedtool-2279 [004] d...   124.195359: __sched_setscheduler: dl_b=ffff88011a01d040 new=104857 old=0
  schedtool-2279 [004] d...   124.195361: __dl_add: (ffff88011a01d040) total=0 add tsk=104857
  schedtool-2279 [004] d...   124.195362: __dl_add: new total=104857
  schedtool-2279 [004] d...   124.195364: <stack trace> => __dl_add => __sched_setscheduler => SyS_sched_setattr => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917342: rq_attach_root: old_rd refcount=8
       bash-2213 [003] d...   142.917344: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917351: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917387: rq_attach_root: old_rd refcount=7
       bash-2213 [003] d...   142.917388: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917393: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917420: rq_attach_root: old_rd refcount=6
       bash-2213 [003] d...   142.917420: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917425: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917452: rq_attach_root: old_rd refcount=5
       bash-2213 [003] d...   142.917452: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917457: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917485: rq_attach_root: old_rd refcount=4
       bash-2213 [003] d...   142.917485: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917490: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917518: rq_attach_root: old_rd refcount=3
       bash-2213 [003] d...   142.917519: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917524: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917550: rq_attach_root: old_rd refcount=2
       bash-2213 [003] d...   142.917550: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917556: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917582: rq_attach_root: old_rd refcount=1
       bash-2213 [003] d...   142.917582: rq_attach_root: old rd ffff88011a01d000 (ffff88011a01d040) total=104857
       bash-2213 [003] d...   142.917582: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917588: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
  kworker/0:2-268 [000] d...   153.335522: task_dead_dl: [0:ffffffff81fcda00] total=0 tsk=104857
  kworker/0:2-268 [000] d...   153.335523: task_dead_dl: new total=-104857
  kworker/0:2-268 [000] d...   153.335528: <stack trace> => task_dead_dl => finish_task_switch => __schedule => schedule => worker_thread => kthread => ret_from_fork

Seems to be some disconnect between cgroups and sched deadline root domains.
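To spell out that disconnect, here's a minimal userspace model of the bookkeeping the trace shows; the struct and names are made up for illustration, this is not kernel code:

#include <stdio.h>
#include <stdint.h>

/* Stand-in for a root domain's struct dl_bw (illustration only). */
struct fake_root_domain {
	const char *name;
	int64_t total_bw;
};

int main(void)
{
	struct fake_root_domain old_rd = { "old rd",     0 };
	struct fake_root_domain def_rd = { "default rd", 0 };
	struct fake_root_domain *task_rd = &old_rd;
	int64_t task_bw = 104857;		/* (2/20) << 20 */

	/* sched_setattr(): the task's bandwidth is reserved in the root
	 * domain it currently belongs to (__dl_add in the trace). */
	task_rd->total_bw += task_bw;

	/* rebuild_sched_domains_locked() -> rq_attach_root(): the CPUs end
	 * up on a different root domain with total_bw=0, and nothing moves
	 * the existing reservation across. */
	task_rd = &def_rd;

	/* task_dead_dl(): the task releases its bandwidth from whatever
	 * root domain it sits on *now*. */
	task_rd->total_bw -= task_bw;

	printf("%s total_bw = %lld\n", old_rd.name, (long long)old_rd.total_bw);
	printf("%s total_bw = %lld\n", def_rd.name, (long long)def_rd.total_bw);
	return 0;
}

It prints 104857 for the old root domain (the reservation that leaked) and -104857 for the default root domain, which is exactly the pair of numbers the trace and /proc/sched_debug show above.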
-- Steve

---
 kernel/sched/core.c     |    9 +++++++++
 kernel/sched/deadline.c |    5 +++++
 kernel/sched/sched.h    |    6 ++++++
 3 files changed, 20 insertions(+)

Index: linux-trace.git/kernel/sched/core.c
===================================================================
--- linux-trace.git.orig/kernel/sched/core.c	2016-02-03 10:54:55.158659968 -0500
+++ linux-trace.git/kernel/sched/core.c	2016-02-03 13:52:56.213696814 -0500
@@ -2301,6 +2301,7 @@ static int dl_overflow(struct task_struc
 	if (new_bw == p->dl.dl_bw)
 		return 0;
 
+	trace_printk("dl_b=%p new=%lld old=%lld\n", dl_b, new_bw, p->dl.dl_bw);
 	/*
 	 * Either if a task, enters, leave, or stays -deadline but changes
 	 * its parameters, we may need to update accordingly the total
@@ -5068,6 +5069,7 @@ int task_can_attach(struct task_struct *
 		if (overflow)
 			ret = -EBUSY;
 		else {
+			struct dl_bw *src_dl_b;
 			/*
 			 * We reserve space for this task in the destination
 			 * root_domain, as we can't fail after this point.
@@ -5075,6 +5077,8 @@ int task_can_attach(struct task_struct *
 			 * later on (see set_cpus_allowed_dl()).
 			 */
 			__dl_add(dl_b, p->dl.dl_bw);
+			src_dl_b = dl_bw_of(task_cpu(p));
+			trace_printk("source %p total=%lld\n", src_dl_b, src_dl_b->total_bw);
 		}
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 		rcu_read_unlock_sched();
@@ -5636,6 +5640,7 @@ static void rq_attach_root(struct rq *rq
 
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
+		trace_printk("old_rd refcount=%d\n", atomic_read(&old_rd->refcount));
 		/*
 		 * If we dont want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
@@ -5646,7 +5651,11 @@ static void rq_attach_root(struct rq *rq
 	}
 
 	atomic_inc(&rd->refcount);
+	if (old_rd)
+		trace_printk("old rd %p (%p) total=%lld\n", rq->rd, &rq->rd->dl_bw, rq->rd->dl_bw.total_bw);
+	trace_printk("add new rd %p (%p) total=%lld\n", rd, &rd->dl_bw, rd->dl_bw.total_bw);
 	rq->rd = rd;
+	trace_dump_stack(0);
 
 	cpumask_set_cpu(rq->cpu, rd->span);
 	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))

Index: linux-trace.git/kernel/sched/deadline.c
===================================================================
--- linux-trace.git.orig/kernel/sched/deadline.c	2016-02-03 10:21:27.140280992 -0500
+++ linux-trace.git/kernel/sched/deadline.c	2016-02-03 13:44:41.321432980 -0500
@@ -66,6 +66,8 @@ void init_dl_bw(struct dl_bw *dl_b)
 	else
 		dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
 	raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+	trace_printk("clear total_bw %p (was:%lld)\n", dl_b, dl_b->total_bw);
+	trace_dump_stack(0);
 	dl_b->total_bw = 0;
 }
 
@@ -1224,7 +1226,10 @@ static void task_dead_dl(struct task_str
 	 */
 	raw_spin_lock_irq(&dl_b->lock);
 	/* XXX we should retain the bw until 0-lag */
+	trace_printk("[%d:%p] total=%lld tsk=%lld\n", task_cpu(p), dl_b, dl_b->total_bw, p->dl.dl_bw);
 	dl_b->total_bw -= p->dl.dl_bw;
+	trace_printk("new total=%lld\n", dl_b->total_bw);
+	trace_dump_stack(0);
 	raw_spin_unlock_irq(&dl_b->lock);
 }
 
Index: linux-trace.git/kernel/sched/sched.h
===================================================================
--- linux-trace.git.orig/kernel/sched/sched.h	2016-02-03 10:54:55.182659590 -0500
+++ linux-trace.git/kernel/sched/sched.h	2016-02-03 13:44:41.321432980 -0500
@@ -192,13 +192,19 @@ struct dl_bw {
 static inline
 void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
 {
+	trace_printk("(%p) total=%lld sub tsk=%lld\n", dl_b, dl_b->total_bw, tsk_bw);
 	dl_b->total_bw -= tsk_bw;
+	trace_printk("new total=%lld\n", dl_b->total_bw);
+	trace_dump_stack(0);
 }
 
 static inline
 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
 {
+	trace_printk("(%p) total=%lld add tsk=%lld\n", dl_b, dl_b->total_bw, tsk_bw);
 	dl_b->total_bw += tsk_bw;
+	trace_printk("new total=%lld\n", dl_b->total_bw);
+	trace_dump_stack(0);
 }
 
 static inline