There's an accounting issue with SCHED_DEADLINE bandwidth and the creation of cpusets. If a SCHED_DEADLINE task already exists and a new root domain is created, the bandwidth accounting among the root domains gets corrupted.
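For context on the numbers that show up in the sched_debug output below: dl_bw->bw and dl_bw->total_bw are fixed-point bandwidth ratios (runtime/period scaled by 1 << 20), and total_bw is the sum that admission control weighs against bw times the number of CPUs in the root domain. Here's a minimal userspace sketch of both; to_ratio() follows the kernel's helper of the same name, but dl_admit() is a made-up, simplified stand-in for the real check:

#include <stdio.h>
#include <stdint.h>

/* Same idea as the kernel's to_ratio(): (runtime / period) << 20. */
static int64_t to_ratio(int64_t period, int64_t runtime)
{
	return (runtime << 20) / period;
}

/*
 * Very simplified admission test (not the kernel's exact code): a new
 * deadline task only fits if the root domain's reserved total plus the
 * task's own bandwidth stays under bw * nr_cpus.
 */
static int dl_admit(int64_t bw, int64_t total_bw, int cpus, int64_t new_bw)
{
	return total_bw + new_bw <= (int64_t)cpus * bw;
}

int main(void)
{
	/* defaults: sched_rt_runtime_us=950000, sched_rt_period_us=1000000 */
	int64_t bw = to_ratio(1000000, 950000);        /* 996147 */
	/* the burn task used below: runtime 2ms, period 20ms */
	int64_t task_bw = to_ratio(20000000, 2000000); /* 104857 */

	printf("dl_bw->bw  = %lld\n", (long long)bw);
	printf("task dl_bw = %lld\n", (long long)task_bw);

	/* sane accounting: the task is admitted */
	printf("admit (total_bw=0)    : %d\n", dl_admit(bw, 0, 8, task_bw));
	/* hypothetical stale reservations eat the capacity: wrongly rejected */
	printf("admit (total_bw=8*bw) : %d\n", dl_admit(bw, 8 * bw, 8, task_bw));
	return 0;
}

The two ratios it prints, 996147 and 104857, are exactly the values that show up in the reproducer output below.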
For the reproducer, I downloaded Juri's tests:

  https://github.com/jlelli/tests.git          (for his burn.c file)
  https://github.com/jlelli/schedtool-dl.git   (for his modified schedtool utility)

I have a kernel with my patches applied (appended below) that show the bandwidth:

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0

Note: the sched_rt runtime and period are 950000 and 1000000 respectively,
and the bw ratio is (95/100) << 20 == 996147.

This isn't the way I first discovered the issue, but it appears to be the
quickest way to reproduce it.

Make sure there are no other cpusets. As libvirt had created some, I had to
remove them first:

 # rmdir /sys/fs/cgroup/cpuset/libvirt/{qemu,}

 # burn &
 # schedtool -E -t 2000000:20000000 $!

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 104857

Note: (2/20) << 20 == 104857

 # echo 0 > /sys/fs/cgroup/cpuset/cpuset.sched_load_balance

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0

Notice that after turning off load balancing in the main cpuset, all the
totals went to zero. Let's see what happens when we kill the task.

 # killall burn

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : -104857

They all went negative! Not good, but we can recover...
 # echo 1 > /sys/fs/cgroup/cpuset/cpuset.sched_load_balance

 # grep dl /proc/sched_debug
 dl_rq[0]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[1]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[2]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[3]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[4]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[5]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[6]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0
 dl_rq[7]:  .dl_nr_running : 0  .dl_bw->bw : 996147  .dl_bw->total_bw : 0

Playing with this a bit more, it appears that setting sched_load_balance to 1
in the toplevel cpuset always resets the deadline bandwidth, whether or not it
should. At least that gives a way to recover when things stop working, but I
still believe this is a bug. Things can get even worse when adding a bunch of
cpusets, but the accounting always appears to be cleared if you kill all
SCHED_DEADLINE tasks and set the toplevel cpuset's sched_load_balance to 1.

I traced this with the below patch and have this:

  schedtool-2279 [004] d...   124.195359: __sched_setscheduler: dl_b=ffff88011a01d040 new=104857 old=0
  schedtool-2279 [004] d...   124.195361: __dl_add: (ffff88011a01d040) total=0 add tsk=104857
  schedtool-2279 [004] d...   124.195362: __dl_add: new total=104857
  schedtool-2279 [004] d...   124.195364: <stack trace> => __dl_add => __sched_setscheduler => SyS_sched_setattr => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917342: rq_attach_root: old_rd refcount=8
       bash-2213 [003] d...   142.917344: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917351: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917387: rq_attach_root: old_rd refcount=7
       bash-2213 [003] d...   142.917388: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917393: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917420: rq_attach_root: old_rd refcount=6
       bash-2213 [003] d...   142.917420: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917425: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917452: rq_attach_root: old_rd refcount=5
       bash-2213 [003] d...   142.917452: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917457: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917485: rq_attach_root: old_rd refcount=4
       bash-2213 [003] d...   142.917485: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917490: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917518: rq_attach_root: old_rd refcount=3
       bash-2213 [003] d...   142.917519: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917524: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917550: rq_attach_root: old_rd refcount=2
       bash-2213 [003] d...   142.917550: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917556: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
       bash-2213 [003] d...   142.917582: rq_attach_root: old_rd refcount=1
       bash-2213 [003] d...   142.917582: rq_attach_root: old rd ffff88011a01d000 (ffff88011a01d040) total=104857
       bash-2213 [003] d...   142.917582: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
       bash-2213 [003] d...   142.917588: <stack trace> => rq_attach_root => cpu_attach_domain => partition_sched_domains => rebuild_sched_domains_locked => update_flag => cpuset_write_u64 => cgroup_file_write => kernfs_fop_write => __vfs_write => vfs_write => SyS_write => entry_SYSCALL_64_fastpath
  kworker/0:2-268 [000] d...   153.335522: task_dead_dl: [0:ffffffff81fcda00] total=0 tsk=104857
  kworker/0:2-268 [000] d...   153.335523: task_dead_dl: new total=-104857
  kworker/0:2-268 [000] d...   153.335528: <stack trace> => task_dead_dl => finish_task_switch => __schedule => schedule => worker_thread => kthread => ret_from_fork

Seems to be some disconnect between cgroups and sched deadline root domains.
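To spell out that disconnect, here's a minimal userspace model of the bookkeeping the trace shows; the struct and names are made up for illustration, this is not kernel code:

#include <stdio.h>
#include <stdint.h>

/* Stand-in for a root domain's struct dl_bw (illustration only). */
struct fake_root_domain {
	const char *name;
	int64_t total_bw;
};

int main(void)
{
	struct fake_root_domain old_rd = { "old rd",     0 };
	struct fake_root_domain def_rd = { "default rd", 0 };
	struct fake_root_domain *task_rd = &old_rd;
	int64_t task_bw = 104857;		/* (2/20) << 20 */

	/* sched_setattr(): the task's bandwidth is reserved in the root
	 * domain it currently belongs to (__dl_add in the trace). */
	task_rd->total_bw += task_bw;

	/* rebuild_sched_domains_locked() -> rq_attach_root(): the CPUs end
	 * up on a different root domain with total_bw=0, and nothing moves
	 * the existing reservation across. */
	task_rd = &def_rd;

	/* task_dead_dl(): the task releases its bandwidth from whatever
	 * root domain it sits on *now*. */
	task_rd->total_bw -= task_bw;

	printf("%s total_bw = %lld\n", old_rd.name, (long long)old_rd.total_bw);
	printf("%s total_bw = %lld\n", def_rd.name, (long long)def_rd.total_bw);
	return 0;
}

It prints 104857 for the old root domain (the reservation that leaked) and -104857 for the default root domain, which is exactly the pair of numbers the trace and /proc/sched_debug show above.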
-- Steve

---
 kernel/sched/core.c     |    9 +++++++++
 kernel/sched/deadline.c |    5 +++++
 kernel/sched/sched.h    |    6 ++++++
 3 files changed, 20 insertions(+)

Index: linux-trace.git/kernel/sched/core.c
===================================================================
--- linux-trace.git.orig/kernel/sched/core.c	2016-02-03 10:54:55.158659968 -0500
+++ linux-trace.git/kernel/sched/core.c	2016-02-03 13:52:56.213696814 -0500
@@ -2301,6 +2301,7 @@ static int dl_overflow(struct task_struc
 	if (new_bw == p->dl.dl_bw)
 		return 0;
 
+	trace_printk("dl_b=%p new=%lld old=%lld\n", dl_b, new_bw, p->dl.dl_bw);
 	/*
 	 * Either if a task, enters, leave, or stays -deadline but changes
 	 * its parameters, we may need to update accordingly the total
@@ -5068,6 +5069,7 @@ int task_can_attach(struct task_struct *
 		if (overflow)
 			ret = -EBUSY;
 		else {
+			struct dl_bw *src_dl_b;
 			/*
 			 * We reserve space for this task in the destination
 			 * root_domain, as we can't fail after this point.
@@ -5075,6 +5077,8 @@ int task_can_attach(struct task_struct *
 			 * later on (see set_cpus_allowed_dl()).
 			 */
 			__dl_add(dl_b, p->dl.dl_bw);
+			src_dl_b = dl_bw_of(task_cpu(p));
+			trace_printk("source %p total=%lld\n", src_dl_b, src_dl_b->total_bw);
 		}
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 		rcu_read_unlock_sched();
@@ -5636,6 +5640,7 @@ static void rq_attach_root(struct rq *rq
 
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
+		trace_printk("old_rd refcount=%d\n", atomic_read(&old_rd->refcount));
 		/*
 		 * If we dont want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
@@ -5646,7 +5651,11 @@ static void rq_attach_root(struct rq *rq
 	}
 
 	atomic_inc(&rd->refcount);
+	if (old_rd)
+		trace_printk("old rd %p (%p) total=%lld\n", rq->rd, &rq->rd->dl_bw, rq->rd->dl_bw.total_bw);
+	trace_printk("add new rd %p (%p) total=%lld\n", rd, &rd->dl_bw, rd->dl_bw.total_bw);
 	rq->rd = rd;
+	trace_dump_stack(0);
 
 	cpumask_set_cpu(rq->cpu, rd->span);
 	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))

Index: linux-trace.git/kernel/sched/deadline.c
===================================================================
--- linux-trace.git.orig/kernel/sched/deadline.c	2016-02-03 10:21:27.140280992 -0500
+++ linux-trace.git/kernel/sched/deadline.c	2016-02-03 13:44:41.321432980 -0500
@@ -66,6 +66,8 @@ void init_dl_bw(struct dl_bw *dl_b)
 	else
 		dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
 	raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+	trace_printk("clear total_bw %p (was:%lld)\n", dl_b, dl_b->total_bw);
+	trace_dump_stack(0);
 	dl_b->total_bw = 0;
 }
 
@@ -1224,7 +1226,10 @@ static void task_dead_dl(struct task_str
 	 */
 	raw_spin_lock_irq(&dl_b->lock);
 	/* XXX we should retain the bw until 0-lag */
+	trace_printk("[%d:%p] total=%lld tsk=%lld\n", task_cpu(p), dl_b, dl_b->total_bw, p->dl.dl_bw);
 	dl_b->total_bw -= p->dl.dl_bw;
+	trace_printk("new total=%lld\n", dl_b->total_bw);
+	trace_dump_stack(0);
 	raw_spin_unlock_irq(&dl_b->lock);
 }
 
Index: linux-trace.git/kernel/sched/sched.h
===================================================================
--- linux-trace.git.orig/kernel/sched/sched.h	2016-02-03 10:54:55.182659590 -0500
+++ linux-trace.git/kernel/sched/sched.h	2016-02-03 13:44:41.321432980 -0500
@@ -192,13 +192,19 @@ struct dl_bw {
 static inline
 void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
 {
+	trace_printk("(%p) total=%lld sub tsk=%lld\n", dl_b, dl_b->total_bw, tsk_bw);
 	dl_b->total_bw -= tsk_bw;
+	trace_printk("new total=%lld\n", dl_b->total_bw);
+	trace_dump_stack(0);
 }
 
 static inline
 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
 {
+	trace_printk("(%p) total=%lld add tsk=%lld\n", dl_b, dl_b->total_bw, tsk_bw);
 	dl_b->total_bw += tsk_bw;
+	trace_printk("new total=%lld\n", dl_b->total_bw);
+	trace_dump_stack(0);
 }
 
 static inline