I'm slowly starting to go through this...
On Fri, Jan 22, 2021 at 08:17:01PM -0500, Joel Fernandes (Google) wrote: > +static bool sched_core_empty(struct rq *rq) > +{ > + return RB_EMPTY_ROOT(&rq->core_tree); > +} > + > +static struct task_struct *sched_core_first(struct rq *rq) > +{ > + struct task_struct *task; > + > + task = container_of(rb_first(&rq->core_tree), struct task_struct, > core_node); > + return task; > +} AFAICT you can do with: static struct task_struct *sched_core_any(struct rq *rq) { return rb_entry(rq->core_tree.rb_node, struct task_struct, code_node); } > +static void sched_core_flush(int cpu) > +{ > + struct rq *rq = cpu_rq(cpu); > + struct task_struct *task; > + > + while (!sched_core_empty(rq)) { > + task = sched_core_first(rq); > + rb_erase(&task->core_node, &rq->core_tree); > + RB_CLEAR_NODE(&task->core_node); > + } > + rq->core->core_task_seq++; > +} However, > + for_each_possible_cpu(cpu) { > + struct rq *rq = cpu_rq(cpu); > + > + WARN_ON_ONCE(enabled == rq->core_enabled); > + > + if (!enabled || (enabled && cpumask_weight(cpu_smt_mask(cpu)) > >= 2)) { > + /* > + * All active and migrating tasks will have already > + * been removed from core queue when we clear the > + * cgroup tags. However, dying tasks could still be > + * left in core queue. Flush them here. > + */ > + if (!enabled) > + sched_core_flush(cpu); > + > + rq->core_enabled = enabled; > + } > + } I'm not sure I understand. Is the problem that we're still schedulable during do_exit() after cgroup_exit() ? It could be argued that when we leave the cgroup there, we should definitely leave the tag group too.