On Mon, Jul 20, 2020 at 01:26:23PM +0200, pet...@infradead.org wrote:

>  kernel/sched/core.c | 34 ++++++++++++++++++++++++++++------
>  1 file changed, 28 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e15543cb84812..b5973d7fa521c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4100,9 +4100,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>   */
>  static void __sched notrace __schedule(bool preempt)
>  {
> +	unsigned long prev_state, tmp_state;
>  	struct task_struct *prev, *next;
>  	unsigned long *switch_count;
> -	unsigned long prev_state;
>  	struct rq_flags rf;
>  	struct rq *rq;
>  	int cpu;
> @@ -4140,16 +4140,38 @@ static void __sched notrace __schedule(bool preempt)
>  	rq_lock(rq, &rf);
>  	smp_mb__after_spinlock();
>
> +	/*
> +	 * We must re-load prev->state in case ttwu_remote() changed it
> +	 * before we acquired rq->lock.
> +	 */
> +	tmp_state = prev->state;
> +	if (unlikely(prev_state != tmp_state)) {
> +		/*
> +		 * ptrace_{,un}freeze_traced() think it is cool to change
> +		 * ->state around behind our backs between TASK_TRACED and
> +		 * __TASK_TRACED.
> +		 *
> +		 * This is safe because this, as well as any __TASK_TRACED
> +		 * wakeups are under siglock.
> +		 *
> +		 * For any other case, a changed prev_state must be to
> +		 * TASK_RUNNING, such that when it blocks, the load has
> +		 * happened before the smp_mb().
> +		 *
> +		 * Also see the comment with deactivate_task().
> +		 */
> +		SCHED_WARN_ON(tmp_state && (prev_state & __TASK_TRACED &&
> +					    !(tmp_state & __TASK_TRACED)));
> +
> +		prev_state = tmp_state;
While trying to write a changelog for this thing, I can't convince
myself we don't need:

	smp_mb();

here. Consider:

	CPU0				CPU1				CPU2

					schedule()
					  prev_state = prev->state;
					  spin_lock(rq->lock);
					  smp_mb__after_spinlock();

	ptrace_freeze_traced()
	  spin_lock(siglock);
	  task->state = __TASK_TRACED;
	  spin_unlock(siglock);

					  tmp_state = prev->state;
					  if (prev_state != tmp_state)
					    prev_state = tmp_state;

					  /* NO SMP_MB */

					  if (prev_state)
					    deactivate_task()
					      prev->on_rq = 0;

									spin_lock(siglock);
									ttwu()
									  if (p->on_rq && ...)
									    goto unlock;
									  smp_acquire__after_ctrl_dep();
									  p->state = TASK_WAKING;

That loses the ordering we previously relied upon. That is, CPU1's
prev->state load and its prev->on_rq store can get reordered vs CPU2.

OTOH, we have a control dependency on CPU1 as well, which should provide
LOAD->STORE ordering; after all, we only do the ->on_rq = 0 store IFF we
see prev_state set. So that is:

	CPU1				CPU2

	if (p->state)			if (!p->on_rq)
	  p->on_rq = 0;			  p->state = TASK_WAKING;

which matches a CTRL-DEP to a CTRL-DEP ...

But this then means we can simplify dbfb089d360 as well, but now my head
hurts.
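FWIW, the CTRL-DEP vs CTRL-DEP claim above reduces to the classic
LB+ctrl+ctrl shape, so it can be sanity-checked with herd7 against the
LKMM in tools/memory-model. The sketch below is my own reduction, not
the scheduler code itself: locks and the intermediate prev_state
re-check are elided, state/on_rq stand in for prev->state/prev->on_rq,
the value 2 models TASK_WAKING, and smp_rmb() models
smp_acquire__after_ctrl_dep() (its generic definition):

C LB+ctrl+ctrl

{
	state=1;
	on_rq=1;
}

P0(int *state, int *on_rq)	/* CPU1: __schedule() */
{
	int r0;

	r0 = READ_ONCE(*state);
	if (r0)				/* if (prev_state) */
		WRITE_ONCE(*on_rq, 0);	/* deactivate_task() */
}

P1(int *state, int *on_rq)	/* CPU2: ttwu() */
{
	int r1;

	r1 = READ_ONCE(*on_rq);
	if (r1 == 0) {
		smp_rmb();		/* smp_acquire__after_ctrl_dep() */
		WRITE_ONCE(*state, 2);	/* p->state = TASK_WAKING */
	}
}

exists (0:r0=2 /\ 1:r1=0)

If the CTRL-DEP argument holds, the exists clause (each CPU observing
the other's store) should come out forbidden under
herd7 -conf linux-kernel.cfg; if we really did need the smp_mb(), it
would show up as allowed.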