Re: [PATCH] sched: Fix race against ptrace_freeze_trace()
On 07/21, Peter Zijlstra wrote:
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4193,9 +4193,6 @@ static void __sched notrace __schedule(b
>          local_irq_disable();
>          rcu_note_context_switch(preempt);
>
> -        /* See deactivate_task() below. */
> -        prev_state = prev->state;
> -
>          /*
>           * Make sure that signal_pending_state()->signal_pending() below
>           * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
> @@ -4219,11 +4216,16 @@ static void __sched notrace __schedule(b
>          update_rq_clock(rq);
>
>          switch_count = &prev->nivcsw;
> +
>          /*
> -         * We must re-load prev->state in case ttwu_remote() changed it
> -         * before we acquired rq->lock.
> +         * We must load prev->state once (task_struct::state is volatile), such
> +         * that:
> +         *
> +         *  - we form a control dependency vs deactivate_task() below.
> +         *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
>           */
> -        if (!preempt && prev_state && prev_state == prev->state) {
> +        prev_state = prev->state;
> +        if (!preempt && prev_state) {

Thanks! FWIW,

Acked-by: Oleg Nesterov
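For anyone following the thread without the tree handy, the failure mode the old compare runs into can be condensed into a few lines of userspace C. The state values below match v5.8's include/linux/sched.h, but everything else is invented for illustration; this is a sketch of the scenario, not kernel code:

#include <stdio.h>

#define TASK_WAKEKILL   0x0100
#define __TASK_TRACED   0x0008
#define TASK_TRACED     (TASK_WAKEKILL | __TASK_TRACED)

/* Stands in for prev->state; volatile mimics task_struct::state. */
static volatile unsigned int state = TASK_TRACED;

int main(void)
{
        unsigned int prev_state = state;        /* load before rq->lock */

        /* ptrace_freeze_traced() on another CPU, under siglock only: */
        state = __TASK_TRACED;

        /*
         * The old check: the reload no longer matches prev_state, so
         * the task is NOT deactivated, yet __TASK_TRACED still means
         * it must block. Hence "the reload not matching no longer
         * implies we don't have to block".
         */
        if (prev_state && prev_state == state)
                printf("deactivate_task(): blocks, correctly\n");
        else
                printf("left on the runqueue, although it should block\n");

        return 0;
}

Here prev_state (TASK_TRACED) no longer equals the reloaded ->state (__TASK_TRACED), so the old check skips deactivate_task() even though the task still has to block, which is exactly the race the patch closes.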
Re: [PATCH] sched: Fix race against ptrace_freeze_trace()
On Tue, Jul 21, 2020 at 02:13:08PM +0200, pet...@infradead.org wrote:
>
> There is apparently one site that violates the rule that only current
> and ttwu() will modify task->state, namely ptrace_{,un}freeze_traced()
> will change task->state for a remote task.
>
> Oleg explains:
>
>   "TASK_TRACED/TASK_STOPPED was always protected by siglock. In
>    particular, ttwu(__TASK_TRACED) must be always called with siglock
>    held. That is why ptrace_freeze_traced() assumes it can safely do
>    s/TASK_TRACED/__TASK_TRACED/ under spin_lock(siglock)."
>
> This breaks the ordering scheme introduced by commit:
>
>   dbfb089d360b ("sched: Fix loadavg accounting race")
>
> Specifically, the reload not matching no longer implies we don't have
> to block.
>
> Simplify things by noting that what we need is a LOAD->STORE ordering
> and this can be provided by a control dependency.
>
> So replace:
>
>         prev_state = prev->state;
>         raw_spin_lock(&rq->lock);
>         smp_mb__after_spinlock(); /* SMP-MB */
>         if (... && prev_state && prev_state == prev->state)
>                 deactivate_task();
>
> with:
>
>         prev_state = prev->state;
>         if (... && prev_state) /* CTRL-DEP */
>                 deactivate_task();
>
> Since that already implies the 'prev->state' load must be complete
> before allowing the 'prev->on_rq = 0' store to become visible.
>
> Fixes: dbfb089d360b ("sched: Fix loadavg accounting race")
> Reported-by: Jiri Slaby
> Signed-off-by: Peter Zijlstra (Intel)
> Tested-by: Paul Gortmaker
> ---

Thank you. I applied this on top of v5.8-rc6 and re-ran the strace-test
suite successfully. So at least

Tested-by: Christian Brauner

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4193,9 +4193,6 @@ static void __sched notrace __schedule(b
>          local_irq_disable();
>          rcu_note_context_switch(preempt);
>
> -        /* See deactivate_task() below. */
> -        prev_state = prev->state;
> -
>          /*
>           * Make sure that signal_pending_state()->signal_pending() below
>           * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
> @@ -4219,11 +4216,16 @@ static void __sched notrace __schedule(b
>          update_rq_clock(rq);
>
>          switch_count = &prev->nivcsw;
> +
>          /*
> -         * We must re-load prev->state in case ttwu_remote() changed it
> -         * before we acquired rq->lock.
> +         * We must load prev->state once (task_struct::state is volatile), such
> +         * that:
> +         *
> +         *  - we form a control dependency vs deactivate_task() below.
> +         *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
>           */
> -        if (!preempt && prev_state && prev_state == prev->state) {
> +        prev_state = prev->state;
> +        if (!preempt && prev_state) {
>                  if (signal_pending_state(prev_state, prev)) {
>                          prev->state = TASK_RUNNING;
>                  } else {
> @@ -4237,10 +4239,12 @@ static void __sched notrace __schedule(b
>
>                          /*
>                           * __schedule()                     ttwu()
> -                         *   prev_state = prev->state;        if (READ_ONCE(p->on_rq) && ...)
> -                         *   LOCK rq->lock                      goto out;
> -                         *   smp_mb__after_spinlock();        smp_acquire__after_ctrl_dep();
> -                         *   p->on_rq = 0;                    p->state = TASK_WAKING;
> +                         *   if (prev_state)                  if (p->on_rq && ...)
> +                         *     p->on_rq = 0;                    goto out;
> +                         *                                    smp_acquire__after_ctrl_dep();
> +                         *                                    p->state = TASK_WAKING
> +                         *
> +                         * Where __schedule() and ttwu() have matching control dependencies.
>                           *
>                           * After this, schedule() must not care about p->state any more.
>                           */
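To make the quoted diagram concrete, the two sides can be modelled as a small C11 litmus-style sketch. One hedge up front: the kernel relies on hardware control dependencies (plus its READ_ONCE()/WRITE_ONCE() rules) for the LOAD->STORE ordering, while the C11 standard by itself does not promise that a branch orders a later relaxed store, so the program below illustrates the shape of the argument under that assumption rather than proving it. The acquire fence stands in for smp_acquire__after_ctrl_dep(); all names and values are illustrative:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

/* Stand-ins for task_struct::state and task_struct::on_rq. */
static _Atomic unsigned int state = 1;  /* non-zero: a blocking state */
static _Atomic int on_rq = 1;

/* __schedule() side: the ->state LOAD must complete before the
 * ->on_rq STORE becomes visible; the branch provides that ordering. */
static void *schedule_side(void *unused)
{
        unsigned int prev_state;

        (void)unused;
        prev_state = atomic_load_explicit(&state, memory_order_relaxed);
        if (prev_state)         /* CTRL-DEP */
                atomic_store_explicit(&on_rq, 0, memory_order_relaxed);
        return NULL;
}

/* ttwu() side: if the ->on_rq LOAD sees 1, a remote wakeup suffices;
 * only after seeing 0 may it write ->state. */
static void *ttwu_side(void *unused)
{
        (void)unused;
        if (atomic_load_explicit(&on_rq, memory_order_relaxed))
                return NULL;                    /* "goto out" in the diagram */

        /* smp_acquire__after_ctrl_dep(), approximately: */
        atomic_thread_fence(memory_order_acquire);
        atomic_store_explicit(&state, 0, memory_order_relaxed); /* TASK_WAKING, say */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, schedule_side, NULL);
        pthread_create(&b, NULL, ttwu_side, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        printf("state=%u on_rq=%d\n", state, on_rq);
        return 0;
}

The point of the matching control dependencies is that whichever side observes the other's store is thereby ordered after its own load, so ttwu() can only write ->state once __schedule()'s ->state load has completed.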
[PATCH] sched: Fix race against ptrace_freeze_trace()
There is apparently one site that violates the rule that only current
and ttwu() will modify task->state, namely ptrace_{,un}freeze_traced()
will change task->state for a remote task.

Oleg explains:

  "TASK_TRACED/TASK_STOPPED was always protected by siglock. In
   particular, ttwu(__TASK_TRACED) must be always called with siglock
   held. That is why ptrace_freeze_traced() assumes it can safely do
   s/TASK_TRACED/__TASK_TRACED/ under spin_lock(siglock)."

This breaks the ordering scheme introduced by commit:

  dbfb089d360b ("sched: Fix loadavg accounting race")

Specifically, the reload not matching no longer implies we don't have
to block.

Simplify things by noting that what we need is a LOAD->STORE ordering
and this can be provided by a control dependency.

So replace:

        prev_state = prev->state;
        raw_spin_lock(&rq->lock);
        smp_mb__after_spinlock(); /* SMP-MB */
        if (... && prev_state && prev_state == prev->state)
                deactivate_task();

with:

        prev_state = prev->state;
        if (... && prev_state) /* CTRL-DEP */
                deactivate_task();

Since that already implies the 'prev->state' load must be complete
before allowing the 'prev->on_rq = 0' store to become visible.

Fixes: dbfb089d360b ("sched: Fix loadavg accounting race")
Reported-by: Jiri Slaby
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Paul Gortmaker
---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4193,9 +4193,6 @@ static void __sched notrace __schedule(b
         local_irq_disable();
         rcu_note_context_switch(preempt);
 
-        /* See deactivate_task() below. */
-        prev_state = prev->state;
-
         /*
          * Make sure that signal_pending_state()->signal_pending() below
          * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
@@ -4219,11 +4216,16 @@ static void __sched notrace __schedule(b
         update_rq_clock(rq);
 
         switch_count = &prev->nivcsw;
+
         /*
-         * We must re-load prev->state in case ttwu_remote() changed it
-         * before we acquired rq->lock.
+         * We must load prev->state once (task_struct::state is volatile), such
+         * that:
+         *
+         *  - we form a control dependency vs deactivate_task() below.
+         *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
          */
-        if (!preempt && prev_state && prev_state == prev->state) {
+        prev_state = prev->state;
+        if (!preempt && prev_state) {
                 if (signal_pending_state(prev_state, prev)) {
                         prev->state = TASK_RUNNING;
                 } else {
@@ -4237,10 +4239,12 @@ static void __sched notrace __schedule(b
 
                         /*
                          * __schedule()                     ttwu()
-                         *   prev_state = prev->state;        if (READ_ONCE(p->on_rq) && ...)
-                         *   LOCK rq->lock                      goto out;
-                         *   smp_mb__after_spinlock();        smp_acquire__after_ctrl_dep();
-                         *   p->on_rq = 0;                    p->state = TASK_WAKING;
+                         *   if (prev_state)                  if (p->on_rq && ...)
+                         *     p->on_rq = 0;                    goto out;
+                         *                                    smp_acquire__after_ctrl_dep();
+                         *                                    p->state = TASK_WAKING
+                         *
+                         * Where __schedule() and ttwu() have matching control dependencies.
                          *
                          * After this, schedule() must not care about p->state any more.
                          */
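As a footnote to Oleg's explanation, the remote write itself can be sketched in userspace as well. This is a loose paraphrase of what v5.8's ptrace_freeze_traced() does, with a pthread mutex standing in for siglock; treat the details (the task_is_traced() test in particular) as assumptions and consult kernel/ptrace.c for the real code:

#include <pthread.h>
#include <stdio.h>

#define TASK_WAKEKILL   0x0100
#define __TASK_TRACED   0x0008
#define TASK_TRACED     (TASK_WAKEKILL | __TASK_TRACED)

struct task {
        pthread_mutex_t siglock;        /* stands in for sighand->siglock */
        volatile unsigned int state;    /* stands in for task_struct::state */
};

/*
 * The shape of the remote update: only siglock is taken, never rq->lock,
 * which is why __schedule() on another CPU can watch ->state change
 * between two reads.
 */
static void freeze_traced(struct task *t)
{
        pthread_mutex_lock(&t->siglock);
        if (t->state == TASK_TRACED)            /* task_is_traced(), roughly */
                t->state = __TASK_TRACED;       /* the remote ->state write */
        pthread_mutex_unlock(&t->siglock);
}

int main(void)
{
        struct task t = { PTHREAD_MUTEX_INITIALIZER, TASK_TRACED };

        freeze_traced(&t);
        printf("state now %#x (__TASK_TRACED)\n", t.state);
        return 0;
}

Because the writer holds only siglock, nothing orders it against the scheduler's rq->lock, which is why the patch drops the reload-and-compare in favour of a single load plus a control dependency.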