Re: [PATCH] sched: Fix race against ptrace_freeze_trace()

2020-07-21 Thread Oleg Nesterov
On 07/21, Peter Zijlstra wrote:
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4193,9 +4193,6 @@ static void __sched notrace __schedule(b
>   local_irq_disable();
>   rcu_note_context_switch(preempt);
>
> - /* See deactivate_task() below. */
> - prev_state = prev->state;
> -
>   /*
>* Make sure that signal_pending_state()->signal_pending() below
>* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
> @@ -4219,11 +4216,16 @@ static void __sched notrace __schedule(b
>   update_rq_clock(rq);
>
>   switch_count = &prev->nivcsw;
> +
>   /*
> -  * We must re-load prev->state in case ttwu_remote() changed it
> -  * before we acquired rq->lock.
> +  * We must load prev->state once (task_struct::state is volatile), such
> +  * that:
> +  *
> +  *  - we form a control dependency vs deactivate_task() below.
> +  *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
>*/
> - if (!preempt && prev_state && prev_state == prev->state) {
> + prev_state = prev->state;
> + if (!preempt && prev_state) {

Thanks! FWIW,

Acked-by: Oleg Nesterov 



Re: [PATCH] sched: Fix race against ptrace_freeze_trace()

2020-07-21 Thread Christian Brauner
On Tue, Jul 21, 2020 at 02:13:08PM +0200, pet...@infradead.org wrote:
> 
> There is apparently one site that violates the rule that only current
> and ttwu() will modify task->state, namely ptrace_{,un}freeze_traced()
> will change task->state for a remote task.
> 
> Oleg explains:
> 
>   "TASK_TRACED/TASK_STOPPED was always protected by siglock. In
> particular, ttwu(__TASK_TRACED) must be always called with siglock
> held. That is why ptrace_freeze_traced() assumes it can safely do
> s/TASK_TRACED/__TASK_TRACED/ under spin_lock(siglock)."
> 
> This breaks the ordering scheme introduced by commit:
> 
>   dbfb089d360b ("sched: Fix loadavg accounting race")
> 
> Specifically, the reload not matching no longer implies we don't have
> to block.
> 
> Simplify things by noting that what we need is a LOAD->STORE ordering
> and this can be provided by a control dependency.
> 
> So replace:
> 
>   prev_state = prev->state;
>   raw_spin_lock(&rq->lock);
>   smp_mb__after_spinlock(); /* SMP-MB */
>   if (... && prev_state && prev_state == prev->state)
>   deactivate_task();
> 
> with:
> 
>   prev_state = prev->state;
>   if (... && prev_state) /* CTRL-DEP */
>   deactivate_task();
> 
> Since that already implies the 'prev->state' load must be complete
> before allowing the 'prev->on_rq = 0' store to become visible.
> 
> Fixes: dbfb089d360b ("sched: Fix loadavg accounting race")
> Reported-by: Jiri Slaby 
> Signed-off-by: Peter Zijlstra (Intel) 
> Tested-by: Paul Gortmaker 
> ---

Thank you. I applied this on top of v5.8-rc6 and re-ran the strace-test
suite successfully. So at least

Tested-by: Christian Brauner 

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4193,9 +4193,6 @@ static void __sched notrace __schedule(b
>   local_irq_disable();
>   rcu_note_context_switch(preempt);
>  
> - /* See deactivate_task() below. */
> - prev_state = prev->state;
> -
>   /*
>* Make sure that signal_pending_state()->signal_pending() below
>* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
> @@ -4219,11 +4216,16 @@ static void __sched notrace __schedule(b
>   update_rq_clock(rq);
>  
>   switch_count = &prev->nivcsw;
> +
>   /*
> -  * We must re-load prev->state in case ttwu_remote() changed it
> -  * before we acquired rq->lock.
> +  * We must load prev->state once (task_struct::state is volatile), such
> +  * that:
> +  *
> +  *  - we form a control dependency vs deactivate_task() below.
> +  *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
>*/
> - if (!preempt && prev_state && prev_state == prev->state) {
> + prev_state = prev->state;
> + if (!preempt && prev_state) {
>   if (signal_pending_state(prev_state, prev)) {
>   prev->state = TASK_RUNNING;
>   } else {
> @@ -4237,10 +4239,12 @@ static void __sched notrace __schedule(b
>  
>   /*
>* __schedule() ttwu()
> -  *   prev_state = prev->state;      if (READ_ONCE(p->on_rq) && ...)
> -  *   LOCK rq->lock                    goto out;
> -  *   smp_mb__after_spinlock();       smp_acquire__after_ctrl_dep();
> -  *   p->on_rq = 0;                   p->state = TASK_WAKING;
> +  *   if (prev_state)                 if (p->on_rq && ...)
> +  *     p->on_rq = 0;                   goto out;
> +  *                                     smp_acquire__after_ctrl_dep();
> +  *                                     p->state = TASK_WAKING
> +  *
> +  * Where __schedule() and ttwu() have matching control dependencies.
>*
>* After this, schedule() must not care about p->state any more.
>*/


[PATCH] sched: Fix race against ptrace_freeze_trace()

2020-07-21 Thread peterz


There is apparently one site that violates the rule that only current
and ttwu() will modify task->state, namely ptrace_{,un}freeze_traced()
will change task->state for a remote task.

Oleg explains:

  "TASK_TRACED/TASK_STOPPED was always protected by siglock. In
particular, ttwu(__TASK_TRACED) must be always called with siglock
held. That is why ptrace_freeze_traced() assumes it can safely do
s/TASK_TRACED/__TASK_TRACED/ under spin_lock(siglock)."

This breaks the ordering scheme introduced by commit:

  dbfb089d360b ("sched: Fix loadavg accounting race")

Specifically, the reload not matching no longer implies we don't have
to block.

Simplify things by noting that what we need is a LOAD->STORE ordering
and this can be provided by a control dependency.

So replace:

prev_state = prev->state;
raw_spin_lock(&rq->lock);
smp_mb__after_spinlock(); /* SMP-MB */
if (... && prev_state && prev_state == prev->state)
deactivate_task();

with:

prev_state = prev->state;
if (... && prev_state) /* CTRL-DEP */
deactivate_task();

Since that already implies the 'prev->state' load must be complete
before allowing the 'prev->on_rq = 0' store to become visible.
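
As a self-contained illustration of that LOAD->STORE argument, here is a
minimal userspace analogy in C11 atomics. The names state/on_rq only
stand in for the scheduler fields, and plain C11 does not formally
promise control-dependency ordering the way the kernel memory model
does, so treat this purely as a sketch of the shape of the argument:

    /* Build (illustrative): cc -std=c11 -pthread ctrl_dep.c */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long state = 1;   /* task is "sleeping" */
    static atomic_int  on_rq = 1;   /* task still on the runqueue */

    /* __schedule() side: one load of "state", and the dequeue store
     * happens only on a branch that depends on that load. */
    static void *schedule_side(void *arg)
    {
            long prev_state = atomic_load_explicit(&state, memory_order_relaxed);

            if (prev_state)                         /* CTRL-DEP */
                    atomic_store_explicit(&on_rq, 0, memory_order_relaxed);
            return NULL;
    }

    /* ttwu() side: bail out while the task is still queued; otherwise
     * order the subsequent "state" write after the on_rq observation. */
    static void *ttwu_side(void *arg)
    {
            if (atomic_load_explicit(&on_rq, memory_order_relaxed))
                    return NULL;                    /* goto out */

            atomic_thread_fence(memory_order_acquire); /* smp_acquire__after_ctrl_dep() */
            atomic_store_explicit(&state, 0, memory_order_relaxed); /* "TASK_WAKING" */
            return NULL;
    }

    int main(void)
    {
            pthread_t a, b;

            pthread_create(&a, NULL, schedule_side, NULL);
            pthread_create(&b, NULL, ttwu_side, NULL);
            pthread_join(a, NULL);
            pthread_join(b, NULL);
            printf("state=%ld on_rq=%d\n",
                   (long)atomic_load(&state), atomic_load(&on_rq));
            return 0;
    }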

Fixes: dbfb089d360b ("sched: Fix loadavg accounting race")
Reported-by: Jiri Slaby 
Signed-off-by: Peter Zijlstra (Intel) 
Tested-by: Paul Gortmaker 
---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4193,9 +4193,6 @@ static void __sched notrace __schedule(b
local_irq_disable();
rcu_note_context_switch(preempt);
 
-   /* See deactivate_task() below. */
-   prev_state = prev->state;
-
/*
 * Make sure that signal_pending_state()->signal_pending() below
 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
@@ -4219,11 +4216,16 @@ static void __sched notrace __schedule(b
update_rq_clock(rq);
 
switch_count = &prev->nivcsw;
+
/*
-* We must re-load prev->state in case ttwu_remote() changed it
-* before we acquired rq->lock.
+* We must load prev->state once (task_struct::state is volatile), such
+* that:
+*
+*  - we form a control dependency vs deactivate_task() below.
+*  - ptrace_{,un}freeze_traced() can change ->state underneath us.
 */
-   if (!preempt && prev_state && prev_state == prev->state) {
+   prev_state = prev->state;
+   if (!preempt && prev_state) {
if (signal_pending_state(prev_state, prev)) {
prev->state = TASK_RUNNING;
} else {
@@ -4237,10 +4239,12 @@ static void __sched notrace __schedule(b
 
/*
 * __schedule() ttwu()
-*   prev_state = prev->state;      if (READ_ONCE(p->on_rq) && ...)
-*   LOCK rq->lock                    goto out;
-*   smp_mb__after_spinlock();       smp_acquire__after_ctrl_dep();
-*   p->on_rq = 0;                   p->state = TASK_WAKING;
+*   if (prev_state)                 if (p->on_rq && ...)
+*     p->on_rq = 0;                   goto out;
+*                                     smp_acquire__after_ctrl_dep();
+*                                     p->state = TASK_WAKING
+*
+* Where __schedule() and ttwu() have matching control dependencies.
 *
 * After this, schedule() must not care about p->state any more.
 */