On Tue, 2015-10-06 at 20:03 +0200, Markus Pargmann wrote:
> The timeout handling introduced in
>       > 7e2893a16d3e (nbd: Fix timeout detection)
> introduces a race condition which may lead to killing of tasks that are
> not in nbd context anymore. This was not observed or reproducable yet.
> 
> This patch adds locking to critical use of task_recv and task_send to
> avoid killing tasks that already left the NBD thread functions. This
> lock is only acquired if a timeout occures or the nbd device
> starts/stops.
> 
> Reported-by: Ben Hutchings <b...@decadent.org.uk>
> Signed-off-by: Markus Pargmann <m...@pengutronix.de>

Reviewed-by: Ben Hutchings <b...@decadent.org.uk>

You could add 'Fixes: 7e2893a16d3e ("nbd: Fix timeout detection")' to
the commit message as well.

nbd_dbg_tasks_show() can still race with thread exit and two tasks can 
race to become the receive thread, but those aren't new bugs.

Ben.

> ---
>  drivers/block/nbd.c | 36 ++++++++++++++++++++++++++++++------
>  1 file changed, 30 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> index 039c7c4f0539..1a70852ac808 100644
> --- a/drivers/block/nbd.c
> +++ b/drivers/block/nbd.c
> @@ -60,6 +60,7 @@ struct nbd_device {
>  >    > bool disconnect; /* a disconnect has been requested by user */
>  
>  >    > struct timer_list timeout_timer;
> +>    > spinlock_t tasks_lock;
>  >    > struct task_struct *task_recv;
>  >    > struct task_struct *task_send;
>  
> @@ -140,21 +141,23 @@ static void sock_shutdown(struct nbd_device *nbd)
>  static void nbd_xmit_timeout(unsigned long arg)
>  {
>  >    > struct nbd_device *nbd = (struct nbd_device *)arg;
> ->    > struct task_struct *task;
> +>    > unsigned long flags;
>  
>  >    > if (list_empty(&nbd->queue_head))
>  >    >       > return;
>  
>  >    > nbd->disconnect = true;
>  
> ->    > task = READ_ONCE(nbd->task_recv);
> ->    > if (task)
> ->    >       > force_sig(SIGKILL, task);
> +>    > spin_lock_irqsave(&nbd->tasks_lock, flags);
> +
> +>    > if (nbd->task_recv)
> +>    >       > force_sig(SIGKILL, nbd->task_recv);
>  
> ->    > task = READ_ONCE(nbd->task_send);
> ->    > if (task)
> +>    > if (nbd->task_send)
>  >    >       > force_sig(SIGKILL, nbd->task_send);
>  
> +>    > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
> +
>  >    > dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and 
> sender, shutting down connection\n");
>  }
>  
> @@ -403,17 +406,24 @@ static int nbd_thread_recv(struct nbd_device *nbd)
>  {
>  >    > struct request *req;
>  >    > int ret;
> +>    > unsigned long flags;
>  
>  >    > BUG_ON(nbd->magic != NBD_MAGIC);
>  
>  >    > sk_set_memalloc(nbd->sock->sk);
>  
> +>    > spin_lock_irqsave(&nbd->tasks_lock, flags);
>  >    > nbd->task_recv = current;
> +>    > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
>  
>  >    > ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
>  >    > if (ret) {
>  >    >       > dev_err(disk_to_dev(nbd->disk), "device_create_file 
> failed!\n");
> +
> +>    >       > spin_lock_irqsave(&nbd->tasks_lock, flags);
>  >    >       > nbd->task_recv = NULL;
> +>    >       > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
> +
>  >    >       > return ret;
>  >    > }
>  
> @@ -429,7 +439,9 @@ static int nbd_thread_recv(struct nbd_device *nbd)
>  
>  >    > device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
>  
> +>    > spin_lock_irqsave(&nbd->tasks_lock, flags);
>  >    > nbd->task_recv = NULL;
> +>    > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
>  
>  >    > if (signal_pending(current)) {
>  >    >       > siginfo_t info;
> @@ -534,8 +546,11 @@ static int nbd_thread_send(void *data)
>  {
>  >    > struct nbd_device *nbd = data;
>  >    > struct request *req;
> +>    > unsigned long flags;
>  
> +>    > spin_lock_irqsave(&nbd->tasks_lock, flags);
>  >    > nbd->task_send = current;
> +>    > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
>  
>  >    > set_user_nice(current, MIN_NICE);
>  >    > while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
> @@ -572,7 +587,15 @@ static int nbd_thread_send(void *data)
>  >    >       > nbd_handle_req(nbd, req);
>  >    > }
>  
> +>    > spin_lock_irqsave(&nbd->tasks_lock, flags);
>  >    > nbd->task_send = NULL;
> +>    > spin_unlock_irqrestore(&nbd->tasks_lock, flags);
> +
> +>    > /* Clear maybe pending signals */
> +>    > if (signal_pending(current)) {
> +>    >       > siginfo_t info;
> +>    >       > dequeue_signal_lock(current, ¤t->blocked, &info);
> +>    > }
>  
>  >    > return 0;
>  }
> @@ -1027,6 +1050,7 @@ static int __init nbd_init(void)
>  >    >       > nbd_dev[i].magic = NBD_MAGIC;
>  >    >       > INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
>  >    >       > spin_lock_init(&nbd_dev[i].queue_lock);
> +>    >       > spin_lock_init(&nbd_dev[i].tasks_lock);
>  >    >       > INIT_LIST_HEAD(&nbd_dev[i].queue_head);
>  >    >       > mutex_init(&nbd_dev[i].tx_lock);
>  >    >       > init_timer(&nbd_dev[i].timeout_timer);
-- 
Ben Hutchings
Once a job is fouled up, anything done to improve it makes it worse.

Attachment: signature.asc
Description: This is a digitally signed message part

Reply via email to