Here's the test code:

#define _GNU_SOURCE

#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <assert.h>
#include <signal.h>  // for kill() and SIGKILL
#include <sys/mman.h>
#include <sys/syscall.h>
#include <inttypes.h>
#include <linux/futex.h>

// Whether to use a pthread mutex+condvar or do the raw futex operations.
// Either one will break.
// There's less user-space code involved with the non-pthread version, but the
// syscalls I think are relevant end up the same (except they're FUTEX_PRIVATE
// with the pthread version).
#define PTHREAD_MUTEX 0

// The number of processes that repeatedly call RunTest to fork off.
// Making this big (like 2500) makes it reproduce faster, but I have seen it go
// with as little as 4.
// With big values, `ulimit -u unlimited` (or something big at least) is
// necessary before running it (or do it as root).
#define NUMBER_TESTERS 2500

#if PTHREAD_MUTEX
#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t condition;
} Shared;
#else
typedef volatile uint32_t my_futex __attribute__((aligned(sizeof(int))));

void condition_wait(my_futex *c, my_futex *m) {
  syscall(SYS_futex, c, FUTEX_WAIT_REQUEUE_PI, *c, NULL, m);
}

void condition_signal(my_futex *c, my_futex *m) {
  syscall(SYS_futex, c, FUTEX_CMP_REQUEUE_PI, 1, 0, m, *c);
}

typedef struct {
  my_futex mutex;
  my_futex condition;
} Shared;
#endif

void RunTest(Shared *shared) {
#if PTHREAD_MUTEX
  pthread_mutexattr_t mutexattr;
  assert(pthread_mutexattr_init(&mutexattr) == 0);
  assert(pthread_mutexattr_setprotocol(&mutexattr, PTHREAD_PRIO_INHERIT) == 0);
  assert(pthread_mutex_init(&shared->mutex, &mutexattr) == 0);
  assert(pthread_mutexattr_destroy(&mutexattr) == 0);
  assert(pthread_cond_init(&shared->condition, NULL) == 0);
#else
  shared->mutex = shared->condition = 0;
#endif

  pid_t child = fork();
  if (child == 0) {
#if PTHREAD_MUTEX
    pthread_mutex_lock(&shared->mutex);
    pthread_cond_wait(&shared->condition, &shared->mutex);
#else
    condition_wait(&shared->condition, &shared->mutex);
#endif
    _exit(0);
  } else {
    assert(child != -1);
    // This sleep makes it reproduce way faster, but it will still break
    // eventually without it.
    usleep(20000);
#if PTHREAD_MUTEX
    assert(pthread_cond_broadcast(&shared->condition) == 0);
#else
    condition_signal(&shared->condition, &shared->mutex);
#endif
    assert(kill(child, SIGKILL) == 0);
    assert(waitpid(child, NULL, 0) == child);
  }
}

int main() {
  Shared *shared_memory = (Shared *)(mmap(NULL, sizeof(Shared) * NUMBER_TESTERS,
                                          PROT_READ | PROT_WRITE,
                                          MAP_SHARED | MAP_ANONYMOUS, -1, 0));
  int i;
  for (i = 0; i < NUMBER_TESTERS; ++i) {
    if (fork() == 0) {
      while (1) {
        RunTest(&shared_memory[i]);
      }
    }
  }
}
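To build and run it, something like this should work (the file and output names
here are arbitrary, and -pthread only matters if PTHREAD_MUTEX is set to 1):

  gcc -pthread -o futex_test futex_test.c
  ulimit -u unlimited    # only needed when NUMBER_TESTERS is big (or run as root)
  ./futex_test

I'd only run it inside a throwaway VM, given the corruption described below.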

On Thu, Oct 23, 2014 at 3:22 PM, Brian Silverman <bsilver16...@gmail.com> wrote:
> pi_state_free and exit_pi_state_list both clean up futex_pi_state's.
> exit_pi_state_list takes the hb lock first, and most callers of
> pi_state_free do too. requeue_pi didn't, which causes lots of problems.
> Move the pi_state_free calls in requeue_pi to before it drops the hb
> locks which it's already holding.
>
> I have test code that forks a bunch of processes, which all fork more
> processes that do futex(FUTEX_WAIT_REQUEUE_PI), then do
> futex(FUTEX_CMP_REQUEUE_PI) before killing the waiting processes. That
> test consistently breaks QEMU VMs without this patch.
>
> Although they both make it faster to crash, CONFIG_PREEMPT_RT and lots
> of CPUs are not necessary to reproduce this problem. The symptoms range
> from random reboots to corrupting the test program to corrupting whole
> filesystems to strange BIOS errors.
> Crashdumps (when they get created at all) are usually a mess, to the
> point of crashing tools used to examine them.
>
> Signed-off-by: Brian Silverman <bsilver16...@gmail.com>
> ---
> I am not subscribed to the list so please CC me on any responses.
>
>  kernel/futex.c | 32 +++++++++++++++++++++-----------
>  1 file changed, 21 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 815d7af..dc69775 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -639,6 +639,9 @@ static struct futex_pi_state * alloc_pi_state(void)
>          return pi_state;
>  }
>
> +/**
> + * Must be called with the hb lock held.
> + */
>  static void free_pi_state(struct futex_pi_state *pi_state)
>  {
>          if (!atomic_dec_and_test(&pi_state->refcount))
> @@ -1519,15 +1522,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
>          }
>
>  retry:
> -        if (pi_state != NULL) {
> -                /*
> -                 * We will have to lookup the pi_state again, so free this one
> -                 * to keep the accounting correct.
> -                 */
> -                free_pi_state(pi_state);
> -                pi_state = NULL;
> -        }
> -
>          ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
>          if (unlikely(ret != 0))
>                  goto out;
> @@ -1558,6 +1552,14 @@ retry_private:
>                  ret = get_futex_value_locked(&curval, uaddr1);
>
>                  if (unlikely(ret)) {
> +                        if (flags & FLAGS_SHARED && pi_state != NULL) {
> +                                /*
> +                                 * We will have to lookup the pi_state again, so
> +                                 * free this one to keep the accounting correct.
> +                                 */
> +                                free_pi_state(pi_state);
> +                                pi_state = NULL;
> +                        }
>                          double_unlock_hb(hb1, hb2);
>                          hb_waiters_dec(hb2);
>
> @@ -1617,6 +1619,10 @@ retry_private:
>                  case 0:
>                          break;
>                  case -EFAULT:
> +                        if (pi_state != NULL) {
> +                                free_pi_state(pi_state);
> +                                pi_state = NULL;
> +                        }
>                          double_unlock_hb(hb1, hb2);
>                          hb_waiters_dec(hb2);
>                          put_futex_key(&key2);
> @@ -1632,6 +1638,10 @@ retry_private:
>                           *   exit to complete.
>                           * - The user space value changed.
>                           */
> +                        if (pi_state != NULL) {
> +                                free_pi_state(pi_state);
> +                                pi_state = NULL;
> +                        }
>                          double_unlock_hb(hb1, hb2);
>                          hb_waiters_dec(hb2);
>                          put_futex_key(&key2);
> @@ -1708,6 +1718,8 @@ retry_private:
>          }
>
>  out_unlock:
> +        if (pi_state != NULL)
> +                free_pi_state(pi_state);
>          double_unlock_hb(hb1, hb2);
>          hb_waiters_dec(hb2);
>
> @@ -1725,8 +1737,6 @@ out_put_keys:
>  out_put_key1:
>          put_futex_key(&key1);
>  out:
> -        if (pi_state != NULL)
> -                free_pi_state(pi_state);
>          return ret ? ret : task_count;
>  }
>
> --
> 1.7.10.4