Here's the test code:

#define _GNU_SOURCE

#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <assert.h>
#include <signal.h>  // for kill() and SIGKILL
#include <sys/mman.h>
#include <sys/syscall.h>
#include <inttypes.h>
#include <linux/futex.h>

// Whether to use a pthread mutex+condvar or do the raw futex operations.
// Either one will break.
// There's less user-space code involved with the non-pthread version, but the
// syscalls I think are relevant end up the same (except they're FUTEX_PRIVATE
// with the pthread version).
#define PTHREAD_MUTEX 0

// The number of processes that repeatedly call RunTest to fork off.
// Making this big (like 2500) makes it reproduce faster, but I have seen it go
// with as little as 4.
// With big values, `ulimit -u unlimited` (or something big at least) is
// necessary before running it (or do it as root).
#define NUMBER_TESTERS 2500

#if PTHREAD_MUTEX
#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t condition;
} Shared;
#else
typedef volatile uint32_t my_futex __attribute__((aligned(sizeof(int))));

void condition_wait(my_futex *c, my_futex *m) {
  syscall(SYS_futex, c, FUTEX_WAIT_REQUEUE_PI, *c, NULL, m);
}

void condition_signal(my_futex *c, my_futex *m) {
  syscall(SYS_futex, c, FUTEX_CMP_REQUEUE_PI, 1, 0, m, *c);
}

typedef struct {
  my_futex mutex;
  my_futex condition;
} Shared;
#endif

void RunTest(Shared *shared) {
#if PTHREAD_MUTEX
  pthread_mutexattr_t mutexattr;
  assert(pthread_mutexattr_init(&mutexattr) == 0);
  assert(pthread_mutexattr_setprotocol(&mutexattr, PTHREAD_PRIO_INHERIT) == 0);
  assert(pthread_mutex_init(&shared->mutex, &mutexattr) == 0);
  assert(pthread_mutexattr_destroy(&mutexattr) == 0);
  assert(pthread_cond_init(&shared->condition, NULL) == 0);
#else
  shared->mutex = shared->condition = 0;
#endif

  pid_t child = fork();
  if (child == 0) {
#if PTHREAD_MUTEX
    pthread_mutex_lock(&shared->mutex);
    pthread_cond_wait(&shared->condition, &shared->mutex);
#else
    condition_wait(&shared->condition, &shared->mutex);
#endif
    _exit(0);
  } else {
    assert(child != -1);
    // This sleep makes it reproduce way faster, but it will still break
    // eventually without it.
    usleep(20000);
#if PTHREAD_MUTEX
    assert(pthread_cond_broadcast(&shared->condition) == 0);
#else
    condition_signal(&shared->condition, &shared->mutex);
#endif
    assert(kill(child, SIGKILL) == 0);
    assert(waitpid(child, NULL, 0) == child);
  }
}

int main() {
  Shared *shared_memory = (Shared *)(mmap(NULL, sizeof(Shared) * NUMBER_TESTERS,
                                          PROT_READ | PROT_WRITE,
                                          MAP_SHARED | MAP_ANONYMOUS, -1, 0));
  int i;
  for (i = 0; i < NUMBER_TESTERS; ++i) {
    if (fork() == 0) {
      while (1) {
        RunTest(&shared_memory[i]);
      }
    }
  }
}
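To build and run it, something like this should work (the file and output names
here are arbitrary, and -pthread only matters if PTHREAD_MUTEX is set to 1):

  gcc -pthread -o futex_test futex_test.c
  ulimit -u unlimited    # only needed when NUMBER_TESTERS is big (or run as root)
  ./futex_test

I'd only run it inside a throwaway VM, given the corruption described below.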

On Thu, Oct 23, 2014 at 3:22 PM, Brian Silverman <bsilver16...@gmail.com> wrote:
> pi_state_free and exit_pi_state_list both clean up futex_pi_state's.
> exit_pi_state_list takes the hb lock first, and most callers of
> pi_state_free do too. requeue_pi didn't, which causes lots of problems.
> Move the pi_state_free calls in requeue_pi to before it drops the hb
> locks which it's already holding.
>
> I have test code that forks a bunch of processes, which all fork more
> processes that do futex(FUTEX_WAIT_REQUEUE_PI), then do
> futex(FUTEX_CMP_REQUEUE_PI) before killing the waiting processes. That
> test consistently breaks QEMU VMs without this patch.
>
> Although they both make it faster to crash, CONFIG_PREEMPT_RT and lots
> of CPUs are not necessary to reproduce this problem. The symptoms range
> from random reboots to corrupting the test program to corrupting whole
> filesystems to strange BIOS errors.
> Crashdumps (when they get created at all) are usually a mess, to the
> point of crashing tools used to examine them.
>
> Signed-off-by: Brian Silverman <bsilver16...@gmail.com>
> ---
> I am not subscribed to the list so please CC me on any responses.
>
>  kernel/futex.c | 32 +++++++++++++++++++++-----------
>  1 file changed, 21 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 815d7af..dc69775 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -639,6 +639,9 @@ static struct futex_pi_state * alloc_pi_state(void)
>          return pi_state;
>  }
>
> +/**
> + * Must be called with the hb lock held.
> + */
>  static void free_pi_state(struct futex_pi_state *pi_state)
>  {
>          if (!atomic_dec_and_test(&pi_state->refcount))
> @@ -1519,15 +1522,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
>          }
>
>  retry:
> -        if (pi_state != NULL) {
> -                /*
> -                 * We will have to lookup the pi_state again, so free this one
> -                 * to keep the accounting correct.
> -                 */
> -                free_pi_state(pi_state);
> -                pi_state = NULL;
> -        }
> -
>          ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
>          if (unlikely(ret != 0))
>                  goto out;
> @@ -1558,6 +1552,14 @@ retry_private:
>                  ret = get_futex_value_locked(&curval, uaddr1);
>
>                  if (unlikely(ret)) {
> +                        if (flags & FLAGS_SHARED && pi_state != NULL) {
> +                                /*
> +                                 * We will have to lookup the pi_state again, so
> +                                 * free this one to keep the accounting correct.
> +                                 */
> +                                free_pi_state(pi_state);
> +                                pi_state = NULL;
> +                        }
>                          double_unlock_hb(hb1, hb2);
>                          hb_waiters_dec(hb2);
>
> @@ -1617,6 +1619,10 @@ retry_private:
>                  case 0:
>                          break;
>                  case -EFAULT:
> +                        if (pi_state != NULL) {
> +                                free_pi_state(pi_state);
> +                                pi_state = NULL;
> +                        }
>                          double_unlock_hb(hb1, hb2);
>                          hb_waiters_dec(hb2);
>                          put_futex_key(&key2);
> @@ -1632,6 +1638,10 @@ retry_private:
>                           *   exit to complete.
>                           * - The user space value changed.
>                           */
> +                        if (pi_state != NULL) {
> +                                free_pi_state(pi_state);
> +                                pi_state = NULL;
> +                        }
>                          double_unlock_hb(hb1, hb2);
>                          hb_waiters_dec(hb2);
>                          put_futex_key(&key2);
> @@ -1708,6 +1718,8 @@ retry_private:
>          }
>
>  out_unlock:
> +        if (pi_state != NULL)
> +                free_pi_state(pi_state);
>          double_unlock_hb(hb1, hb2);
>          hb_waiters_dec(hb2);
>
> @@ -1725,8 +1737,6 @@ out_put_keys:
>  out_put_key1:
>          put_futex_key(&key1);
>  out:
> -        if (pi_state != NULL)
> -                free_pi_state(pi_state);
>          return ret ? ret : task_count;
>  }
>
> --
> 1.7.10.4