dann frazier wrote: > On Mon, Feb 02, 2009 at 07:04:48PM +0100, Lucas Nussbaum wrote: >> ruby1.9 still fails to build on hppa and alpha. >> >> On hppa, it's caused by a kernel bug, which was partially fixed (at >> least the kernel doesn't panic() anymore). Since the issue is related to >> threading, it is possible that retrying could make it build >> successfully. > > fyi, I've retried it numerous times on both buildds with no > luck. We're not crashing the buildd anymore - thanks to Helge's fix -
The kudos belong to James Bottomley btw. I did the debugging and testing, but James gave me the final hint to the solution then...
> but the build hangs indefinitely. I've no objection to it being
> retried again of course (and I'm not the buildd admin anyway) - I just
> want to set your expectations.
I have tried a few times now to find the bug. I'm not sure if it's really due to:
a) a kernel bug (probably),
b) the fact that hppa still uses Linuxthreads (although Dann mentioned in another mail that he saw similar problems with another server which used NPTL instead of Linuxthreads), or
c) wrong pthread coding in ruby1.9.
If it's due to a) (kernel bug), then it's hard to find and track down. I concentrated on b) and c) for now. Linuxthreads (LT) uses a few signals to synchronize the threads, and ruby plays some small but bad games with signals in its code, e.g. rb_disable_interrupt() and rb_enable_interrupt() in signal.c. With the attached patch/hack below I tried to work around possible LT-related corner cases in ruby1.9, but the issue remains the same: "make test" will make the ruby testsuite hang in the "test_thread.rb" test. It seems some thread is waiting for a signal which will not arrive, since the other thread is a zombie already...
Anyway, it would be nice if someone with ruby knowledge could reduce the testsuite, so that it will be easier to reproduce the bug. I'm a little lost at this stage. Now that the hppa kernel no longer crashes, creating such a testcase should be much easier.
Helge
--- ./signal.c.org 2009-02-05 11:16:23.000000000 +0100 +++ ./signal.c 2009-02-05 20:52:38.000000000 +0100 @@ -36,6 +36,46 @@ # endif #endif +/* ruby1.9 is a multithreaded program. + Nevertheless, ruby1.9 uses sigprocmask() which has unspecified + behaviour in a multi-threaded process (see man page!). + */ +static void ruby_generate_sigprocmask(int how, sigset_t *mask, sigset_t *oldset) +{ + /* make sure that ruby does not block the Linuxthreads + signals */ + if (how == SIG_BLOCK) { + sigdelset(mask, __SIGRTMIN); + sigdelset(mask, __SIGRTMIN+1); + sigdelset(mask, __SIGRTMIN+2); + } else if (how == SIG_SETMASK) { + sigaddset(mask, __SIGRTMIN); + sigaddset(mask, __SIGRTMIN+1); + sigaddset(mask, __SIGRTMIN+2); + } else { // SIG_UNBLOCK + sigaddset(mask, __SIGRTMIN); + sigaddset(mask, __SIGRTMIN+1); + sigaddset(mask, __SIGRTMIN+2); + } +} + +static int ruby_pthread_sigprocmask(int how, sigset_t *mask, sigset_t *oldset) +{ + ruby_generate_sigprocmask(how, mask, oldset); + return pthread_sigmask(how,mask,oldset); +} + +static int ruby_sigprocmask(int how, sigset_t *mask, sigset_t *oldset) +{ +#if 0 + return ruby_pthread_sigprocmask(how, mask, oldset); +#else + ruby_generate_sigprocmask(how, mask, oldset); + /* XXX: ruby should not use sigprocmask(). 
*/ + return sigprocmask(how,mask,oldset); +#endif +} + static const struct signals { const char *signm; int signo; @@ -430,7 +470,6 @@ static sighandler_t ruby_signal(int signum, sighandler_t handler) { struct sigaction sigact, old; - #if 0 rb_trap_accept_nativethreads[signum] = 0; #endif @@ -448,6 +487,10 @@ ruby_signal(int signum, sighandler_t han if (signum == SIGCHLD && handler == SIG_IGN) sigact.sa_flags |= SA_NOCLDWAIT; #endif + +// printf("signal: %d (%d), %p\n", signum, __SIGRTMIN, handler); + if (signum >= __SIGRTMIN && signum <= __SIGRTMIN+2) + return NULL; sigaction(signum, &sigact, &old); return old.sa_handler; } @@ -505,7 +548,7 @@ rb_disable_interrupt(void) sigfillset(&mask); sigdelset(&mask, SIGVTALRM); sigdelset(&mask, SIGSEGV); - pthread_sigmask(SIG_SETMASK, &mask, NULL); + ruby_pthread_sigprocmask(SIG_SETMASK, &mask, NULL); #endif } @@ -515,7 +558,7 @@ rb_enable_interrupt(void) #ifndef _WIN32 sigset_t mask; sigemptyset(&mask); - pthread_sigmask(SIG_SETMASK, &mask, NULL); + ruby_pthread_sigprocmask(SIG_SETMASK, &mask, NULL); #endif } @@ -852,7 +895,7 @@ trap_ensure(struct trap_arg *arg) { /* enable interrupt */ #ifdef HAVE_SIGPROCMASK - sigprocmask(SIG_SETMASK, &arg->mask, NULL); + ruby_sigprocmask(SIG_SETMASK, &arg->mask, NULL); #else sigsetmask(arg->mask); #endif @@ -866,7 +909,7 @@ rb_trap_restore_mask(void) { #if USE_TRAP_MASK # ifdef HAVE_SIGPROCMASK - sigprocmask(SIG_SETMASK, &trap_last_mask, NULL); + ruby_sigprocmask(SIG_SETMASK, &trap_last_mask, NULL); # else sigsetmask(trap_last_mask); # endif @@ -931,7 +974,7 @@ sig_trap(int argc, VALUE *argv) /* disable interrupt */ # ifdef HAVE_SIGPROCMASK sigfillset(&arg.mask); - sigprocmask(SIG_BLOCK, &arg.mask, &arg.mask); + ruby_sigprocmask(SIG_BLOCK, &arg.mask, &arg.mask); # else arg.mask = sigblock(~0); # endif @@ -991,7 +1034,7 @@ init_sigchld(int sig) /* disable interrupt */ # ifdef HAVE_SIGPROCMASK sigfillset(&mask); - sigprocmask(SIG_BLOCK, &mask, &mask); + ruby_sigprocmask(SIG_BLOCK, &mask, 
&mask); # else mask = sigblock(~0); # endif @@ -1007,7 +1050,7 @@ init_sigchld(int sig) #if USE_TRAP_MASK #ifdef HAVE_SIGPROCMASK sigdelset(&mask, sig); - sigprocmask(SIG_SETMASK, &mask, NULL); + ruby_sigprocmask(SIG_SETMASK, &mask, NULL); #else mask &= ~sigmask(sig); sigsetmask(mask);