Hi, Open MPI's signal handler (show_stackframe function defined in opal/util/stacktrace.c) calls non-async-signal-safe functions and it causes a problem.
See attached mpisigabrt.c. Passing corrupted memory to realloc(3) will cause SIGABRT and show_stackframe function will be invoked. But invoked show_stackframe function deadlocks in backtrace_symbols(3) on some systems because backtrace_symbols(3) calls malloc(3) internally and a deadlock of realloc/malloc mutex occurs. Attached mpisigabrt.gstack.txt shows the stacktrace gotten by gdb in this deadlock situation on Ubuntu 12.04 LTS (precise) x86_64. Though I could not reproduce this behavior on RHEL 5/6, I can reproduce it also on K computer and its successor PRIMEHPC FX10. Passing non-heap memory to free(3) and double-free also cause this deadlock. malloc (and backtrace_symbols) is not marked as async-signal-safe in POSIX and current glibc, though it seems to have been marked in old glibc. So we should not call it in the signal handler now. http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04 http://cygwin.com/ml/libc-help/2013-06/msg00005.html I wrote a patch to address this issue. See the attached async-signal-safe-stacktrace.patch. This patch calls backtrace_symbols_fd(3) instead of backtrace_symbols(3). Though backtrace_symbols_fd is not declared as async-signal-safe, it is described not to call malloc internally in its man. So it should be rather safer. Output format of show_stackframe function is not changed by this patch. But the opal_backtrace_print function (backtrace framework) interface is changed for the output format compatibility. This requires changes in some additional files (ompi_mpi_abort.c etc.). This patch also removes unnecessary fflush(3) calls, which are meaningless for write(2) system call but might cause a similar problem. What do you think about this patch? Takahiro Kawashima, MPI development team, Fujitsu
Index: opal/mca/backtrace/backtrace.h =================================================================== --- opal/mca/backtrace/backtrace.h (revision 29841) +++ opal/mca/backtrace/backtrace.h (working copy) @@ -34,11 +34,12 @@ /* - * print back trace to FILE file + * Print back trace to FILE file with a prefix for each line. + * First strip lines are not printed. * * \note some attempts made to be signal safe. */ -OPAL_DECLSPEC void opal_backtrace_print(FILE *file); +OPAL_DECLSPEC int opal_backtrace_print(FILE *file, char *prefix, int strip); /* * Return back trace in buffer. buffer will be allocated by the Index: opal/mca/backtrace/execinfo/backtrace_execinfo.c =================================================================== --- opal/mca/backtrace/execinfo/backtrace_execinfo.c (revision 29841) +++ opal/mca/backtrace/execinfo/backtrace_execinfo.c (working copy) @@ -20,6 +20,10 @@ #include "opal_config.h" #include <stdio.h> +#include <string.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif #ifdef HAVE_EXECINFO_H #include <execinfo.h> #endif @@ -27,23 +31,31 @@ #include "opal/constants.h" #include "opal/mca/backtrace/backtrace.h" -void -opal_backtrace_print(FILE *file) +int +opal_backtrace_print(FILE *file, char *prefix, int strip) { - int i; + int i, fd, len; int trace_size; void * trace[32]; - char ** messages = (char **)NULL; + char buf[6]; + fd = fileno (file); + if (-1 == fd) { + return OPAL_ERR_BAD_PARAM; + } + trace_size = backtrace (trace, 32); - messages = backtrace_symbols (trace, trace_size); - for (i = 0; i < trace_size; i++) { - fprintf(file, "[%d] func:%s\n", i, messages[i]); - fflush(file); + for (i = strip; i < trace_size; i++) { + if (NULL != prefix) { + write (fd, prefix, strlen (prefix)); + } + len = snprintf (buf, sizeof(buf), "[%2d] ", i - strip); + write (fd, buf, len); + backtrace_symbols_fd (&trace[i], 1, fd); } - free(messages); + return OPAL_SUCCESS; } Index: opal/mca/backtrace/printstack/backtrace_printstack.c =================================================================== --- opal/mca/backtrace/printstack/backtrace_printstack.c (revision 29841) +++ opal/mca/backtrace/printstack/backtrace_printstack.c (working copy) @@ -24,10 +24,12 @@ #include "opal/constants.h" #include "opal/mca/backtrace/backtrace.h" -void -opal_backtrace_print(FILE *file) +int +opal_backtrace_print(FILE *file, char *prefix, int strip) { printstack(fileno(file)); + + return OPAL_SUCCESS; } Index: opal/mca/backtrace/none/backtrace_none.c =================================================================== --- opal/mca/backtrace/none/backtrace_none.c (revision 29841) +++ opal/mca/backtrace/none/backtrace_none.c (working copy) @@ -23,9 +23,10 @@ #include "opal/constants.h" #include "opal/mca/backtrace/backtrace.h" -void -opal_backtrace_print(FILE *file) +int +opal_backtrace_print(FILE *file, char *prefix, int strip) { + return OPAL_ERR_NOT_IMPLEMENTED; } Index: opal/util/stacktrace.c =================================================================== --- opal/util/stacktrace.c (revision 29841) +++ opal/util/stacktrace.c (working copy) @@ -53,12 +53,10 @@ /** * This function is being called as a signal-handler in response * to a user-specified signal (e.g. SIGFPE or SIGSEGV). - * For Linux/Glibc, it then uses backtrace and backtrace_symbols - * to figure the current stack and then prints that out to stdout. + * For Linux/Glibc, it then uses backtrace and backtrace_symbols_fd + * to figure the current stack and print that out to stderr. * Where available, the BSD libexecinfo is used to provide Linux/Glibc - * compatible backtrace and backtrace_symbols functions. - * Yes, printf and malloc are not signal-safe per se, but should be - * on Linux? + * compatible backtrace and backtrace_symbols_fd functions. * * @param signo with the signal number raised * @param info with information regarding the reason/send of the signal @@ -72,9 +70,8 @@ char print_buffer[1024]; char * tmp = print_buffer; int size = sizeof (print_buffer); - int ret, traces_size; + int ret; char *si_code_str = ""; - char **traces; /* write out the footer information */ memset (print_buffer, 0, sizeof (print_buffer)); @@ -82,18 +79,8 @@ HOSTFORMAT "*** Process received signal ***\n", stacktrace_hostname, getpid()); write(fileno(stderr), print_buffer, ret); - fflush(stderr); - /* - * Yes, we are doing printf inside a signal-handler. - * However, backtrace itself calls malloc (which may not be signal-safe, - * under linux, printf and malloc are) - * - * We could use backtrace_symbols_fd and write directly into an - * filedescriptor, however, without formatting -- also this fd - * should be opened in a sensible way... - */ memset (print_buffer, 0, sizeof (print_buffer)); #ifdef HAVE_STRSIGNAL @@ -342,28 +329,14 @@ /* write out the signal information generated above */ write(fileno(stderr), print_buffer, sizeof(print_buffer)-size); - fflush(stderr); /* print out the stack trace */ - ret = opal_backtrace_buffer(&traces, &traces_size); - if (OPAL_SUCCESS == ret) { - int i; - /* since we have the opportunity, strip off the bottom two - function calls, which will be this function and - opal_backtrace_buffer(). */ - for (i = 2 ; i < traces_size ; ++i) { - ret = snprintf(print_buffer, sizeof(print_buffer), - HOSTFORMAT "[%2d] %s\n", - stacktrace_hostname, getpid(), i - 2, traces[i]); - if (ret > 0) { - write(fileno(stderr), print_buffer, ret); - } else { - write(fileno(stderr), unable_to_print_msg, - strlen(unable_to_print_msg)); - } - } - } else { - opal_backtrace_print(stderr); + snprintf(print_buffer, sizeof(print_buffer), HOSTFORMAT, + stacktrace_hostname, getpid()); + print_buffer[sizeof(print_buffer) - 1] = '\0'; + ret = opal_backtrace_print(stderr, print_buffer, 2); + if (OPAL_SUCCESS != ret) { + write(fileno(stderr), unable_to_print_msg, strlen(unable_to_print_msg)); } /* write out the footer information */ @@ -376,7 +349,6 @@ } else { write(fileno(stderr), unable_to_print_msg, strlen(unable_to_print_msg)); } - fflush(stderr); } #endif /* OPAL_WANT_PRETTY_PRINT_STACKTRACE */ @@ -393,12 +365,12 @@ int i; /* since we have the opportunity, strip off the bottom two function calls, which will be this function and - opa_backtrace_buffer(). */ + opal_backtrace_buffer(). */ for (i = 2; i < traces_size; ++i) { opal_output(stream, "%s", traces[i]); } } else { - opal_backtrace_print(stderr); + opal_backtrace_print(stderr, NULL, 2); } } Index: ompi/runtime/ompi_mpi_abort.c =================================================================== --- ompi/runtime/ompi_mpi_abort.c (revision 29841) +++ ompi/runtime/ompi_mpi_abort.c (working copy) @@ -87,7 +87,7 @@ /* This will print an message if it's unable to print the backtrace, so we don't need an additional "else" clause if opal_backtrace_print() is not supported. */ - opal_backtrace_print(stderr); + opal_backtrace_print(stderr, NULL, 1); } } Index: orte/mca/oob/tcp/oob_tcp_common.c =================================================================== --- orte/mca/oob/tcp/oob_tcp_common.c (revision 29841) +++ orte/mca/oob/tcp/oob_tcp_common.c (working copy) @@ -79,7 +79,7 @@ int optval; optval = 1; if(setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)) < 0) { - opal_backtrace_print(stderr); + opal_backtrace_print(stderr, NULL, 1); opal_output(0, "[%s:%d] setsockopt(TCP_NODELAY) failed: %s (%d)", __FILE__, __LINE__, strerror(opal_socket_errno),
#include <stdlib.h> #include <execinfo.h> #include <mpi.h> int main(int argc, char *argv[]) { void *p; { /* This code is not interest of us. Just avoiding hang in backtrace(3) which is called in realloc(3) failure message. https://sourceware.org/bugzilla/show_bug.cgi?id=956 https://sourceware.org/bugzilla/show_bug.cgi?id=16159 */ void *b; backtrace(&b, 0); } MPI_Init(NULL, NULL); p = malloc(100); *((size_t *)p - 1) = 0x10; /* simulate buffer overrun */ p = realloc(p, 100); MPI_Finalize(); return 0; }
Thread 2 (Thread 0x7f2a74605700 (LWP 16294)): #0 0x00007f2a7585ea43 in __GI___poll (fds=<optimized out>, #1 0x00007f2a7529fa3a in poll_dispatch () #2 0x00007f2a75294336 in opal_libevent2021_event_base_loop () #3 0x00007f2a7551711e in orte_progress_thread_engine () #4 0x00007f2a75b3de9a in start_thread (arg=0x7f2a74605700) #5 0x00007f2a7586a3fd in clone () #6 0x0000000000000000 in ?? () Thread 1 (Thread 0x7f2a76240700 (LWP 16293)): #0 __lll_lock_wait_private () #1 0x00007f2a757fb231 in _L_lock_10574 () at malloc.c:5241 #2 0x00007f2a757f8f87 in __GI___libc_malloc (bytes=139820340016928) #3 0x00007f2a7588131a in __backtrace_symbols (array=0x7fff4a14a970, size=12) #4 0x00007f2a7528e165 in opal_backtrace_buffer () #5 0x00007f2a7528b8e6 in show_stackframe () #6 <signal handler called> #7 0x00007f2a757ac425 in __GI_raise (sig=<optimized out>) #8 0x00007f2a757afb8b in __GI_abort () at abort.c:91 #9 0x00007f2a757ea39e in __libc_message (do_abort=2, #10 0x00007f2a757f4b96 in malloc_printerr (action=3, #11 0x00007f2a757f7ee7 in _int_realloc (av=0x7f2a75b2f720, oldp=0x172d050, #12 0x00007f2a757f96fe in __GI___libc_realloc (oldmem=0x172d060, bytes=100) #13 0x00000000004007b1 in main (argc=1, argv=0x7fff4a14bfb8) at mpisigabrt.c:23