signal not interrupting futex

2017-07-22 Thread Michael Day
We have hit an apparent kernel bug where a signal is not interrupting a 
futex, leading to a deadlock in our code. Here is the relevant strace 
output just before it blocks (complete strace log is attached):


14069 set_robust_list(0x7f7b3e7ee9e0, 24 
14061 futex(0x7f7b46721fd8, FUTEX_WAIT_PRIVATE, 0, NULL 
14069 <... set_robust_list resumed> )   = 0
14069 futex(0x7f7b46721fd8, FUTEX_WAKE_PRIVATE, 1) = 1
14061 <... futex resumed> ) = 0
14061 futex(0x1585ea0, FUTEX_WAIT_PRIVATE, 2, NULL 
14069 tgkill(14061, 14061, SIGPWR)  = 0
14069 futex(0x1586280, FUTEX_WAIT_PRIVATE, 0, NULL

Thread '69 sends SIGPWR to thread '61, but it is never delivered and we 
have not been able to figure out why.


Background information: this deadlock is experienced by our customer 
running Prince on CentOS 7. The bug happens every time on their system, 
but we have not been able to reproduce it on ours yet. They have tried 
two different kernel versions:


3.10.0-327.28.2.el7.x86_64
3.10.0-514.26.2.el7.x86_64

Over the past two years we have heard similar deadlock issues from other 
customers, always on CentOS and typically involving PHP, although these 
are of course very popular systems.


This issue appears to be unrelated to the earlier futex bug affecting 
Haswell processors, but could there be another bug along these lines 
affecting futexes or signal delivery?


What can we do to help debug this issue?

Best regards,

Michael

--
Prince: Print with CSS!
http://www.princexml.com
14061 execve("/usr/lib/prince/bin/prince", ["/usr/lib/prince/bin/prince", "-i", 
"html", "--no-xinclude", "--license-file=/usr/lib/prince/l"..., 
"--no-compress", "--structured-log=normal", "tmp/tmphtmlinvoice.html"], [/* 43 
vars */]) = 0
14061 brk(0)= 0x1c5b000
14061 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) 
= 0x7f7b46881000
14061 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/tls/x86_64/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/tls/x86_64", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/tls/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/tls", 0x7ffc5a6fab30) = -1 
ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/x86_64/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/x86_64", 0x7ffc5a6fab30) = 
-1 ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib", 0x7ffc5a6fab30) = -1 
ENOENT (No such file or directory)
14061 
open("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/tls/x86_64/libxml2.so.2",
 O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/tls/x86_64", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 
open("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/tls/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/tls", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 
open("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/x86_64/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/x86_64", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
14061 fstat(3, {st_mode=S_IFREG|0644, st_size=49060, ...}) = 0
14061 mmap(NULL, 49060, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f7b46875000
14061 close(3)  = 0
14061 open("/lib64/libxml2.so.2", O_RDONLY|O_CLOEXEC) = 3
14061 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0 
\351\2\0\0\0\0\0"..., 832) = 832
14061 fstat(3, {st_mode=S_IFREG|0755, st_size=1509376, ...}) = 0
14061 mmap(NULL, 3575896, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) 
= 0x7f7b462f8000
14061 mprotect(0x7f7b46457000, 2093056, PROT_NONE) = 0
14061 mmap(0x7f7b46656000, 40960, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x15e000) = 0x7f7b46656000
14061 mmap(0x7f7b4666, 4184, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7b4666
14061 close(3)  = 0
14061 open("/lib64/libpthread.so.0", 

signal not interrupting futex

2017-07-22 Thread Michael Day
We have hit an apparent kernel bug where a signal is not interrupting a 
futex, leading to a deadlock in our code. Here is the relevant strace 
output just before it blocks (complete strace log is attached):


14069 set_robust_list(0x7f7b3e7ee9e0, 24 
14061 futex(0x7f7b46721fd8, FUTEX_WAIT_PRIVATE, 0, NULL 
14069 <... set_robust_list resumed> )   = 0
14069 futex(0x7f7b46721fd8, FUTEX_WAKE_PRIVATE, 1) = 1
14061 <... futex resumed> ) = 0
14061 futex(0x1585ea0, FUTEX_WAIT_PRIVATE, 2, NULL 
14069 tgkill(14061, 14061, SIGPWR)  = 0
14069 futex(0x1586280, FUTEX_WAIT_PRIVATE, 0, NULL

Thread '69 sends SIGPWR to thread '61, but it is never delivered and we 
have not been able to figure out why.


Background information: this deadlock is experienced by our customer 
running Prince on CentOS 7. The bug happens every time on their system, 
but we have not been able to reproduce it on ours yet. They have tried 
two different kernel versions:


3.10.0-327.28.2.el7.x86_64
3.10.0-514.26.2.el7.x86_64

Over the past two years we have heard similar deadlock issues from other 
customers, always on CentOS and typically involving PHP, although these 
are of course very popular systems.


This issue appears to be unrelated to the earlier futex bug affecting 
Haswell processors, but could there be another bug along these lines 
affecting futexes or signal delivery?


What can we do to help debug this issue?

Best regards,

Michael

--
Prince: Print with CSS!
http://www.princexml.com
14061 execve("/usr/lib/prince/bin/prince", ["/usr/lib/prince/bin/prince", "-i", 
"html", "--no-xinclude", "--license-file=/usr/lib/prince/l"..., 
"--no-compress", "--structured-log=normal", "tmp/tmphtmlinvoice.html"], [/* 43 
vars */]) = 0
14061 brk(0)= 0x1c5b000
14061 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) 
= 0x7f7b46881000
14061 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/tls/x86_64/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/tls/x86_64", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/tls/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/tls", 0x7ffc5a6fab30) = -1 
ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/x86_64/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/x86_64", 0x7ffc5a6fab30) = 
-1 ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib", 0x7ffc5a6fab30) = -1 
ENOENT (No such file or directory)
14061 
open("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/tls/x86_64/libxml2.so.2",
 O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/tls/x86_64", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 
open("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/tls/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/tls", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 
open("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/x86_64/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/x86_64", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 open("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc/libxml2.so.2", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
14061 stat("/opt/mercury-2016-02-18/lib/mercury/lib/hlc.par.gc", 
0x7ffc5a6fab30) = -1 ENOENT (No such file or directory)
14061 open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
14061 fstat(3, {st_mode=S_IFREG|0644, st_size=49060, ...}) = 0
14061 mmap(NULL, 49060, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f7b46875000
14061 close(3)  = 0
14061 open("/lib64/libxml2.so.2", O_RDONLY|O_CLOEXEC) = 3
14061 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0 
\351\2\0\0\0\0\0"..., 832) = 832
14061 fstat(3, {st_mode=S_IFREG|0755, st_size=1509376, ...}) = 0
14061 mmap(NULL, 3575896, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) 
= 0x7f7b462f8000
14061 mprotect(0x7f7b46457000, 2093056, PROT_NONE) = 0
14061 mmap(0x7f7b46656000, 40960, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x15e000) = 0x7f7b46656000
14061 mmap(0x7f7b4666, 4184, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7b4666
14061 close(3)  = 0
14061 open("/lib64/libpthread.so.0",