Public bug reported: Found in: https://objectstorage.prodstack4-5.canonical.com/v1/AUTH_77e2ada1e7a84929a74ba3b87153c0ac /autopkgtest-cosmic/cosmic/armhf/p/pgpool2/20180616_121023_27258@/log.gz
The issue seems to be a reproducible segfault on armhf 2018-06-18 07:17:33: pid 21486: WARNING: child process with pid: 21518 was terminated by segmentation fault 2018-06-18 07:17:33: pid 21486: LOG: fork a new child process with pid: 21537 DebCI tests only x86 https://ci.debian.net/packages/p/pgpool2/ The test is also re-reproducible on reruns and leaves a crash file. Ugly test environment (many preocesses, env and timing dependent) took a while to reproduce in gdb with useful output. Note: 2018-05-30 10:05:43: pid 6519: LOG: pool_send_and_wait: Error or notice message from backend: : DB node id: 0 backend pid: 6535 statement: "DROP TABLE IF EXISTS utocommit;" message: "table "utocommit" does not exist, skipping" 2018-05-30 10:05:43: pid 6519: LOG: backend [0]: NOTICE: table "utocommit" does not exist, skipping Is a red herring, that exists on other arches as well. Program received signal SIGSEGV, Segmentation fault. strlen () at ../sysdeps/arm/armv6t2/strlen.S:85 85 ../sysdeps/arm/armv6t2/strlen.S: No such file or directory. (gdb) bt #0 strlen () at ../sysdeps/arm/armv6t2/strlen.S:85 #1 0x76af2754 in _IO_vfprintf_internal (s=s@entry=0x7ed43780, format=format@entry=0x1e409e0 "%s: DB node id: %d backend pid: %d statement: \"%s\" message: \"%s\"", ap=..., ap@entry=...) at vfprintf.c:1643 #2 0x76b60ba6 in ___vsnprintf_chk ( s=0x1e40de8 "pool_send_and_wait: Error or notice message from backend: : DB node id: 0 backend pid: 817 statement: \"CREATE TABLE autocommit (a int);\" message: \"LE IF EXISTS utocommit;\" message: \"table \"utocommit\" "..., maxlen=<optimized out>, flags=1, slen=<optimized out>, format=0x1e409e0 "%s: DB node id: %d backend pid: %d statement: \"%s\" message: \"%s\"", args=...) at vsnprintf_chk.c:63 #3 0x004ab7ac in ?? () Backtrace stopped: previous frame identical to this frame (corrupt stack?) I thought it would have been an artifact of the apport/crash recovery. But in live gdb it has the same issue to resolve which call it was. gdb on arm needs dwarf or is odd, if readable it looks like: #0 strlen () at ../sysdeps/arm/armv6t2/strlen.S:85 #1 0x76b3c754 in _IO_vfprintf_internal (s=s@entry=0x7e974780, format=format@entry=0x6769e0 "%s: DB node id: %d backend pid: %d statement: \"%s\" message: \"%s\"", ap=..., ap@entry=...) at vfprintf.c:1643 #2 0x76baaba6 in ___vsnprintf_chk ( s=s@entry=0x676de8 "pool_send_and_wait: Error or notice message from backend: : DB node id: 0 backend pid: 2827 statement: \"CREATE TABLE autocommit (a int);\" message: \"BLE IF EXISTS utocommit;\" message: \"table \"utocommit"..., maxlen=<optimized out>, maxlen@entry=1024, flags=flags@entry=1, slen=slen@entry=4294967295, format=format@entry=0x6769e0 "%s: DB node id: %d backend pid: %d statement: \"%s\" message: \"%s\"", args=args@entry=...) at vsnprintf_chk.c:63 #3 0x0048e7ac in vsnprintf (__ap=..., __fmt=0x6769e0 "%s: DB node id: %d backend pid: %d statement: \"%s\" message: \"%s\"", __n=1024, __s=0x676de8 "pool_send_and_wait: Error or notice message from backend: : DB node id: 0 backend pid: 2827 statement: \"CREATE TABLE autocommit (a int);\" message: \"BLE IF EXISTS utocommit;\" message: \"table \"utocommit"...) at /usr/include/arm-linux-gnueabihf/bits/stdio2.h:77 #4 pvsnprintf ( buf=0x676de8 "pool_send_and_wait: Error or notice message from backend: : DB node id: 0 backend pid: 2827 statement: \"CREATE TABLE autocommit (a int);\" message: \"BLE IF EXISTS utocommit;\" message: \"table \"utocommit"..., len=len@entry=1024, fmt=fmt@entry=0x6769e0 "%s: DB node id: %d backend pid: %d statement: \"%s\" message: \"%s\"", args=...) at utils/psprintf.c:124 #5 0x004bc13c in appendStringInfoVA (str=str@entry=0x7e9748dc, fmt=fmt@entry=0x6769e0 "%s: DB node id: %d backend pid: %d statement: \"%s\" message: \"%s\"", args=..., args@entry=...) at stringinfo.c:131 #6 0x0049a12e in errmsg (fmt=0x76f5a968 <__stack_chk_guard> "") at ../../src/utils/error/elog.c:658 #7 0x0047ff60 in per_node_error_log (backend=backend@entry=0x690998, node_id=node_id@entry=0, query=query@entry=0x6a4190 "CREATE TABLE autocommit (a int);", prefix=prefix@entry=0x4e3220 "pool_send_and_wait: Error or notice message from backend: ", unread=unread@entry=1 '\001') at protocol/pool_proto_modules.c:3141 #8 0x0048a65e in pool_send_and_wait (query_context=query_context@entry=0x6a33e0, send_type=send_type@entry=1, node_id=0) at context/pool_query_context.c:830 #9 0x0047d8c4 in SimpleQuery (frontend=frontend@entry=0x69a178, backend=backend@entry=0x690998, len=33, contents=contents@entry=0x6a13d8 "CREATE TABLE autocommit (a int);") at protocol/pool_proto_modules.c:672 #10 0x004814d6 in ProcessFrontendResponse (frontend=frontend@entry=0x69a178, backend=backend@entry=0x690998) at protocol/pool_proto_modules.c:2369 ---Type <return> to continue, or q <return> to quit--- #11 0x00477874 in read_packets_and_process (frontend=frontend@entry=0x69a178, backend=backend@entry=0x690998, reset_request=reset_request@entry=0, state=state@entry=0x7e975098, num_fields=num_fields@entry=0x7e975096, cont=cont@entry=0x7e9750a0 "\001") at protocol/pool_process_query.c:4755 #12 0x00477f9e in pool_process_query (frontend=0x69a178, backend=0x690998, reset_request=reset_request@entry=0) at protocol/pool_process_query.c:273 #13 0x00473c1a in do_child (fds=fds@entry=0x694b90) at protocol/child.c:383 #14 0x0045b494 in fork_a_child (fds=0x694b90, id=29) at main/pgpool_main.c:611 #15 0x0045ff84 in PgpoolMain (discard_status=<optimized out>, clear_memcache_oidmaps=<optimized out>) at main/pgpool_main.c:367 #16 0x00459ebc in main (argc=<optimized out>, argv=0x7e980664) at main/main.c:318 Old buffer content, is just the target, so content should not matter (gdb) p buf $1 = 0x676de8 "pool_send_and_wait: Error or notice message from backend: : DB node id: 0 backend pid: 2827 statement: \"CREATE TABLE autocommit (a int);\" message: \"BLE IF EXISTS utocommit;\" message: \"table \"utocommit"... BUffer size 1024. (gdb) p len $2 = 1024 There is a guard on buf+1024 so it seems right. (gdb) p fmt $3 = 0x6769e0 "%s: DB node id: %d backend pid: %d statement: \"%s\" message: \"%s\"" 5 elements to format. Maybe some in the valist are broken? va_list parsing is hard, but we should have the fill stack. This is from per_node_error_log in src/protocol/pool_proto_modules.c But re-mangled in errmsg and macro around it. 3139 »···if (pool_extract_error_message(true, CONNECTION(backend, node_id), MAJOR(backend), true, &message) > 0) 3140 »···{ 3141 »···»···ereport(LOG, 3142 »···»···»···(errmsg("%s: DB node id: %d backend pid: %d statement: \"%s\" message: \"%s\"", 3143 »···»···»···»··· prefix, node_id, ntohl(slot->pid), query, message))); 3144 »···} Frame 7: It seems that message is a bit odd. (gdb) p prefix $24 = 0x4e3220 "pool_send_and_wait: Error or notice message from backend: " (gdb) p node_id $25 = 0 (gdb) p slot->pid $26 = 185270272 (gdb) p query $27 = 0x6a4190 "CREATE TABLE autocommit (a int);" (gdb) p message $28 = 0x25000000 <error: Cannot access memory at address 0x25000000> That is a local variable. And should have been set by the "pool_extract_error_message" call. On Frame 6 va_start was called on args, but the content is the same. As the 250000000 seems wrong we need to check pool_extract_error_message on this. It runs into this code block failing to get the message as it wants: 4366 if (kind != 'E' && kind != 'N') 4367 { 4368 pool_unread(backend, str_buf->data, readlen); 4369 »···»···»···»···pfree(str_message_buf->data); 4370 »···»···»···»···pfree(str_message_buf); 4371 »···»···»···»···pfree(str_buf->data); 4372 »···»···»···»···pfree(str_buf); 4373 ereport(ERROR, 4374 (errmsg("unable to extract error message"), 4375 errdetail("invalid message kind \"%c\"",kind))); 4376 } kind is "'C' in my case. That triggers PG_CATCH and exits the function with (gdb) p *message $24 = 0x25000000 <error: Cannot access memory at address 0x25000000> The outer code should not use the message in this case: pool_extract_error_message sets ret = -1 And the outer code has > 0 to pass But this might be the arch reasons here only hitting armhf. On fin to the pool_extract_error_message function I see 255. (gdb) fin Run till exit from #0 pool_extract_error_message (read_kind=read_kind@entry=1 '\001', backend=backend@entry=0x130e9a0, major=3, unread=unread@entry=1 '\001', message=message@entry=0x7ee61928) at protocol/pool_process_query.c:4439 0x004e3f24 in per_node_error_log (backend=backend@entry=0x130d998, node_id=node_id@entry=0, query=query@entry=0x1321190 "CREATE TABLE autocommit (a int);", prefix=prefix@entry=0x547220 "pool_send_and_wait: Error or notice message from backend: ", unread=unread@entry=1 '\001') at protocol/pool_proto_modules.c:3139 3139 if (pool_extract_error_message(true, CONNECTION(backend, node_id), MAJOR(backend), true, &message) > 0) Value returned is $26 = 255 (gdb) p *message Cannot access memory at address 0x25000000 (gdb) n 3141 ereport(LOG, Per Definition of the function retvals should be like: 4337 * Return values are: 4338 * 0: not error or notice message 4339 * 1: succeeded to extract error message 4340 * -1: error) Other calls already either ignore the retval or check for == 1 3 pool_auth.c pool_do_auth 298 if (pool_extract_error_message(false, MASTER(cp), protoMajor, true, &message) == 1) 4 pool_auth.c pool_do_auth 315 if (pool_extract_error_message(false, MASTER(cp), protoMajor, true, &message) == 1) 5 pool_auth.c do_clear_text_password 609 if (pool_extract_error_message(false, backend, protoMajor, false, &message) == 1) 6 pool_auth.c do_crypt 756 if (pool_extract_error_message(false, backend, protoMajor, false, &message) == 1) 7 pool_auth.c send_password_packet 1046 if (pool_extract_error_message(false, backend, protoMajor, false, &message) == 1) 8 pool_auth.c pool_read_kind 1220 if (pool_extract_error_message(false, MASTER(cp), MAJOR(cp), true, &message) == 1) 3 pool_process_query.c read_kind_from_backend 3567 if (pool_extract_error_message(false, CONNECTION(backend, i), MAJOR(backend), true, &m) == 1) 5 pool_proto_modules.c per_node_error_log 3139 if (pool_extract_error_message(true, CONNECTION(backend, node_id), MAJOR(backend), true, &message) == 1) This would avoid the issue here as well. Only go on if == 1 which means found a message to report. ** Affects: pgpool2 (Ubuntu) Importance: Undecided Status: New -- You received this bug notification because you are a member of Ubuntu Bugs, which is subscribed to Ubuntu. https://bugs.launchpad.net/bugs/1777418 Title: crash on armhf (autopkgtest) To manage notifications about this bug go to: https://bugs.launchpad.net/ubuntu/+source/pgpool2/+bug/1777418/+subscriptions -- ubuntu-bugs mailing list ubuntu-bugs@lists.ubuntu.com https://lists.ubuntu.com/mailman/listinfo/ubuntu-bugs