[ https://issues.apache.org/jira/browse/HAWQ-978?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ming LI updated HAWQ-978: ------------------------- Description: One backend process on the master has been running for several days and can't be terminated. The session is idle on all segments; it is still active only on the master instance. The pstack, strace, and gdb backtrace output of the backend process follows. <p><code> [gpadmin@avw7hdm2p1 ~]$ pstack 431263 Thread 2 (Thread 0x7f4c93aa2700 (LWP 431264)): #0 0x00007f4c9013f0d3 in poll () from /lib64/libc.so.6 #1 0x0000000000ba8294 in rxThreadFunc () #2 0x00007f4c9101f9d1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f4c901488fd in clone () from /lib64/libc.so.6 Thread 1 (Thread 0x7f4c93af48e0 (LWP 431263)): #0 0x00007f4c9015805e in __lll_lock_wait_private () from /lib64/libc.so.6 #1 0x00007f4c900dd16b in _L_lock_9503 () from /lib64/libc.so.6 #2 0x00007f4c900da6a6 in malloc () from /lib64/libc.so.6 #3 0x00007f4c9008fb39 in _nl_make_l10nflist () from /lib64/libc.so.6 #4 0x00007f4c9008ddf5 in _nl_find_domain () from /lib64/libc.so.6 #5 0x00007f4c9008d6e0 in __dcigettext () from /lib64/libc.so.6 #6 0x00007f4c6fabcfe3 in Rf_onsigusr1 () from /usr/local/lib64/R/lib/libR.so #7 <signal handler called> #8 0x00007f4c9014079a in brk () from /lib64/libc.so.6 #9 0x00007f4c90140845 in sbrk () from /lib64/libc.so.6 #10 0x00007f4c900dd769 in __default_morecore () from /lib64/libc.so.6 #11 0x00007f4c900d87a2 in _int_free () from /lib64/libc.so.6 #12 0x0000000000b3ff24 in gp_free2 () #13 0x0000000000b356fc in AllocSetDelete () #14 0x0000000000b38391 in MemoryContextDeleteImpl () #15 0x000000000077c851 in ExecEndAgg () #16 0x00000000007592ad in ExecEndNode () #17 0x000000000075186c in ExecEndPlan () #18 0x000000000079dffa in ExecEndSubqueryScan () #19 0x000000000075921d in ExecEndNode () #20 0x000000000075186c in ExecEndPlan () #21 0x0000000000752565 in ExecutorEnd () #22 0x00000000006dd9bd in PortalCleanup () #23 0x0000000000b3f077 in AtCommit_Portals () #24 0x000000000051abe5 in CommitTransaction () #25 0x000000000051f1d5 in CommitTransactionCommand () #26 
0x000000000099809e in PostgresMain () #27 0x00000000008f1031 in BackendStartup () #28 0x00000000008f70e0 in PostmasterMain () #29 0x00000000007f63da in main () [gpadmin@avw7hdm2p1 ~]$ [gpadmin@avw7hdm2p1 ~]$ strace -p 431263 Process 431263 attached - interrupt to quit futex(0x7f4c903efe80, FUTEX_WAIT_PRIVATE, 2, NULL^C <unfinished ...> Process 431263 detached [gpadmin@avw7hdm2p1 ~]$ (gdb) thread apply all bt Thread 2 (Thread 0x7f4c93af48e0 (LWP 431263)): #0 0x00007f4c9015805e in __lll_lock_wait_private () from /lib64/libc.so.6 #1 0x00007f4c900dd16b in _L_lock_9503 () from /lib64/libc.so.6 #2 0x00007f4c900da6a6 in malloc () from /lib64/libc.so.6 #3 0x00007f4c9008fb39 in _nl_make_l10nflist () from /lib64/libc.so.6 #4 0x00007f4c9008ddf5 in _nl_find_domain () from /lib64/libc.so.6 #5 0x00007f4c9008d6e0 in __dcigettext () from /lib64/libc.so.6 #6 0x00007f4c6fabcfe3 in Rf_onsigusr1 (dummy=<value optimized out>) at errors.c:178 #7 <signal handler called> #8 0x00007f4c9014079a in brk () from /lib64/libc.so.6 #9 0x00007f4c90140845 in sbrk () from /lib64/libc.so.6 #10 0x00007f4c900dd769 in __default_morecore () from /lib64/libc.so.6 #11 0x00007f4c900d87a2 in _int_free () from /lib64/libc.so.6 #12 0x0000000000b3ff24 in gp_free2 (ptr=0x191c3b000, sz=0) at memprot.c:808 #13 0x0000000000b356fc in AllocSetDelete (context=<value optimized out>) at aset.c:981 #14 0x0000000000b38391 in MemoryContextDeleteImpl (context=0x4a46da0, sfile=0x0, func=<value optimized out>, sline=-1) at mcxt.c:232 #15 MemoryContextDeleteChildren (context=0x4a46da0, sfile=0x0, func=<value optimized out>, sline=-1) at mcxt.c:251 #16 MemoryContextDeleteImpl (context=0x4a46da0, sfile=0x0, func=<value optimized out>, sline=-1) at mcxt.c:205 #17 0x000000000077c851 in ExecEndAgg (node=0x325eb00) at nodeAgg.c:2641 #18 0x00000000007592ad in ExecEndNode (node=0x325eb00) at execProcnode.c:1687 #19 0x000000000075186c in ExecEndPlan (planstate=0x325eb00, estate=0x323f9e8) at execMain.c:2825 #20 0x000000000079dffa in 
ExecEndSubqueryScan (node=0x325cd20) at nodeSubqueryscan.c:294 #21 0x000000000075921d in ExecEndNode (node=0x325cd20) at execProcnode.c:1638 #22 0x000000000075186c in ExecEndPlan (planstate=0x325cd20, estate=0x323f010) at execMain.c:2825 #23 0x0000000000752565 in ExecutorEnd (queryDesc=<value optimized out>) at execMain.c:1321 #24 0x00000000006dd9bd in PortalCleanupHelper (portal=<value optimized out>) at portalcmds.c:366 #25 PortalCleanup (portal=<value optimized out>) at portalcmds.c:302 #26 0x0000000000b3f077 in PortalDrop () at portalmem.c:402 #27 AtCommit_Portals () at portalmem.c:643 #28 0x000000000051abe5 in CommitTransaction () at xact.c:3379 #29 0x000000000051f1d5 in CommitTransactionCommand () at xact.c:4535 #30 0x000000000099809e in finish_xact_command (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:3180 #31 PostgresMain (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:5260 #32 0x00000000008f1031 in BackendRun (port=0x2aa5520) at postmaster.c:6811 #33 BackendStartup (port=0x2aa5520) at postmaster.c:6408 #34 0x00000000008f70e0 in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:2350 #35 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1556 #36 0x00000000007f63da in main (argc=18, argv=0x2aa1270) at main.c:217 Thread 1 (Thread 0x7f4c93aa2700 (LWP 431264)): #0 0x00007f4c9013f0d3 in poll () from /lib64/libc.so.6 #1 0x0000000000ba8294 in rxThreadFunc (arg=<value optimized out>) at ic_udp.c:6263 #2 0x00007f4c9101f9d1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f4c901488fd in clone () from /lib64/libc.so.6 (gdb) </code></p> was: One backend process on master had been running for several days and can't be terminated. The session is idle on all segments but master instance. pstack/strace/back trace of the backend process. 
``` [gpadmin@avw7hdm2p1 ~]$ pstack 431263 Thread 2 (Thread 0x7f4c93aa2700 (LWP 431264)): #0 0x00007f4c9013f0d3 in poll () from /lib64/libc.so.6 #1 0x0000000000ba8294 in rxThreadFunc () #2 0x00007f4c9101f9d1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f4c901488fd in clone () from /lib64/libc.so.6 Thread 1 (Thread 0x7f4c93af48e0 (LWP 431263)): #0 0x00007f4c9015805e in __lll_lock_wait_private () from /lib64/libc.so.6 #1 0x00007f4c900dd16b in _L_lock_9503 () from /lib64/libc.so.6 #2 0x00007f4c900da6a6 in malloc () from /lib64/libc.so.6 #3 0x00007f4c9008fb39 in _nl_make_l10nflist () from /lib64/libc.so.6 #4 0x00007f4c9008ddf5 in _nl_find_domain () from /lib64/libc.so.6 #5 0x00007f4c9008d6e0 in __dcigettext () from /lib64/libc.so.6 #6 0x00007f4c6fabcfe3 in Rf_onsigusr1 () from /usr/local/lib64/R/lib/libR.so #7 <signal handler called> #8 0x00007f4c9014079a in brk () from /lib64/libc.so.6 #9 0x00007f4c90140845 in sbrk () from /lib64/libc.so.6 #10 0x00007f4c900dd769 in __default_morecore () from /lib64/libc.so.6 #11 0x00007f4c900d87a2 in _int_free () from /lib64/libc.so.6 #12 0x0000000000b3ff24 in gp_free2 () #13 0x0000000000b356fc in AllocSetDelete () #14 0x0000000000b38391 in MemoryContextDeleteImpl () #15 0x000000000077c851 in ExecEndAgg () #16 0x00000000007592ad in ExecEndNode () #17 0x000000000075186c in ExecEndPlan () #18 0x000000000079dffa in ExecEndSubqueryScan () #19 0x000000000075921d in ExecEndNode () #20 0x000000000075186c in ExecEndPlan () #21 0x0000000000752565 in ExecutorEnd () #22 0x00000000006dd9bd in PortalCleanup () #23 0x0000000000b3f077 in AtCommit_Portals () #24 0x000000000051abe5 in CommitTransaction () #25 0x000000000051f1d5 in CommitTransactionCommand () #26 0x000000000099809e in PostgresMain () #27 0x00000000008f1031 in BackendStartup () #28 0x00000000008f70e0 in PostmasterMain () #29 0x00000000007f63da in main () [gpadmin@avw7hdm2p1 ~]$ [gpadmin@avw7hdm2p1 ~]$ strace -p 431263 Process 431263 attached - interrupt to quit 
futex(0x7f4c903efe80, FUTEX_WAIT_PRIVATE, 2, NULL^C <unfinished ...> Process 431263 detached [gpadmin@avw7hdm2p1 ~]$ (gdb) thread apply all bt Thread 2 (Thread 0x7f4c93af48e0 (LWP 431263)): #0 0x00007f4c9015805e in __lll_lock_wait_private () from /lib64/libc.so.6 #1 0x00007f4c900dd16b in _L_lock_9503 () from /lib64/libc.so.6 #2 0x00007f4c900da6a6 in malloc () from /lib64/libc.so.6 #3 0x00007f4c9008fb39 in _nl_make_l10nflist () from /lib64/libc.so.6 #4 0x00007f4c9008ddf5 in _nl_find_domain () from /lib64/libc.so.6 #5 0x00007f4c9008d6e0 in __dcigettext () from /lib64/libc.so.6 #6 0x00007f4c6fabcfe3 in Rf_onsigusr1 (dummy=<value optimized out>) at errors.c:178 #7 <signal handler called> #8 0x00007f4c9014079a in brk () from /lib64/libc.so.6 #9 0x00007f4c90140845 in sbrk () from /lib64/libc.so.6 #10 0x00007f4c900dd769 in __default_morecore () from /lib64/libc.so.6 #11 0x00007f4c900d87a2 in _int_free () from /lib64/libc.so.6 #12 0x0000000000b3ff24 in gp_free2 (ptr=0x191c3b000, sz=0) at memprot.c:808 #13 0x0000000000b356fc in AllocSetDelete (context=<value optimized out>) at aset.c:981 #14 0x0000000000b38391 in MemoryContextDeleteImpl (context=0x4a46da0, sfile=0x0, func=<value optimized out>, sline=-1) at mcxt.c:232 #15 MemoryContextDeleteChildren (context=0x4a46da0, sfile=0x0, func=<value optimized out>, sline=-1) at mcxt.c:251 #16 MemoryContextDeleteImpl (context=0x4a46da0, sfile=0x0, func=<value optimized out>, sline=-1) at mcxt.c:205 #17 0x000000000077c851 in ExecEndAgg (node=0x325eb00) at nodeAgg.c:2641 #18 0x00000000007592ad in ExecEndNode (node=0x325eb00) at execProcnode.c:1687 #19 0x000000000075186c in ExecEndPlan (planstate=0x325eb00, estate=0x323f9e8) at execMain.c:2825 #20 0x000000000079dffa in ExecEndSubqueryScan (node=0x325cd20) at nodeSubqueryscan.c:294 #21 0x000000000075921d in ExecEndNode (node=0x325cd20) at execProcnode.c:1638 #22 0x000000000075186c in ExecEndPlan (planstate=0x325cd20, estate=0x323f010) at execMain.c:2825 #23 0x0000000000752565 in 
ExecutorEnd (queryDesc=<value optimized out>) at execMain.c:1321 #24 0x00000000006dd9bd in PortalCleanupHelper (portal=<value optimized out>) at portalcmds.c:366 #25 PortalCleanup (portal=<value optimized out>) at portalcmds.c:302 #26 0x0000000000b3f077 in PortalDrop () at portalmem.c:402 #27 AtCommit_Portals () at portalmem.c:643 #28 0x000000000051abe5 in CommitTransaction () at xact.c:3379 #29 0x000000000051f1d5 in CommitTransactionCommand () at xact.c:4535 #30 0x000000000099809e in finish_xact_command (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:3180 #31 PostgresMain (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:5260 #32 0x00000000008f1031 in BackendRun (port=0x2aa5520) at postmaster.c:6811 #33 BackendStartup (port=0x2aa5520) at postmaster.c:6408 #34 0x00000000008f70e0 in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:2350 #35 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1556 #36 0x00000000007f63da in main (argc=18, argv=0x2aa1270) at main.c:217 Thread 1 (Thread 0x7f4c93aa2700 (LWP 431264)): #0 0x00007f4c9013f0d3 in poll () from /lib64/libc.so.6 #1 0x0000000000ba8294 in rxThreadFunc (arg=<value optimized out>) at ic_udp.c:6263 #2 0x00007f4c9101f9d1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f4c901488fd in clone () from /lib64/libc.so.6 (gdb) ``` > long running query got hang on master and can't be terminated > ------------------------------------------------------------- > > Key: HAWQ-978 > URL: https://issues.apache.org/jira/browse/HAWQ-978 > Project: Apache HAWQ > Issue Type: Bug > Reporter: Ming LI > Assignee: Lei Chang > > One backend process on the master has been running for several days and can't be > terminated. > The session is idle on all segments; it is still active only on the master instance. > The pstack, strace, and gdb backtrace output of the backend process follows. 
> <p><code> > [gpadmin@avw7hdm2p1 ~]$ pstack 431263 > Thread 2 (Thread 0x7f4c93aa2700 (LWP 431264)): > #0 0x00007f4c9013f0d3 in poll () from /lib64/libc.so.6 > #1 0x0000000000ba8294 in rxThreadFunc () > #2 0x00007f4c9101f9d1 in start_thread () from /lib64/libpthread.so.0 > #3 0x00007f4c901488fd in clone () from /lib64/libc.so.6 > Thread 1 (Thread 0x7f4c93af48e0 (LWP 431263)): > #0 0x00007f4c9015805e in __lll_lock_wait_private () from /lib64/libc.so.6 > #1 0x00007f4c900dd16b in _L_lock_9503 () from /lib64/libc.so.6 > #2 0x00007f4c900da6a6 in malloc () from /lib64/libc.so.6 > #3 0x00007f4c9008fb39 in _nl_make_l10nflist () from /lib64/libc.so.6 > #4 0x00007f4c9008ddf5 in _nl_find_domain () from /lib64/libc.so.6 > #5 0x00007f4c9008d6e0 in __dcigettext () from /lib64/libc.so.6 > #6 0x00007f4c6fabcfe3 in Rf_onsigusr1 () from /usr/local/lib64/R/lib/libR.so > #7 <signal handler called> > #8 0x00007f4c9014079a in brk () from /lib64/libc.so.6 > #9 0x00007f4c90140845 in sbrk () from /lib64/libc.so.6 > #10 0x00007f4c900dd769 in __default_morecore () from /lib64/libc.so.6 > #11 0x00007f4c900d87a2 in _int_free () from /lib64/libc.so.6 > #12 0x0000000000b3ff24 in gp_free2 () > #13 0x0000000000b356fc in AllocSetDelete () > #14 0x0000000000b38391 in MemoryContextDeleteImpl () > #15 0x000000000077c851 in ExecEndAgg () > #16 0x00000000007592ad in ExecEndNode () > #17 0x000000000075186c in ExecEndPlan () > #18 0x000000000079dffa in ExecEndSubqueryScan () > #19 0x000000000075921d in ExecEndNode () > #20 0x000000000075186c in ExecEndPlan () > #21 0x0000000000752565 in ExecutorEnd () > #22 0x00000000006dd9bd in PortalCleanup () > #23 0x0000000000b3f077 in AtCommit_Portals () > #24 0x000000000051abe5 in CommitTransaction () > #25 0x000000000051f1d5 in CommitTransactionCommand () > #26 0x000000000099809e in PostgresMain () > #27 0x00000000008f1031 in BackendStartup () > #28 0x00000000008f70e0 in PostmasterMain () > #29 0x00000000007f63da in main () > [gpadmin@avw7hdm2p1 ~]$ > 
[gpadmin@avw7hdm2p1 ~]$ strace -p 431263 > Process 431263 attached - interrupt to quit > futex(0x7f4c903efe80, FUTEX_WAIT_PRIVATE, 2, NULL^C <unfinished ...> > Process 431263 detached > [gpadmin@avw7hdm2p1 ~]$ > (gdb) thread apply all bt > Thread 2 (Thread 0x7f4c93af48e0 (LWP 431263)): > #0 0x00007f4c9015805e in __lll_lock_wait_private () from /lib64/libc.so.6 > #1 0x00007f4c900dd16b in _L_lock_9503 () from /lib64/libc.so.6 > #2 0x00007f4c900da6a6 in malloc () from /lib64/libc.so.6 > #3 0x00007f4c9008fb39 in _nl_make_l10nflist () from /lib64/libc.so.6 > #4 0x00007f4c9008ddf5 in _nl_find_domain () from /lib64/libc.so.6 > #5 0x00007f4c9008d6e0 in __dcigettext () from /lib64/libc.so.6 > #6 0x00007f4c6fabcfe3 in Rf_onsigusr1 (dummy=<value optimized out>) at > errors.c:178 > #7 <signal handler called> > #8 0x00007f4c9014079a in brk () from /lib64/libc.so.6 > #9 0x00007f4c90140845 in sbrk () from /lib64/libc.so.6 > #10 0x00007f4c900dd769 in __default_morecore () from /lib64/libc.so.6 > #11 0x00007f4c900d87a2 in _int_free () from /lib64/libc.so.6 > #12 0x0000000000b3ff24 in gp_free2 (ptr=0x191c3b000, sz=0) at memprot.c:808 > #13 0x0000000000b356fc in AllocSetDelete (context=<value optimized out>) at > aset.c:981 > #14 0x0000000000b38391 in MemoryContextDeleteImpl (context=0x4a46da0, > sfile=0x0, func=<value optimized out>, sline=-1) at mcxt.c:232 > #15 MemoryContextDeleteChildren (context=0x4a46da0, sfile=0x0, func=<value > optimized out>, sline=-1) at mcxt.c:251 > #16 MemoryContextDeleteImpl (context=0x4a46da0, sfile=0x0, func=<value > optimized out>, sline=-1) at mcxt.c:205 > #17 0x000000000077c851 in ExecEndAgg (node=0x325eb00) at nodeAgg.c:2641 > #18 0x00000000007592ad in ExecEndNode (node=0x325eb00) at execProcnode.c:1687 > #19 0x000000000075186c in ExecEndPlan (planstate=0x325eb00, estate=0x323f9e8) > at execMain.c:2825 > #20 0x000000000079dffa in ExecEndSubqueryScan (node=0x325cd20) at > nodeSubqueryscan.c:294 > #21 0x000000000075921d in ExecEndNode 
(node=0x325cd20) at execProcnode.c:1638 > #22 0x000000000075186c in ExecEndPlan (planstate=0x325cd20, estate=0x323f010) > at execMain.c:2825 > #23 0x0000000000752565 in ExecutorEnd (queryDesc=<value optimized out>) at > execMain.c:1321 > #24 0x00000000006dd9bd in PortalCleanupHelper (portal=<value optimized out>) > at portalcmds.c:366 > #25 PortalCleanup (portal=<value optimized out>) at portalcmds.c:302 > #26 0x0000000000b3f077 in PortalDrop () at portalmem.c:402 > #27 AtCommit_Portals () at portalmem.c:643 > #28 0x000000000051abe5 in CommitTransaction () at xact.c:3379 > #29 0x000000000051f1d5 in CommitTransactionCommand () at xact.c:4535 > #30 0x000000000099809e in finish_xact_command (argc=<value optimized out>, > argv=<value optimized out>, username=<value optimized out>) at postgres.c:3180 > #31 PostgresMain (argc=<value optimized out>, argv=<value optimized out>, > username=<value optimized out>) at postgres.c:5260 > #32 0x00000000008f1031 in BackendRun (port=0x2aa5520) at postmaster.c:6811 > #33 BackendStartup (port=0x2aa5520) at postmaster.c:6408 > #34 0x00000000008f70e0 in ServerLoop (argc=<value optimized out>, argv=<value > optimized out>) at postmaster.c:2350 > #35 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) > at postmaster.c:1556 > #36 0x00000000007f63da in main (argc=18, argv=0x2aa1270) at main.c:217 > Thread 1 (Thread 0x7f4c93aa2700 (LWP 431264)): > #0 0x00007f4c9013f0d3 in poll () from /lib64/libc.so.6 > #1 0x0000000000ba8294 in rxThreadFunc (arg=<value optimized out>) at > ic_udp.c:6263 > #2 0x00007f4c9101f9d1 in start_thread () from /lib64/libpthread.so.0 > #3 0x00007f4c901488fd in clone () from /lib64/libc.so.6 > (gdb) > </code></p> -- This message was sent by Atlassian JIRA (v6.3.4#6332)