Hi all, From time to time we have a scenario where somebody has some backend killed by some_script_somewhere(TM) in a multi-department company setting, and to me it was always unnecessarily time-intensive to identify the origin of the signal.
eBPF/bpftrace can be used to find the origin, but I think that PG could simply log which PID/UID (e.g. root or pg-user) raised the signal. Linux has SA_SIGINFO, so we could use that to provide this information. As expected, a search showed that this was discussed earlier here [1] 11 years ago, but there was no patch back then, so attached is an attempt to do just that. There is no GUC, and yes, it only displays the information on Linux. I think FreeBSD also has this, but I haven't tried it there (nor have I tried other OSes that lack it - proper autoconf/meson detection of sa_sigaction/SA_SIGINFO with the corresponding #ifdefs is probably missing), but I would first like to learn whether this would be a welcome feature or not. -J. [1] - https://hackorum.dev/topics/32019
From 28fe83686c4031eebd1ffad2487aef0101a05295 Mon Sep 17 00:00:00 2001 From: Jakub Wartak <[email protected]> Date: Tue, 17 Feb 2026 12:41:01 +0100 Subject: [PATCH v1] Add errdetail() with PID and UID about source of termination signal. On Linux we can use SA_SIGINFO to fetch additional information about sender of the signal, which can aid troubleshooting. Sample log: FATAL: terminating connection due to administrator command DETAIL: signal sent by PID 508477, UID 1000. Author: Jakub Wartak <[email protected]> Reviewed-by: Discussion: --- src/backend/postmaster/bgworker.c | 6 ++++- src/backend/replication/syncrep.c | 12 +++++++-- src/backend/tcop/postgres.c | 42 +++++++++++++++++++++++++------ src/backend/utils/init/globals.c | 2 ++ src/bin/psql/t/001_basic.pl | 7 +++--- src/include/miscadmin.h | 2 ++ src/port/pqsignal.c | 36 +++++++++++++++++++++++--- 7 files changed, 90 insertions(+), 17 deletions(-) diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 261ccd3f59..20b1893533 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -729,7 +729,11 @@ bgworker_die(SIGNAL_ARGS) ereport(FATAL, (errcode(ERRCODE_ADMIN_SHUTDOWN), errmsg("terminating background worker \"%s\" due to administrator command", - MyBgworkerEntry->bgw_type))); + MyBgworkerEntry->bgw_type), + proc_die_sender_pid == 0 ? 
0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); } /* diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c index 7ea6001e9a..339c03fb6a 100644 --- a/src/backend/replication/syncrep.c +++ b/src/backend/replication/syncrep.c @@ -302,7 +302,11 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit) ereport(WARNING, (errcode(ERRCODE_ADMIN_SHUTDOWN), errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"), - errdetail("The transaction has already committed locally, but might not have been replicated to the standby."))); + errdetail("The transaction has already committed locally, but might not have been replicated to the standby."), + proc_die_sender_pid == 0 ? 0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); whereToSendOutput = DestNone; SyncRepCancelWait(); break; @@ -319,7 +323,11 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit) QueryCancelPending = false; ereport(WARNING, (errmsg("canceling wait for synchronous replication due to user request"), - errdetail("The transaction has already committed locally, but might not have been replicated to the standby."))); + errdetail("The transaction has already committed locally, but might not have been replicated to the standby."), + proc_die_sender_pid == 0 ? 
0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); SyncRepCancelWait(); break; } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 21de158adb..115ea965d4 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3357,15 +3357,27 @@ ProcessInterrupts(void) else if (AmAutoVacuumWorkerProcess()) ereport(FATAL, (errcode(ERRCODE_ADMIN_SHUTDOWN), - errmsg("terminating autovacuum process due to administrator command"))); + errmsg("terminating autovacuum process due to administrator command"), + proc_die_sender_pid == 0 ? 0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); else if (IsLogicalWorker()) ereport(FATAL, (errcode(ERRCODE_ADMIN_SHUTDOWN), - errmsg("terminating logical replication worker due to administrator command"))); + errmsg("terminating logical replication worker due to administrator command"), + proc_die_sender_pid == 0 ? 0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); else if (IsLogicalLauncher()) { ereport(DEBUG1, - (errmsg_internal("logical replication launcher shutting down"))); + (errmsg_internal("logical replication launcher shutting down"), + proc_die_sender_pid == 0 ? 0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); /* * The logical replication launcher can be stopped at any time. @@ -3376,23 +3388,39 @@ ProcessInterrupts(void) else if (AmWalReceiverProcess()) ereport(FATAL, (errcode(ERRCODE_ADMIN_SHUTDOWN), - errmsg("terminating walreceiver process due to administrator command"))); + errmsg("terminating walreceiver process due to administrator command"), + proc_die_sender_pid == 0 ? 
0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); else if (AmBackgroundWorkerProcess()) ereport(FATAL, (errcode(ERRCODE_ADMIN_SHUTDOWN), errmsg("terminating background worker \"%s\" due to administrator command", - MyBgworkerEntry->bgw_type))); + MyBgworkerEntry->bgw_type), + proc_die_sender_pid == 0 ? 0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); else if (AmIoWorkerProcess()) { ereport(DEBUG1, - (errmsg_internal("io worker shutting down due to administrator command"))); + (errmsg_internal("io worker shutting down due to administrator command"), + proc_die_sender_pid == 0 ? 0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); proc_exit(0); } else ereport(FATAL, (errcode(ERRCODE_ADMIN_SHUTDOWN), - errmsg("terminating connection due to administrator command"))); + errmsg("terminating connection due to administrator command"), + proc_die_sender_pid == 0 ? 
0 : + errdetail("signal sent by PID %d, UID %d.", + proc_die_sender_pid, proc_die_sender_uid) + )); } if (CheckClientConnectionPending) diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 36ad708b36..144c8aca1b 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -32,6 +32,8 @@ ProtocolVersion FrontendProtocol; volatile sig_atomic_t InterruptPending = false; volatile sig_atomic_t QueryCancelPending = false; volatile sig_atomic_t ProcDiePending = false; +volatile sig_atomic_t proc_die_sender_pid = 0; +volatile sig_atomic_t proc_die_sender_uid = 0; volatile sig_atomic_t CheckClientConnectionPending = false; volatile sig_atomic_t ClientConnectionLost = false; volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false; diff --git a/src/bin/psql/t/001_basic.pl b/src/bin/psql/t/001_basic.pl index 6839f27cbe..aa8f819d9d 100644 --- a/src/bin/psql/t/001_basic.pl +++ b/src/bin/psql/t/001_basic.pl @@ -142,12 +142,11 @@ my ($ret, $out, $err) = $node->psql('postgres', is($ret, 2, 'server crash: psql exit code'); like($out, qr/before/, 'server crash: output before crash'); unlike($out, qr/AFTER/, 'server crash: no output after crash'); -is( $err, - 'psql:<stdin>:2: FATAL: terminating connection due to administrator command -psql:<stdin>:2: server closed the connection unexpectedly +like( $err, qr/psql:<stdin>:2: FATAL: terminating connection due to administrator command +(?:DETAIL: signal sent by PID \d+, UID \d+.\n)?psql:<stdin>:2: server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. 
-psql:<stdin>:2: error: connection to server was lost', +psql:<stdin>:2: error: connection to server was lost/, 'server crash: error message'); # test \errverbose diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index f16f35659b..63256eff84 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -90,6 +90,8 @@ extern PGDLLIMPORT volatile sig_atomic_t InterruptPending; extern PGDLLIMPORT volatile sig_atomic_t QueryCancelPending; extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending; +extern PGDLLIMPORT volatile sig_atomic_t proc_die_sender_pid; +extern PGDLLIMPORT volatile sig_atomic_t proc_die_sender_uid; extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t TransactionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending; diff --git a/src/port/pqsignal.c b/src/port/pqsignal.c index fbdf9341c2..bb632285f4 100644 --- a/src/port/pqsignal.c +++ b/src/port/pqsignal.c @@ -82,10 +82,18 @@ static volatile pqsigfunc pqsignal_handlers[PG_NSIG]; * * This wrapper also handles restoring the value of errno. 
*/ +#if !defined(FRONTEND) && !defined(WIN32) +static void +wrapper_handler(int signo, siginfo_t *info, void *context) +#else static void wrapper_handler(SIGNAL_ARGS) +#endif { int save_errno = errno; +#if !defined(FRONTEND) && !defined(WIN32) + int postgres_signal_arg = signo; +#endif Assert(postgres_signal_arg > 0); Assert(postgres_signal_arg < PG_NSIG); @@ -105,6 +113,14 @@ wrapper_handler(SIGNAL_ARGS) raise(postgres_signal_arg); return; } + +#ifndef WIN32 + if (signo == SIGTERM && info) + { + proc_die_sender_pid = info->si_pid; + proc_die_sender_uid = info->si_uid; + } +#endif #endif (*pqsignal_handlers[postgres_signal_arg]) (postgres_signal_arg); @@ -125,6 +141,7 @@ pqsignal(int signo, pqsigfunc func) #if !(defined(WIN32) && defined(FRONTEND)) struct sigaction act; #endif + bool use_wrapper = false; Assert(signo > 0); Assert(signo < PG_NSIG); @@ -132,13 +149,26 @@ pqsignal(int signo, pqsigfunc func) if (func != SIG_IGN && func != SIG_DFL) { pqsignal_handlers[signo] = func; /* assumed atomic */ - func = wrapper_handler; + use_wrapper = true; } #if !(defined(WIN32) && defined(FRONTEND)) - act.sa_handler = func; sigemptyset(&act.sa_mask); act.sa_flags = SA_RESTART; +#if !defined(FRONTEND) && !defined(WIN32) + if (use_wrapper) + { + act.sa_sigaction = wrapper_handler; + act.sa_flags |= SA_SIGINFO; + } + else + { + act.sa_handler = func; + } +#else + act.sa_handler = use_wrapper ? wrapper_handler : func; +#endif + #ifdef SA_NOCLDSTOP if (signo == SIGCHLD) act.sa_flags |= SA_NOCLDSTOP; @@ -147,7 +177,7 @@ pqsignal(int signo, pqsigfunc func) Assert(false); /* probably indicates coding error */ #else /* Forward to Windows native signal system. */ - if (signal(signo, func) == SIG_ERR) + if (signal(signo, use_wrapper ? wrapper_handler : func) == SIG_ERR) Assert(false); /* probably indicates coding error */ #endif } -- 2.43.0
