Hi all
I recently wrote a utility that adds a $node->gdb_backends() method to
PostgresNode instances - figured I'd share it here in case anyone finds it
useful, or wants to adopt it into the features of the TAP tools.
This function provides a one-line way to dump stacks for all running
backends, along with the values of various global variables that are
potentially of interest, to per-pid files or to the main test log. A default
set of globals is dumped for each backend, and the caller can specify
additional expressions of interest.
If requested, cores will be dumped for each running backend.
A subset of backends may be passed by pid instead, so you can easily target
specific backends you're interested in.
I initially wrote this to help debug a variety of issues with shutdown. In
my wrapper class for PostgresNode, I overrode the stop() method to trap
failed shutdowns and report stacks for all surviving processes plus the
postmaster:
# Stop the node exactly as PostgresNode::stop does, but if the shutdown
# fails, dump backtraces and cores for every surviving process (and the
# postmaster) before rethrowing the original error.
#
# $mode is passed through to PostgresNode::stop unchanged.
sub stop {
    my ($self, $mode) = @_;

    my $stop_error;
    {
        # Keep the caller's $@ untouched on the success path, and copy any
        # failure into a lexical so later evals cannot clobber it.
        local $@;
        if (!eval { PostgresNode::stop($self, $mode); 1 }) {
            $stop_error = $@ || 'PostgresNode::stop failed with an unknown error';
        }
    }
    return unless defined($stop_error);

    # Diagnostics are best-effort: a failure to collect them must not mask
    # the shutdown failure we are about to report.
    if (!eval { $self->gdb_backends(want_cores => 1); 1 }) {
        print("gdb_backends failed while handling a failed stop: $@\n");
    }

    die $stop_error;
}
#
# This is an excerpt from a subclass of PostgresNode
#
# Generate backtraces, dumps of interesting globals, and optionally core
# files for processes belonging to this node. Requires gdb to be present,
# and pgrep too unless an explicit pid list is passed.
#
# Keyword arguments (all optional):
#
#   pids                 Arrayref of pids, or a scalar string of
#                        whitespace/newline separated pids. When omitted,
#                        every child of the postmaster plus the postmaster
#                        itself is probed.
#   print_exprs          Arrayref of extra gdb expressions to print in
#                        addition to the built-in list.
#   want_cores           If true, also dump a core for each process.
#                        Default 0.
#   core_name_pattern    Path for each core file. Default 'core.{{pid}}'.
#   gdb_logfile_pattern  Path for each process's gdb output. Default '',
#                        meaning gdb's output is not redirected.
#   backtrace_timeout_s  Seconds allowed for each gdb run. Default 60.
#   core_timeout_s       Extra seconds allowed per gdb run when want_cores
#                        is set. Default 60.
#
# Both patterns may contain {{pid}}, {{pmpid}} (postmaster pid) and
# {{name}} (node name) placeholders.
#
# Dies if 'pids' is neither undef, an arrayref nor a plain scalar. Failures
# of individual gdb runs (including timeouts) are reported and skipped so
# that the remaining processes are still examined.
sub gdb_backends {
    my ($self, %kwargs) = @_;
    $kwargs{backtrace_timeout_s} //= '60';
    $kwargs{core_timeout_s} //= '60';
    $kwargs{want_cores} //= 0;
    $kwargs{core_name_pattern} //= 'core.{{pid}}';
    $kwargs{gdb_logfile_pattern} //= '';

    # gdb writes straight to our inherited stdout, so our own output must
    # not sit in a buffer or the BACKTRACE markers end up out of order.
    local $| = 1;

    my $postmaster_pid = $self->{_pid};
    my $pgname = $self->name;
    # Used wherever the postmaster pid is interpolated, so a node without a
    # known postmaster does not produce uninitialized-value warnings.
    my $pmpid_label = $postmaster_pid // 'unknown';

    # Globals
    # TODO make these conditional on an expression to filter them.
    # TODO handle statics that vary across files
    # TODO add typecasts for when we don't have debuginfo
    # TODO useful GUCs
    #
    my @print_exprs = (
        # All backends
        'IsPostmasterEnvironment',
        'IsUnderPostmaster',
        'PostmasterPid',
        'LocalRecoveryInProgress',
        '*MyProc',
        'MyAuxProcType',
        '*XLogCtl',
        '*ControlFile',
        # Generic signal handling
        'InterruptPending',
        'ProcDiePending',
        'ShutdownRequestPending',
        'ConfigReloadPending',
        # user backend / postgres
        'xact_started',
        'doing_extended_query_message',
        'ignore_till_sync',
        # startup process
        'ThisTimeLineID',
        'LastRec',
        'ArchiveRecoveryRequested',
        'InArchiveRecovery',
        'PrimaryConnInfo',
        'PrimarySlotName',
        'StandbyMode',
        # autovac
        'am_autovacuum_launcher',
        'am_autovacuum_worker',
        'got_SIGHUP',
        'got_SIGUSR2',
        'got_SIGTERM',
        "'autovacuum.c':got_SIGTERM",
        # for walsenders
        'am_walsender',
        'am_cascading_walsender',
        'am_db_walsender',
        '*MyWalSnd',
        '*xlogreader',
        'sendTimeLine',
        'sentPtr',
        'streamingDoneSending',
        'streamingDoneReceiving',
        "'walsender.c':got_SIGTERM",
        'got_STOPPING',
        'got_SIGUSR2',
        'replication_active',
        '*logical_decoding_ctx',
        'logical_startptr',
        # walreceiver
        'recvFileTLI',
        '*wrconn',
        # checkpointer
        '*CheckpointerShmem',
        'last_checkpoint_time',
        'ckpt_active',
        # for bgworkers
        'IsBackgroundWorker',
        # for pgl backends
        '*MyPGLogicalWorker',
        '*MyPGLSubscription',
        # for bdr backends
        '*MyBdrSubscription',
        # postmaster
        'pmState',
    );

    # Add your own print expressions by passing
    # print_exprs => ['var1', 'var2']
    push @print_exprs, @{$kwargs{print_exprs}}
      if (defined($kwargs{print_exprs}));

    # Some expressions are listed under more than one heading (and callers
    # may repeat built-in ones); print each only once, keeping first-seen
    # order.
    my %seen_expr;
    @print_exprs = grep { !$seen_expr{$_}++ } @print_exprs;

    my @pids;
    if (defined($kwargs{pids})) {
        if (ref($kwargs{pids}) eq 'ARRAY') {
            # arrayref pid-list
            @pids = @{$kwargs{pids}};
        } elsif (ref($kwargs{pids}) eq '') {
            # Scalar pid-list. Splitting on ' ' discards leading, trailing
            # and repeated whitespace, so CRLF input yields no empty pids.
            @pids = split(' ', $kwargs{pids});
        } else {
            die("keyword argument 'pids' must be undef, an arrayref, or a scalar string of pids");
        }
    } else {
        # Probe all children. Default if no pid list passed.
        #
        # We can't rely on querying the db because it might be shutting
        # down, so we don't want to use pg_stat_activity and
        # pg_stat_replication. Use the postmaster pid instead, with pgrep.
        if (!defined($postmaster_pid)) {
            print("No postmaster pid is known for node $pgname; cannot look up its child processes.\n");
            return;
        }

        my ($stdout, $stderr) = ('', '');
        # IPC::Run::run throws if pgrep cannot be executed at all; report
        # that through the same "no children found" path below.
        if (!eval {
                IPC::Run::run(['pgrep', '--parent', $postmaster_pid],
                    '>', \$stdout, '2>', \$stderr);
                1;
            })
        {
            $stderr .= $@;
        }
        print("raw pid list: $stdout\n");
        @pids = split(' ', $stdout);
        if (scalar(@pids) == 0) {
            print("Failed to find child processes for pid $postmaster_pid. pgrep produced stdout \"$stdout\" and stderr \"$stderr\".\n");
            return;
        }
        # Include postmaster itself in the list
        push @pids, $postmaster_pid;
    }

    print("getting backtraces of children of postmaster $pmpid_label for node $pgname: @pids\n");

    # Expand the {{pmpid}}, {{pid}} and {{name}} placeholders of a path
    # pattern for one target process.
    my $expand_pattern = sub {
        my ($pattern, $pid) = @_;
        my %value_of = (pmpid => $pmpid_label, pid => $pid, name => $pgname);
        $pattern =~ s/\{\{(pmpid|pid|name)\}\}/$value_of{$1}/g;
        return $pattern;
    };

    # One gdb run does the backtrace and, if requested, the core dump, so
    # the core allowance is added on top of the backtrace allowance.
    my $timeout_s = $kwargs{backtrace_timeout_s};
    $timeout_s += $kwargs{core_timeout_s} if ($kwargs{want_cores});

    foreach my $pid (@pids) {
        # The pid is pasted into a gdb command script, so insist that it
        # really is a pid.
        if (!defined($pid) || $pid !~ /\A\d+\z/) {
            my $shown = $pid // 'undef';
            print("skipping invalid pid \"$shown\"\n");
            next;
        }

        my $core_path = $expand_pattern->($kwargs{core_name_pattern}, $pid);
        my $gdb_logfile = $expand_pattern->($kwargs{gdb_logfile_pattern}, $pid);
        my $to_logfile = ($gdb_logfile ne '');

        # Only create the directories that will actually be written to.
        make_path(dirname($core_path)) if ($kwargs{want_cores});
        make_path(dirname($gdb_logfile)) if ($to_logfile);

        # Each element is one line of the gdb command script. Single quotes
        # keep "\n" and "$_exitsignal" literal so gdb interprets them.
        my @gdb_lines = (
            'set prompt',
            'set style enabled off',
            'set pagination off',
            'set print pretty on',
            'set print max-depth 20',
            'set print frame-arguments all',
            'set print frame-info source-and-location',
            'set print entry-values if-needed',
            'set print symbol-filename on',
            'set print symbol-loading full',
            'set print type typedefs on',
            'set print symbol on',
            'set print array on',
            'set print array-indexes on',
            'set print elements 100',
            'set print null-stop on',
            "attach $pid",
            'info proc',
            'if ($_exitsignal)',
            'printf "Exited with signal: %d\n", $_exitsignal',
            'end',
            'printf "application_name = %s\n", application_name',
            'printf "debug_query_string = %s\n", debug_query_string',
            'echo \nbacktrace (short):\n',
            'bt',
            'echo \nbacktrace (extended):\n',
            'thread apply all bt full',
            'printf "\n\nEXPRESSIONS:\n"',
        );

        for my $print_expr (@print_exprs) {
            # This prints annoying $nn convenience variable labels, but
            # there's not much to be done about that.
            #
            # The label is embedded in a printf format string, so escape
            # the characters that are special there; the expression itself
            # is handed to "p" untouched.
            my $label = $print_expr;
            $label =~ s/([\\"])/\\$1/g;
            $label =~ s/%/%%/g;
            push @gdb_lines, qq[printf "$label: "], "p $print_expr";
        }

        if ($kwargs{want_cores}) {
            push @gdb_lines, 'printf "\n\n"', "gcore $core_path";
        }

        push @gdb_lines, 'quit 0';
        my $gdbcmds = join("\n", @gdb_lines) . "\n";

        # TODO: recursively expand some of the target vars
        #
        # Hint: if you want to run REALLY REALLY SLOW you can also get
        # state of globals. We should possibly do this but it's not easy
        # to filter out the libc stuff etc, and requires some
        # back-and-forth with gdb.
        #
        my @log_arg = ();
        if ($to_logfile) {
            print("Writing gdb log to ${gdb_logfile}\n");
            @log_arg = ('&>', $gdb_logfile);
        }

        print("--BACKTRACE-START-- $pid\n") unless ($to_logfile);

        # A timeout (or a failure to launch gdb) is raised as an exception.
        # Catch it so one stuck process does not stop us examining the
        # rest.
        my $h;
        my $finished = eval {
            $h = IPC::Run::start(['gdb'], '<', \$gdbcmds, @log_arg,
                IPC::Run::timeout($timeout_s));
            # finish() pumps all remaining I/O and reaps gdb.
            $h->finish;
            1;
        };
        my $gdb_error = $@;

        print("--BACKTRACE-END-- $pid\n") unless ($to_logfile);

        if (!$finished) {
            print("gdb failed or timed out for pid $pid: $gdb_error\n");
            # Don't leave a hung gdb attached to the target process.
            eval { $h->kill_kill; 1 } if (defined($h));
            next;
        }

        my $rc = $h->result(0);
        print("gdb exited with $rc\n") unless ($rc == 0);
    }

    print("all backtraces and (if requested) cores have been collected.\n");
    return;
}