On Sat, Sep 24, 2011 at 22:57, Marti Raudsepp <ma...@juffo.org> wrote:
> However, I now realize that it does make sense to write a separate
> simpler function for the crashed backend case with no
> vbeentry->st_changecount check loops, no checkUser, etc. That would be
> more robust and easier to review.

I implemented this now, but I'm not convinced anymore that it's the
right way to go. I'm duplicating some amount of code that could be
subject to bitrot in the future since this code path won't be
excercised often. But I'll let the reviewers decide.

Is there a sane way to regression-test backend crashes?

> I propsed replacing non-ASCII characters with '?' earlier.

This is also in. I created a new function in
backend/utils/adt/ascii.c. It didn't quite fit in because all other
functions in this file are dealing with Datums, but I couldn't find a
better place.

(I'm still not sure what "adt" means)

Regards,
Marti
From 0f46bb1357bafbe940e7df8fce38c01e2237f57e Mon Sep 17 00:00:00 2001
From: Marti Raudsepp <ma...@juffo.org>
Date: Wed, 28 Sep 2011 00:46:48 +0300
Subject: [PATCH] Log crashed backend's query (activity string)

The crashing query is often a good starting point in debugging the
cause, and much more easily accessible than core dumps.

We're extra-paranoid in reading the activity buffer since it might be
corrupt. All non-ASCII characters are replaced with '?'

Example output:
LOG:  server process (PID 31451) was terminated by signal 9: Killed
DETAIL:  Running query: DO LANGUAGE plpythonu 'import os;os.kill(os.getpid(),9)'
---
 src/backend/postmaster/pgstat.c     |   59 +++++++++++++++++++++++++++++++++++
 src/backend/postmaster/postmaster.c |   17 +++++++---
 src/backend/utils/adt/ascii.c       |   31 ++++++++++++++++++
 src/include/pgstat.h                |    1 +
 src/include/utils/ascii.h           |    1 +
 5 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index eb9adc8..812bf04 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -58,6 +58,7 @@
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
 #include "storage/procsignal.h"
+#include "utils/ascii.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
@@ -2747,6 +2748,64 @@ pgstat_get_backend_current_activity(int pid, bool checkUser)
 	return "<backend information not available>";
 }
 
+/* ----------
+ * pgstat_get_backend_current_activity() -
+ *
+ *	Return a string representing the current activity of the backend with
+ *	the specified PID.	Like the function above, but reads shared memory with
+ *	the expectation that it may be corrupt.
+ *
+ *	This function is only intended to be used by postmaster to report the
+ *	query that crashed the backend. In particular, no attempt is made to
+ *	follow the correct concurrency protocol when accessing the
+ *	BackendStatusArray. But that's OK, in the worst case we get a corrupted
+ *	message. We also must take care not to trip on ereport(ERROR).
+ *
+ *	Note: return strings for special cases match pg_stat_get_backend_activity.
+ * ----------
+ */
+const char *
+pgstat_get_crashed_backend_activity(int pid)
+{
+	volatile PgBackendStatus *beentry;
+	int			i;
+
+	beentry = BackendStatusArray;
+	for (i = 1; i <= MaxBackends; i++)
+	{
+		if (beentry->st_procpid == pid)
+		{
+			/* Read pointer just once, so it can't change after validation */
+			const char *activity = beentry->st_activity;
+			char	   *buffer;
+
+			/*
+			 * We can't access activity pointer before we verify that it
+			 * falls into BackendActivityBuffer. To make sure that the
+			 * string's ending is contained within the buffer, the string
+			 * can start at offset (MaxBackends - 1) at most.
+			 */
+			if (activity < BackendActivityBuffer ||
+				activity > (BackendActivityBuffer +
+						(MaxBackends - 1) * pgstat_track_activity_query_size))
+				return "<command string corrupt>";
+
+			if (*(activity) == '\0')
+				return "<command string not enabled>";
+
+			buffer = (char *) palloc(pgstat_track_activity_query_size);
+			ascii_safe_strncpy(buffer, activity,
+							   pgstat_track_activity_query_size);
+
+			return buffer;
+		}
+
+		beentry++;
+	}
+
+	/* PID not found */
+	return "<backend information not available>";
+}
 
 /* ------------------------------------------------------------
  * Local support functions follow
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 94b57fa..9ba622c 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2763,6 +2763,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 static void
 LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 {
+	const char   *activity = pgstat_get_crashed_backend_activity(pid);
+
 	if (WIFEXITED(exitstatus))
 		ereport(lev,
 
@@ -2770,7 +2772,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 		  translator: %s is a noun phrase describing a child process, such as
 		  "server process" */
 				(errmsg("%s (PID %d) exited with exit code %d",
-						procname, pid, WEXITSTATUS(exitstatus))));
+						procname, pid, WEXITSTATUS(exitstatus)),
+				 errdetail("Running query: %s", activity)));
 	else if (WIFSIGNALED(exitstatus))
 #if defined(WIN32)
 		ereport(lev,
@@ -2780,7 +2783,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 		  "server process" */
 				(errmsg("%s (PID %d) was terminated by exception 0x%X",
 						procname, pid, WTERMSIG(exitstatus)),
-				 errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value.")));
+				 errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."),
+				 errdetail("Running query: %s", activity)));
 #elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
 	ereport(lev,
 
@@ -2790,7 +2794,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 			(errmsg("%s (PID %d) was terminated by signal %d: %s",
 					procname, pid, WTERMSIG(exitstatus),
 					WTERMSIG(exitstatus) < NSIG ?
-					sys_siglist[WTERMSIG(exitstatus)] : "(unknown)")));
+					sys_siglist[WTERMSIG(exitstatus)] : "(unknown)"),
+			 errdetail("Running query: %s", activity)));
 #else
 		ereport(lev,
 
@@ -2798,7 +2803,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 		  translator: %s is a noun phrase describing a child process, such as
 		  "server process" */
 				(errmsg("%s (PID %d) was terminated by signal %d",
-						procname, pid, WTERMSIG(exitstatus))));
+						procname, pid, WTERMSIG(exitstatus)),
+				 errdetail("Running query: %s", activity)));
 #endif
 	else
 		ereport(lev,
@@ -2807,7 +2813,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 		  translator: %s is a noun phrase describing a child process, such as
 		  "server process" */
 				(errmsg("%s (PID %d) exited with unrecognized status %d",
-						procname, pid, exitstatus)));
+						procname, pid, exitstatus),
+				 errdetail("Running query: %s", activity)));
 }
 
 /*
diff --git a/src/backend/utils/adt/ascii.c b/src/backend/utils/adt/ascii.c
index 7ecf494..d59f6ec 100644
--- a/src/backend/utils/adt/ascii.c
+++ b/src/backend/utils/adt/ascii.c
@@ -158,3 +158,34 @@ to_ascii_default(PG_FUNCTION_ARGS)
 
 	PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
 }
+
+/* ----------
+ * convert a string in unknown encoding to ASCII. Replace non-ASCII bytes
+ * with '?'
+ * This must not trigger ereport(ERROR), as this is called from postmaster.
+ * ----------
+ */
+void
+ascii_safe_strncpy(char *dest, const char *src, int len)
+{
+	int			i;
+
+	for (i = 0; i < (len - 1); i++)
+	{
+		char ch = src[i];
+
+		if (ch == '\0')
+			break;
+		/* Keep printable ASCII characters */
+		if (32 <= ch && ch <= 127)
+			dest[i] = src[i];
+		/* White-space is also OK */
+		else if (ch == '\n' || ch == '\r' || ch == '\t')
+			dest[i] = src[i];
+		/* Everything else is replaced with '?' */
+		else
+			dest[i] = '?';
+	}
+
+	dest[i] = '\0';
+}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 20c4d43..9694896 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -720,6 +720,7 @@ extern void pgstat_report_appname(const char *appname);
 extern void pgstat_report_xact_timestamp(TimestampTz tstamp);
 extern void pgstat_report_waiting(bool waiting);
 extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser);
+extern const char *pgstat_get_crashed_backend_activity(int pid);
 
 extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id);
 extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id);
diff --git a/src/include/utils/ascii.h b/src/include/utils/ascii.h
index ad70e03..adeddb5 100644
--- a/src/include/utils/ascii.h
+++ b/src/include/utils/ascii.h
@@ -16,5 +16,6 @@
 extern Datum to_ascii_encname(PG_FUNCTION_ARGS);
 extern Datum to_ascii_enc(PG_FUNCTION_ARGS);
 extern Datum to_ascii_default(PG_FUNCTION_ARGS);
+extern void ascii_safe_strncpy(char *dest, const char *src, int len);
 
 #endif   /* _ASCII_H_ */
-- 
1.7.6.3

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to