On Wed, Dec 31, 2014 at 01:56:08PM -0500, Noah Misch wrote: > On Wed, Dec 31, 2014 at 12:32:37AM -0500, Robert Haas wrote: > > On Sun, Dec 28, 2014 at 4:58 PM, Noah Misch <n...@leadboat.com> wrote: > > > I wondered whether to downgrade FATAL to LOG in back branches. > > > Introducing a > > > new reason to block startup is disruptive for a minor release, but having > > > the > > > postmaster deadlock at an unpredictable later time is even more > > > disruptive. I > > > am inclined to halt startup that way in all branches. > > > > Jeepers. I'd rather not do that. From your report, this problem has > > been around for years. Yet, as far as I know, it's bothering very few > > real users, some of whom might be far more bothered by the postmaster > > suddenly failing to start. I'm fine with a FATAL in master, but I'd > > vote against doing anything that might prevent startup in the > > back-branches without more compelling justification. > > Clusters hosted on OS X fall into these categories: > > 1) Unaffected configuration. This includes everyone setting a valid messages > locale via LANG, LC_ALL or LC_MESSAGES. > 2) Affected configuration. Through luck and light use, the cluster would not > experience the crashes/hangs. > 3) Cluster would experience the crashes/hangs. > > DBAs in (3) want the FATAL at startup, but those in (2) want a LOG message > instead. DBAs in (1) don't care. Since intermittent postmaster hangs are far > worse than startup failure, if (2) and (3) have similar population, FATAL is > the better bet. If (2) is sufficiently more populous than (3), then the many > small pricks from startup failure do add up to hurt more than the occasional > postmaster hang. Who knows how that calculation plays out.
The first attached patch, for all branches, adds LOG-level messages and an assertion. So cassert builds will fail hard, while others won't. The second patch, for master only, changes the startup-time message to FATAL. If we decide to use FATAL in all branches, I would just squash them into one.
diff --git a/configure b/configure index 7594401..f7dedc0 100755 --- a/configure +++ b/configure @@ -11366,7 +11366,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l +for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.in b/configure.in index 0dc3f18..76c6405 100644 --- a/configure.in +++ b/configure.in @@ -1265,7 +1265,7 @@ PGAC_FUNC_GETTIMEOFDAY_1ARG LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l]) +AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 5106f52..6b0190c 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -87,6 +87,10 @@ #include <dns_sd.h> #endif +#ifdef HAVE_PTHREAD_IS_THREADED_NP +#include <pthread.h> +#endif + #include "access/transam.h" #include "access/xlog.h" #include "bootstrap/bootstrap.h" @@ -1200,6 +1204,24 @@ PostmasterMain(int argc, char *argv[]) */ RemovePgTempFiles(); +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * On Darwin, libintl replaces setlocale() with a version that calls + * CFLocaleCopyCurrent() when its second argument is "" and every relevant + * environment variable is unset or empty. CFLocaleCopyCurrent() makes + * the process multithreaded. The postmaster calls sigprocmask() and + * calls fork() without an immediate exec(), both of which have undefined + * behavior in a multithreaded program. A multithreaded postmaster is the + * normal case on Windows, which offers neither fork() nor sigprocmask(). + */ + if (pthread_is_threaded_np() != 0) + ereport(LOG, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("postmaster became multithreaded during startup"), + errhint("Set the LC_ALL environment variable to a valid locale."))); +#endif + /* * Remember postmaster startup time */ @@ -1657,6 +1679,15 @@ ServerLoop(void) last_touch_time = now; } +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * With assertions enabled, check regularly for appearance of + * additional threads. All builds check at start and exit. + */ + Assert(pthread_is_threaded_np() == 0); +#endif + /* * If we already sent SIGQUIT to children and they are slow to shut * down, it's time to send them SIGKILL. This doesn't happen @@ -4745,6 +4776,18 @@ SubPostmasterMain(int argc, char *argv[]) static void ExitPostmaster(int status) { +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * There is no known cause for a postmaster to become multithreaded after + * startup. Recheck to account for the possibility of unknown causes. + */ + if (pthread_is_threaded_np() != 0) + ereport(LOG, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("postmaster became multithreaded"))); +#endif + /* should cleanup shared memory and kill all backends */ /* diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 465281c..83f04e9 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -394,6 +394,9 @@ /* Define to 1 if the PS_STRINGS thing exists. */ #undef HAVE_PS_STRINGS +/* Define to 1 if you have the `pthread_is_threaded_np' function. */ +#undef HAVE_PTHREAD_IS_THREADED_NP + /* Define to 1 if you have the <pwd.h> header file. */ #undef HAVE_PWD_H
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 6b0190c..96d54e8 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1216,7 +1216,7 @@ PostmasterMain(int argc, char *argv[]) * normal case on Windows, which offers neither fork() nor sigprocmask(). */ if (pthread_is_threaded_np() != 0) - ereport(LOG, + ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("postmaster became multithreaded during startup"), errhint("Set the LC_ALL environment variable to a valid locale."))); @@ -4781,11 +4781,14 @@ ExitPostmaster(int status) /* * There is no known cause for a postmaster to become multithreaded after * startup. Recheck to account for the possibility of unknown causes. + * This message uses LOG level, because an unclean shutdown at this point + * would usually not look much different from a clean shutdown. */ if (pthread_is_threaded_np() != 0) ereport(LOG, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("postmaster became multithreaded"))); + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("postmaster became multithreaded"), + errdetail("Please report this to <pgsql-b...@postgresql.org>."))); #endif /* should cleanup shared memory and kill all backends */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers