Hi,

Bug #16427 mentioned that temporary files are not removed after a crash. I
heard similar complaints from some customers. In the bug report [1], I
proposed a new GUC to control whether temporary files are removed after a
crash recovery. The current behavior is only useful for debugging purposes.
It also has an undesirable behavior: you have to restart the service to
reclaim the space. Interrupting the service continuity isn't always an
option and due to limited resources, you have no choice but to restart the
service.

The new GUC cleanup_temp_files_after_crash is marked as SIGHUP. Hence, you
can enable it to debug without service interruption. The default value is
on which means it changes the current behavior. Documentation and tests are
included.

[1]
https://www.postgresql.org/message-id/CAH503wB28N1382YReXWjqpqZE6iqaxERoZUqnf02eNOYs0cZOA%40mail.gmail.com

Regards,

-- 
Euler Taveira                 http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
From bcd5dcc00cade3a80ee83f88b15f980ac7cdd0f5 Mon Sep 17 00:00:00 2001
From: Euler Taveira <euler.tave...@2ndquadrant.com>
Date: Mon, 25 May 2020 00:08:20 -0300
Subject: [PATCH] Control temporary files removal after crash

A new GUC cleanup_temp_files_after_crash controls whether temporary
files are removed after a crash. Successive crashes could result in
useless storage usage until service is restarted. It could be the case
on host with inadequate resources. This manual intervention for some
environments is not desirable. This GUC is marked as SIGHUP hence you
don't have to interrupt the service to change it.

The current behavior introduces a backward incompatibility (minor one)
which means Postgres will reclaim temporary file space after a crash.
The main reason is that we favor service continuity over debugging.
---
 doc/src/sgml/config.sgml                      |  17 ++
 src/backend/postmaster/postmaster.c           |   5 +
 src/backend/storage/file/fd.c                 |  12 +-
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   2 +
 src/include/postmaster/postmaster.h           |   1 +
 src/test/recovery/t/022_crash_temp_files.pl   | 194 ++++++++++++++++++
 7 files changed, 235 insertions(+), 5 deletions(-)
 create mode 100644 src/test/recovery/t/022_crash_temp_files.pl

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index f043433e31..9e6265afe1 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9553,6 +9553,23 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-cleanup-temp-files-after-crash" xreflabel="cleanup_temp_files_after_crash">
+      <term><varname>cleanup_temp_files_after_crash</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>cleanup_temp_files_after_crash</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        When set to on, <productname>PostgreSQL</productname> will automatically
+        remove temporary files after a backend crash. If disabled, sucessive
+        crashes (while queries are using temporary files) could result in
+        accumulation of useless files. However, in some circumstances (e.g. debugging),
+        you want to examine these temporary files. It defaults to <literal>on</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-data-sync-retry" xreflabel="data_sync_retry">
       <term><varname>data_sync_retry</varname> (<type>boolean</type>)
       <indexterm>
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 959e3b8873..c906f577f8 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -248,6 +248,7 @@ bool		Db_user_namespace = false;
 bool		enable_bonjour = false;
 char	   *bonjour_name;
 bool		restart_after_crash = true;
+bool		cleanup_temp_files_after_crash = true;
 
 /* PIDs of special child processes; 0 when not running */
 static pid_t StartupPID = 0,
@@ -4014,6 +4015,10 @@ PostmasterStateMachine(void)
 		ereport(LOG,
 				(errmsg("all server processes terminated; reinitializing")));
 
+		/* remove leftover temporary files after a crash */
+		if (cleanup_temp_files_after_crash)
+			RemovePgTempFiles();
+
 		/* allow background workers to immediately restart */
 		ResetBackgroundWorkerCrashTimes();
 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index bd72a87ee3..b5c87787a2 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2992,11 +2992,13 @@ CleanupTempFiles(bool isCommit, bool isProcExit)
  * remove any leftover files created by OpenTemporaryFile and any leftover
  * temporary relation files created by mdcreate.
  *
- * NOTE: we could, but don't, call this during a post-backend-crash restart
- * cycle.  The argument for not doing it is that someone might want to examine
- * the temp files for debugging purposes.  This does however mean that
- * OpenTemporaryFile had better allow for collision with an existing temp
- * file name.
+ * During post-backend-crash restart cycle, this routine could be called if
+ * cleanup_temp_files_after_crash GUC is enabled. Multiple crashes while
+ * queries are using temp files could result in useless storage usage that can
+ * only be reclaimed by a service restart. The argument against enabling it is
+ * that someone might want to examine the temporary files for debugging
+ * purposes. This does however mean that OpenTemporaryFile had better allow for
+ * collision with an existing temp file name.
  *
  * NOTE: this function and its subroutines generally report syscall failures
  * with ereport(LOG) and keep going.  Removing temp files is not so critical
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index a62d64eaa4..44f070cb66 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1358,6 +1358,15 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"cleanup_temp_files_after_crash", PGC_SIGHUP, ERROR_HANDLING_OPTIONS,
+			gettext_noop("Remove temporary files after backend crash."),
+			NULL
+		},
+		&cleanup_temp_files_after_crash,
+		true,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"log_duration", PGC_SUSET, LOGGING_WHAT,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 9cb571f7cc..a4c3439211 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -757,6 +757,8 @@
 
 #exit_on_error = off			# terminate session on any error?
 #restart_after_crash = on		# reinitialize after backend crash?
+#cleanup_temp_files_after_crash = on	# remove temporary files after
+#										# backend crash?
 #data_sync_retry = off			# retry or panic on failure to fsync
 					# data?
 					# (change requires restart)
diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h
index babc87dfc9..90cc7ccb62 100644
--- a/src/include/postmaster/postmaster.h
+++ b/src/include/postmaster/postmaster.h
@@ -29,6 +29,7 @@ extern bool log_hostname;
 extern bool enable_bonjour;
 extern char *bonjour_name;
 extern bool restart_after_crash;
+extern bool cleanup_temp_files_after_crash;
 
 #ifdef WIN32
 extern HANDLE PostmasterHandle;
diff --git a/src/test/recovery/t/022_crash_temp_files.pl b/src/test/recovery/t/022_crash_temp_files.pl
new file mode 100644
index 0000000000..dee3783a0c
--- /dev/null
+++ b/src/test/recovery/t/022_crash_temp_files.pl
@@ -0,0 +1,194 @@
+#
+# Test cleanup of temporary files after a crash.
+#
+# This is a description that I will add later.
+#
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More;
+use Config;
+use Time::HiRes qw(usleep);
+
+plan tests => 9;
+
+
+# To avoid hanging while expecting some specific input from a psql
+# instance being driven by us, add a timeout high enough that it
+# should never trigger even on very slow machines, unless something
+# is really wrong.
+my $psql_timeout = IPC::Run::timer(60);
+
+my $node = get_new_node('node_crash');
+$node->init();
+$node->start();
+
+# By default, PostgresNode doesn't restart after crash
+# Reduce work_mem to generate temporary file with a few number of rows
+$node->safe_psql(
+	'postgres',
+	q[ALTER SYSTEM SET cleanup_temp_files_after_crash = on;
+				   ALTER SYSTEM SET log_connections = 1;
+				   ALTER SYSTEM SET work_mem = '64kB';
+				   ALTER SYSTEM SET restart_after_crash = on;
+				   SELECT pg_reload_conf();]);
+
+# create table, insert rows
+$node->safe_psql(
+	'postgres',
+	q[CREATE TABLE tab_crash (a text);
+		INSERT INTO tab_crash (a) SELECT gen_random_uuid() FROM generate_series(1, 500);]);
+
+#
+# Test cleanup temporary files after crash
+#
+
+# Run psql, keeping session alive, so we have an alive backend to kill.
+my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
+my $killme = IPC::Run::start(
+	[
+		'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
+		$node->connstr('postgres')
+	],
+	'<',
+	\$killme_stdin,
+	'>',
+	\$killme_stdout,
+	'2>',
+	\$killme_stderr,
+	$psql_timeout);
+
+# Get backend pid
+$killme_stdin .= q[
+SELECT pg_backend_pid();
+];
+ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
+	'acquired pid for SIGKILL');
+my $pid = $killme_stdout;
+chomp($pid);
+$killme_stdout = '';
+$killme_stderr = '';
+
+# Run the query that generates a temporary file and that will be killed before
+# it finishes. Since the query that generates the temporary file does not
+# return before the connection is killed, use a SELECT before to trigger
+# pump_until.
+$killme_stdin .= q[
+BEGIN;
+SELECT $$in-progress-before-sigkill$$;
+WITH foo AS (SELECT a FROM tab_crash ORDER BY a) SELECT a, pg_sleep(1) FROM foo;
+];
+ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigkill/m),
+	'select in-progress-before-sigkill');
+$killme_stdout = '';
+$killme_stderr = '';
+
+# Kill with SIGKILL
+my $ret = TestLib::system_log('pg_ctl', 'kill', 'KILL', $pid);
+is($ret, 0, 'killed process with KILL');
+
+# Close psql session
+$killme->finish;
+
+# Wait till server restarts
+$node->poll_query_until('postgres', 'SELECT 1', '1');
+
+# Check for temporary files
+is($node->safe_psql(
+	'postgres',
+	'SELECT COUNT(1) FROM pg_ls_dir($$base/pgsql_tmp$$)'),
+	qq(0), 'no temporary files');
+
+#
+# Test old behavior (doesn't cleanup temporary file after crash)
+#
+$node->safe_psql(
+	'postgres',
+	q[ALTER SYSTEM SET cleanup_temp_files_after_crash = off;
+				   SELECT pg_reload_conf();]);
+
+# Restart psql session
+($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
+$killme->run();
+
+# Get backend pid
+$killme_stdin .= q[
+SELECT pg_backend_pid();
+];
+ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
+	'acquired pid for SIGKILL');
+$pid = $killme_stdout;
+chomp($pid);
+$killme_stdout = '';
+$killme_stderr = '';
+
+# Run the query that generates a temporary file and that will be killed before
+# it finishes. Since the query that generates the temporary file does not
+# return before the connection is killed, use a SELECT before to trigger
+# pump_until.
+$killme_stdin .= q[
+BEGIN;
+SELECT $$in-progress-before-sigkill$$;
+WITH foo AS (SELECT a FROM tab_crash ORDER BY a) SELECT a, $$in-progress-before-sigkill$$, pg_sleep(1) FROM foo;
+];
+ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigkill/m),
+	'select in-progress-before-sigkill');
+$killme_stdout = '';
+$killme_stderr = '';
+
+# Kill with SIGKILL
+$ret = TestLib::system_log('pg_ctl', 'kill', 'KILL', $pid);
+is($ret, 0, 'killed process with KILL');
+
+# Close psql session
+$killme->finish;
+
+# Wait till server restarts
+$node->poll_query_until('postgres', 'SELECT 1', '1');
+
+# Check for temporary files -- should be there
+is($node->safe_psql(
+	'postgres',
+	'SELECT COUNT(1) FROM pg_ls_dir($$base/pgsql_tmp$$)'),
+	qq(1), 'one temporary file');
+
+# Restart should remove the temporary files
+$node->restart();
+
+# Check the temporary files -- should be gone
+is($node->safe_psql(
+	'postgres',
+	'SELECT COUNT(1) FROM pg_ls_dir($$base/pgsql_tmp$$)'),
+	qq(0), 'temporary file was removed');
+
+$node->stop();
+
+# Pump until string is matched, or timeout occurs
+sub pump_until
+{
+	my ($proc, $stream, $untl) = @_;
+	$proc->pump_nb();
+	while (1)
+	{
+		last if $$stream =~ /$untl/;
+		if ($psql_timeout->is_expired)
+		{
+			diag("aborting wait: program timed out");
+			diag("stream contents: >>", $$stream, "<<");
+			diag("pattern searched for: ", $untl);
+
+			return 0;
+		}
+		if (not $proc->pumpable())
+		{
+			diag("aborting wait: program died");
+			diag("stream contents: >>", $$stream, "<<");
+			diag("pattern searched for: ", $untl);
+
+			return 0;
+		}
+		$proc->pump();
+	}
+	return 1;
+}
-- 
2.20.1

Reply via email to