Here's an updated patch.  It's mostly just rebased over the recent
firehose, but with lots of comments added and a few names (hopefully)
improved.  There is one code change to highlight though:

maybe_start_io_workers() knows when it's not allowed to create new
workers, an interesting case being FatalError before we have started
the new world.  The previous coding of DetermineSleepTime() didn't
know about that, so it could return 0 (don't sleep), and then the
postmaster could busy-wait for restart progress.  Maybe there were
other cases like that, but in general DetermineSleepTime() and
maybe_start_io_workers() really need to be 100% in agreement.  So I
have moved that knowledge into a new function
maybe_start_io_workers_scheduled_at().  Both DetermineSleepTime() and
maybe_start_io_workers() call that so there is a single source of
truth.

I think I got confused about that because it's not that obvious why
the existing code doesn't test FatalError.

I thought of a slightly bigger refactoring that might deconfuse
DetermineSleepTime() a bit more.  Probably material for the next
cycle, but basically the idea is to stop using a bunch of different
conditions and different units of time and convert the whole thing to
a simple find-the-lowest-time function.  I kept that separate.

I'll post a new version of the patch that was v3-0002 separately.
From ccc5b6fc9cf7d30359b015c953c04f481c66657e Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Mon, 6 Apr 2026 20:54:53 +1200
Subject: [PATCH v4 2/2] Refactor the postmaster's periodic job scheduling.

DetermineSleepTime() considers the following reasons for ServerLoop() to
wake up:

 * bgworker restart delay reached
 * I/O worker launch interval reached
 * SIGKILL timeout reached during immediate shutdown/crash restart
 * periodically checking the lock file
 * periodically touching socket files

To make it easier to follow:

 * move the next-bgworker-wakeup logic out to its own function
 * standardize the unit of timekeeping
 * convert DetermineSleepTime() to just: which is soonest?

As a side effect, the SIGKILL, lockfile and socket file duties are now
performed with more accurate timing.
---
 src/backend/postmaster/postmaster.c | 247 ++++++++++++++--------------
 1 file changed, 125 insertions(+), 122 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index c42564500c6..2a6887eb6c2 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -361,13 +361,24 @@ static PMState pmState = PM_INIT;
  */
 static bool connsAllowed = true;
 
-/* Start time of SIGKILL timeout during immediate shutdown or child crash */
-/* Zero means timeout is not running */
-static time_t AbortStartTime = 0;
+/* Special values for scheduling Postmaster duties at certain times. */
+#define PM_SCHEDULE_NEVER				TIMESTAMP_INFINITY
+#define PM_SCHEDULE_IMMEDIATELY			TIMESTAMP_MINUS_INFINITY
+
+/* Time of SIGKILL during immediate shutdown or child crash */
+static TimestampTz sigkill_children_scheduled_at = PM_SCHEDULE_NEVER;
 
 /* Length of said timeout */
 #define SIGKILL_CHILDREN_AFTER_SECS		5
 
+/* Time of next lockfile check and socket touch. */
+static TimestampTz lockfile_check_scheduled_at;
+static TimestampTz socket_touch_scheduled_at;
+
+/* Length of said timeouts */
+#define LOCKFILE_CHECK_SECS				60
+#define SOCKET_TOUCH_SECS				(58 * SECS_PER_MINUTE)
+
 static bool ReachedNormalRunning = false;	/* T if we've reached PM_RUN */
 
 bool		ClientAuthInProgress = false;	/* T during new-client
@@ -409,8 +420,8 @@ static DNSServiceRef bonjour_sdref = NULL;
 #endif
 
 /* State for IO worker management. */
-static TimestampTz io_worker_launch_next_time = 0;
-static TimestampTz io_worker_launch_last_time = 0;
+static TimestampTz io_worker_launch_next_time;
+static TimestampTz io_worker_launch_last_time;
 static int	io_worker_count = 0;
 static PMChild *io_worker_children[MAX_IO_WORKERS];
 
@@ -448,6 +459,7 @@ static void TerminateChildren(int signal);
 static int	CountChildren(BackendTypeMask targetMask);
 static void LaunchMissingBackgroundProcesses(void);
 static void maybe_start_bgworkers(void);
+static TimestampTz maybe_start_bgworkers_scheduled_at(void);
 static bool maybe_reap_io_worker(int pid);
 static void maybe_start_io_workers(void);
 static TimestampTz maybe_start_io_workers_scheduled_at(void);
@@ -1546,98 +1558,33 @@ checkControlFile(void)
 	FreeFile(fp);
 }
 
+static void
+compute_next_wakeup(TimestampTz *next_wakeup, TimestampTz wakeup)
+{
+	if (*next_wakeup > wakeup)
+		*next_wakeup = wakeup;
+}
+
 /*
  * Determine how long should we let ServerLoop sleep, in milliseconds.
- *
- * In normal conditions we wait at most one minute, to ensure that the other
- * background tasks handled by ServerLoop get done even when no requests are
- * arriving.  However, if there are background workers waiting to be started,
- * we don't actually sleep so that they are quickly serviced.  Other exception
- * cases are as shown in the code.
+ * Returns the time to wait for the next of ServerLoop()'s scheduled duties.
+ * The longest possible wait is one minute (LOCKFILE_CHECK_SECS), but it could
+ * be as low as zero if one of the jobs below is due/overdue now.
  */
 static int
 DetermineSleepTime(void)
 {
-	TimestampTz next_wakeup;
-
-	/*
-	 * If an ImmediateShutdown or a crash restart has set a SIGKILL timeout,
-	 * ignore everything else and wait for that.
-	 */
-	if (Shutdown >= ImmediateShutdown || FatalError)
-	{
-		if (AbortStartTime != 0)
-		{
-			time_t		curtime = time(NULL);
-			int			seconds;
-
-			/*
-			 * time left to abort; clamp to 0 if it already expired, or if
-			 * time goes backwards
-			 */
-			if (curtime < AbortStartTime ||
-				curtime - AbortStartTime >= SIGKILL_CHILDREN_AFTER_SECS)
-				seconds = 0;
-			else
-				seconds = SIGKILL_CHILDREN_AFTER_SECS -
-					(curtime - AbortStartTime);
-
-			return seconds * 1000;
-		}
-	}
-
-	/* Time of next maybe_start_io_workers() call, or 0 for none. */
-	next_wakeup = maybe_start_io_workers_scheduled_at();
-
-	/* Ignore bgworkers during shutdown. */
-	if (StartWorkerNeeded && Shutdown == NoShutdown)
-		return 0;
-
-	if (HaveCrashedWorker && Shutdown == NoShutdown)
-	{
-		dlist_mutable_iter iter;
-
-		/*
-		 * When there are crashed bgworkers, we sleep just long enough that
-		 * they are restarted when they request to be.  Scan the list to
-		 * determine the minimum of all wakeup times according to most recent
-		 * crash time and requested restart interval.
-		 */
-		dlist_foreach_modify(iter, &BackgroundWorkerList)
-		{
-			RegisteredBgWorker *rw;
-			TimestampTz this_wakeup;
-
-			rw = dlist_container(RegisteredBgWorker, rw_lnode, iter.cur);
-
-			if (rw->rw_crashed_at == 0)
-				continue;
-
-			if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART
-				|| rw->rw_terminate)
-			{
-				ForgetBackgroundWorker(rw);
-				continue;
-			}
+	TimestampTz next_wakeup = PM_SCHEDULE_NEVER;
 
-			this_wakeup = TimestampTzPlusMilliseconds(rw->rw_crashed_at,
-													  1000L * rw->rw_worker.bgw_restart_time);
-			if (next_wakeup == 0 || this_wakeup < next_wakeup)
-				next_wakeup = this_wakeup;
-		}
-	}
+	/* Find the time of the next scheduled ServerLoop() duty. */
+	compute_next_wakeup(&next_wakeup, sigkill_children_scheduled_at);
+	compute_next_wakeup(&next_wakeup, lockfile_check_scheduled_at);
+	compute_next_wakeup(&next_wakeup, socket_touch_scheduled_at);
+	compute_next_wakeup(&next_wakeup, maybe_start_io_workers_scheduled_at());
+	compute_next_wakeup(&next_wakeup, maybe_start_bgworkers_scheduled_at());
 
-	if (next_wakeup != 0)
-	{
-		int			ms;
-
-		/* result of TimestampDifferenceMilliseconds is in [0, INT_MAX] */
-		ms = (int) TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
-												   next_wakeup);
-		return Min(60 * 1000, ms);
-	}
-
-	return 60 * 1000;
+	/* result of TimestampDifferenceMilliseconds is in [0, INT_MAX] */
+	return TimestampDifferenceMilliseconds(GetCurrentTimestamp(), next_wakeup);
 }
 
 /*
@@ -1675,17 +1622,17 @@ ConfigurePostmasterWaitSet(bool accept_connections)
 static int
 ServerLoop(void)
 {
-	time_t		last_lockfile_recheck_time,
-				last_touch_time;
 	WaitEvent	events[MAXLISTEN];
 	int			nevents;
 
 	ConfigurePostmasterWaitSet(true);
-	last_lockfile_recheck_time = last_touch_time = time(NULL);
+
+	lockfile_check_scheduled_at = GetCurrentTimestamp();
+	socket_touch_scheduled_at = GetCurrentTimestamp();
 
 	for (;;)
 	{
-		time_t		now;
+		TimestampTz now;
 
 		nevents = WaitEventSetWait(pm_wait_set,
 								   DetermineSleepTime(),
@@ -1760,12 +1707,9 @@ ServerLoop(void)
 		/*
 		 * Lastly, check to see if it's time to do some things that we don't
 		 * want to do every single time through the loop, because they're a
-		 * bit expensive.  Note that there's up to a minute of slop in when
-		 * these tasks will be performed, since DetermineSleepTime() will let
-		 * us sleep at most that long; except for SIGKILL timeout which has
-		 * special-case logic there.
+		 * bit expensive.
 		 */
-		now = time(NULL);
+		now = GetCurrentTimestamp();
 
 		/*
 		 * If we already sent SIGQUIT to children and they are slow to shut
@@ -1776,10 +1720,10 @@ ServerLoop(void)
 		 *
 		 * Note we also do this during recovery from a process crash.
 		 */
-		if ((Shutdown >= ImmediateShutdown || FatalError) &&
-			AbortStartTime != 0 &&
-			(now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS)
+		if (now >= sigkill_children_scheduled_at)
 		{
+			Assert(Shutdown >= ImmediateShutdown || FatalError);
+
 			/* We were gentle with them before. Not anymore */
 			ereport(LOG,
 			/* translator: %s is SIGKILL or SIGABRT */
@@ -1787,7 +1731,7 @@ ServerLoop(void)
 							send_abort_for_kill ? "SIGABRT" : "SIGKILL")));
 			TerminateChildren(send_abort_for_kill ? SIGABRT : SIGKILL);
 			/* reset flag so we don't SIGKILL again */
-			AbortStartTime = 0;
+			sigkill_children_scheduled_at = PM_SCHEDULE_NEVER;
 		}
 
 		/*
@@ -1800,7 +1744,7 @@ ServerLoop(void)
 		 * starting a new postmaster.  Data corruption is likely to ensue from
 		 * that anyway, but we can minimize the damage by aborting ASAP.
 		 */
-		if (now - last_lockfile_recheck_time >= 1 * SECS_PER_MINUTE)
+		if (now >= lockfile_check_scheduled_at)
 		{
 			if (!RecheckDataDirLockFile())
 			{
@@ -1808,7 +1752,9 @@ ServerLoop(void)
 						(errmsg("performing immediate shutdown because data directory lock file is invalid")));
 				kill(MyProcPid, SIGQUIT);
 			}
-			last_lockfile_recheck_time = now;
+			lockfile_check_scheduled_at =
+				TimestampTzPlusSeconds(lockfile_check_scheduled_at,
+									   LOCKFILE_CHECK_SECS);
 		}
 
 		/*
@@ -1816,11 +1762,13 @@ ServerLoop(void)
 		 * they are not removed by overzealous /tmp-cleaning tasks.  We assume
 		 * no one runs cleaners with cutoff times of less than an hour ...
 		 */
-		if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
+		if (now >= socket_touch_scheduled_at)
 		{
 			TouchSocketFiles();
 			TouchSocketLockFiles();
-			last_touch_time = now;
+			socket_touch_scheduled_at =
+				TimestampTzPlusSeconds(socket_touch_scheduled_at,
+									   SOCKET_TOUCH_SECS);
 		}
 	}
 }
@@ -2231,7 +2179,9 @@ process_pm_shutdown_request(void)
 			UpdatePMState(PM_WAIT_BACKENDS);
 
 			/* set stopwatch for them to die */
-			AbortStartTime = time(NULL);
+			sigkill_children_scheduled_at =
+				TimestampTzPlusSeconds(GetCurrentTimestamp(),
+									   SIGKILL_CHILDREN_AFTER_SECS);
 
 			/*
 			 * Now wait for backends to exit.  If there are none,
@@ -2354,7 +2304,7 @@ process_pm_child_exit(void)
 			 */
 			StartupStatus = STARTUP_NOT_RUNNING;
 			FatalError = false;
-			AbortStartTime = 0;
+			sigkill_children_scheduled_at = PM_SCHEDULE_NEVER;
 			ReachedNormalRunning = true;
 			UpdatePMState(PM_RUN);
 			connsAllowed = true;
@@ -2815,8 +2765,10 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt)
 	 * .. and if this doesn't happen quickly enough, now the clock is ticking
 	 * for us to kill them without mercy.
 	 */
-	if (AbortStartTime == 0)
-		AbortStartTime = time(NULL);
+	if (sigkill_children_scheduled_at == PM_SCHEDULE_NEVER)
+		sigkill_children_scheduled_at =
+			TimestampTzPlusSeconds(GetCurrentTimestamp(),
+								   SIGKILL_CHILDREN_AFTER_SECS);
 }
 
 /*
@@ -3282,7 +3234,7 @@ PostmasterStateMachine(void)
 		Assert(StartupPMChild != NULL);
 		StartupStatus = STARTUP_RUNNING;
 		/* crash recovery started, reset SIGKILL flag */
-		AbortStartTime = 0;
+		sigkill_children_scheduled_at = PM_SCHEDULE_NEVER;
 
 		/* start accepting server socket connection events again */
 		ConfigurePostmasterWaitSet(true);
@@ -3759,7 +3711,7 @@ process_pm_pmsignal(void)
 	{
 		/* WAL redo has started. We're out of reinitialization. */
 		FatalError = false;
-		AbortStartTime = 0;
+		sigkill_children_scheduled_at = PM_SCHEDULE_NEVER;
 		reachedConsistency = false;
 
 		/*
@@ -4278,6 +4230,56 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
 	return false;
 }
 
+static TimestampTz
+maybe_start_bgworkers_scheduled_at(void)
+{
+	TimestampTz next_wakeup;
+
+	/* Background workers are ignored during shutdown. */
+	if (Shutdown != NoShutdown)
+		return PM_SCHEDULE_NEVER;
+
+	/* Do we need a worker right now? */
+	if (StartWorkerNeeded)
+		return PM_SCHEDULE_IMMEDIATELY;
+
+	next_wakeup = PM_SCHEDULE_NEVER;
+	if (HaveCrashedWorker)
+	{
+		dlist_mutable_iter iter;
+
+		/*
+		 * When there are crashed bgworkers, we sleep just long enough that
+		 * they are restarted when they request to be.  Scan the list to
+		 * determine the minimum of all wakeup times according to most recent
+		 * crash time and requested restart interval.
+		 */
+		dlist_foreach_modify(iter, &BackgroundWorkerList)
+		{
+			RegisteredBgWorker *rw;
+			TimestampTz this_wakeup;
+
+			rw = dlist_container(RegisteredBgWorker, rw_lnode, iter.cur);
+
+			if (rw->rw_crashed_at == 0)
+				continue;
+
+			if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART
+				|| rw->rw_terminate)
+			{
+				ForgetBackgroundWorker(rw);
+				continue;
+			}
+
+			this_wakeup = TimestampTzPlusMilliseconds(rw->rw_crashed_at,
+													  1000L * rw->rw_worker.bgw_restart_time);
+			compute_next_wakeup(&next_wakeup, this_wakeup);
+		}
+	}
+
+	return next_wakeup;
+}
+
 /*
  * If the time is right, start background worker(s).
  *
@@ -4423,8 +4425,8 @@ maybe_reap_io_worker(int pid)
 
 /*
  * Returns the next time at which maybe_start_io_workers() would start one or
- * more I/O workers.  Any time in the past means ASAP, and 0 means no worker
- * is currently scheduled.
+ * more I/O workers, or one of the special values PM_SCHEDULE_IMMEDIATELY and
+ * PM_SCHEDULE_NEVER.
  *
  * This is called by DetermineSleepTime() and also maybe_start_io_workers()
  * itself, to make sure that they agree.
@@ -4433,25 +4435,25 @@ static TimestampTz
 maybe_start_io_workers_scheduled_at(void)
 {
 	if (!pgaio_workers_enabled())
-		return 0;
+		return PM_SCHEDULE_NEVER;
 
 	/*
 	 * If we're in final shutting down state, then we're just waiting for all
 	 * processes to exit.
 	 */
 	if (pmState >= PM_WAIT_IO_WORKERS)
-		return 0;
+		return PM_SCHEDULE_NEVER;
 
 	/* Don't start new workers during an immediate shutdown either. */
 	if (Shutdown >= ImmediateShutdown)
-		return 0;
+		return PM_SCHEDULE_NEVER;
 
 	/*
 	 * Don't start new workers if we're in the shutdown phase of a crash
 	 * restart. But we *do* need to start if we're already starting up again.
 	 */
 	if (FatalError && pmState >= PM_STOP_BACKENDS)
-		return 0;
+		return PM_SCHEDULE_NEVER;
 
 	/*
 	 * Don't start a worker if we're at or above the maximum.  (Excess workers
@@ -4459,15 +4461,15 @@ maybe_start_io_workers_scheduled_at(void)
 	 * until they are reaped.)
 	 */
 	if (io_worker_count >= io_max_workers)
-		return 0;
+		return PM_SCHEDULE_NEVER;
 
 	/* If we're under the minimum, start a worker as soon as possible. */
 	if (io_worker_count < io_min_workers)
-		return TIMESTAMP_MINUS_INFINITY;	/* start worker ASAP */
+		return PM_SCHEDULE_IMMEDIATELY;
 
 	/* Only proceed if a "grow" request is pending from existing workers. */
 	if (!pgaio_worker_test_grow())
-		return 0;
+		return PM_SCHEDULE_NEVER;
 
 	/*
 	 * maybe_start_io_workers() should start a new I/O worker after this time,
@@ -4487,7 +4489,8 @@ maybe_start_io_workers(void)
 {
 	TimestampTz scheduled_at;
 
-	while ((scheduled_at = maybe_start_io_workers_scheduled_at()) != 0)
+	while ((scheduled_at = maybe_start_io_workers_scheduled_at()) !=
+		   PM_SCHEDULE_NEVER)
 	{
 		TimestampTz now = GetCurrentTimestamp();
 		PMChild    *child;
-- 
2.47.3

From 6c5d16a15add62c68bb7f9c7b6a1e3bde1f406d8 Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Sat, 22 Mar 2025 00:36:49 +1300
Subject: [PATCH v4 1/2] aio: Adjust I/O worker pool size automatically.

The size of the I/O worker pool used to implement io_method=worker was
previously controlled by the io_workers setting, defaulting to 3.  It
was hard to know how to tune it effectively.  It is now replaced with:

  io_min_workers=1
  io_max_workers=8 (up to 32)
  io_worker_idle_timeout=60s
  io_worker_launch_interval=100ms

The pool is automatically sized within the configured range according to
recent variation in demand.  It grows when existing workers detect a
backlog, and shrinks when the highest numbered worker is idle for too
long.  Work was already concentrated into low-numbered workers in
anticipation of this logic.

The logic for waking extra workers now also tries to measure and reduce
the number of spurious wakeups, though they are not entirely eliminated.

Reviewed-by: Dmitry Dolgov <[email protected]>
Discussion: https://postgr.es/m/CA%2BhUKG%2Bm4xV0LMoH2c%3DoRAdEXuCnh%2BtGBTWa7uFeFMGgTLAw%2BQ%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      |  69 ++-
 src/backend/postmaster/postmaster.c           | 161 ++++--
 src/backend/storage/aio/method_worker.c       | 505 +++++++++++++++---
 .../utils/activity/wait_event_names.txt       |   1 +
 src/backend/utils/misc/guc_parameters.dat     |  34 +-
 src/backend/utils/misc/postgresql.conf.sample |   6 +-
 src/include/storage/io_worker.h               |  10 +-
 src/include/storage/lwlocklist.h              |   1 +
 src/include/storage/pmsignal.h                |   1 +
 src/test/modules/test_aio/t/002_io_workers.pl |  15 +-
 src/tools/pgindent/typedefs.list              |   1 +
 11 files changed, 659 insertions(+), 145 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index b44231a362d..94eec85bd96 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2870,16 +2870,75 @@ include_dir 'conf.d'
        </listitem>
       </varlistentry>
 
-      <varlistentry id="guc-io-workers" xreflabel="io_workers">
-       <term><varname>io_workers</varname> (<type>integer</type>)
+      <varlistentry id="guc-io-min-workers" xreflabel="io_min_workers">
+       <term><varname>io_min_workers</varname> (<type>integer</type>)
        <indexterm>
-        <primary><varname>io_workers</varname> configuration parameter</primary>
+        <primary><varname>io_min_workers</varname> configuration parameter</primary>
        </indexterm>
        </term>
        <listitem>
         <para>
-         Selects the number of I/O worker processes to use. The default is
-         3. This parameter can only be set in the
+         Sets the minimum number of I/O worker processes. The default is
+         1. This parameter can only be set in the
+         <filename>postgresql.conf</filename> file or on the server command
+         line.
+        </para>
+        <para>
+         Only has an effect if <xref linkend="guc-io-method"/> is set to
+         <literal>worker</literal>.
+        </para>
+       </listitem>
+      </varlistentry>
+      <varlistentry id="guc-io-max-workers" xreflabel="io_max_workers">
+       <term><varname>io_max_workers</varname> (<type>int</type>)
+       <indexterm>
+        <primary><varname>io_max_workers</varname> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Sets the maximum number of I/O worker processes. The default is
+         8. This parameter can only be set in the
+         <filename>postgresql.conf</filename> file or on the server command
+         line.
+        </para>
+        <para>
+         Only has an effect if <xref linkend="guc-io-method"/> is set to
+         <literal>worker</literal>.
+        </para>
+       </listitem>
+      </varlistentry>
+      <varlistentry id="guc-io-worker-idle-timeout" xreflabel="io_worker_idle_timeout">
+       <term><varname>io_worker_idle_timeout</varname> (<type>int</type>)
+       <indexterm>
+        <primary><varname>io_worker_idle_timeout</varname> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Sets the time after which entirely idle I/O worker processes exit, reducing the
+         size of the pool to match demand.  The default is 1 minute.  This
+         parameter can only be set in the
+         <filename>postgresql.conf</filename> file or on the server command
+         line.
+        </para>
+        <para>
+         Only has an effect if <xref linkend="guc-io-method"/> is set to
+         <literal>worker</literal>.
+        </para>
+       </listitem>
+      </varlistentry>
+      <varlistentry id="guc-io-worker-launch-interval" xreflabel="io_worker_launch_interval">
+       <term><varname>io_worker_launch_interval</varname> (<type>int</type>)
+       <indexterm>
+        <primary><varname>io_worker_launch_interval</varname> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Sets the minimum time before another I/O worker can be launched.  This avoids
+         creating too many for an unsustained burst of activity.  The default is 100ms.
+         This parameter can only be set in the
          <filename>postgresql.conf</filename> file or on the server command
          line.
         </para>
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 6f13e8f40a0..c42564500c6 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -409,6 +409,8 @@ static DNSServiceRef bonjour_sdref = NULL;
 #endif
 
 /* State for IO worker management. */
+static TimestampTz io_worker_launch_next_time = 0;
+static TimestampTz io_worker_launch_last_time = 0;
 static int	io_worker_count = 0;
 static PMChild *io_worker_children[MAX_IO_WORKERS];
 
@@ -447,7 +449,8 @@ static int	CountChildren(BackendTypeMask targetMask);
 static void LaunchMissingBackgroundProcesses(void);
 static void maybe_start_bgworkers(void);
 static bool maybe_reap_io_worker(int pid);
-static void maybe_adjust_io_workers(void);
+static void maybe_start_io_workers(void);
+static TimestampTz maybe_start_io_workers_scheduled_at(void);
 static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
 static PMChild *StartChildProcess(BackendType type);
 static void StartSysLogger(void);
@@ -1391,7 +1394,7 @@ PostmasterMain(int argc, char *argv[])
 	UpdatePMState(PM_STARTUP);
 
 	/* Make sure we can perform I/O while starting up. */
-	maybe_adjust_io_workers();
+	maybe_start_io_workers();
 
 	/* Start bgwriter and checkpointer so they can help with recovery */
 	if (CheckpointerPMChild == NULL)
@@ -1555,14 +1558,13 @@ checkControlFile(void)
 static int
 DetermineSleepTime(void)
 {
-	TimestampTz next_wakeup = 0;
+	TimestampTz next_wakeup;
 
 	/*
-	 * Normal case: either there are no background workers at all, or we're in
-	 * a shutdown sequence (during which we ignore bgworkers altogether).
+	 * If an ImmediateShutdown or a crash restart has set a SIGKILL timeout,
+	 * ignore everything else and wait for that.
 	 */
-	if (Shutdown > NoShutdown ||
-		(!StartWorkerNeeded && !HaveCrashedWorker))
+	if (Shutdown >= ImmediateShutdown || FatalError)
 	{
 		if (AbortStartTime != 0)
 		{
@@ -1582,14 +1584,16 @@ DetermineSleepTime(void)
 
 			return seconds * 1000;
 		}
-		else
-			return 60 * 1000;
 	}
 
-	if (StartWorkerNeeded)
+	/* Time of next maybe_start_io_workers() call, or 0 for none. */
+	next_wakeup = maybe_start_io_workers_scheduled_at();
+
+	/* Ignore bgworkers during shutdown. */
+	if (StartWorkerNeeded && Shutdown == NoShutdown)
 		return 0;
 
-	if (HaveCrashedWorker)
+	if (HaveCrashedWorker && Shutdown == NoShutdown)
 	{
 		dlist_mutable_iter iter;
 
@@ -2542,7 +2546,17 @@ process_pm_child_exit(void)
 			if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
 				HandleChildCrash(pid, exitstatus, _("io worker"));
 
-			maybe_adjust_io_workers();
+			/*
+			 * A worker that exited with an error might have brought the pool
+			 * size below io_min_workers, or allowed the queue to grow to the
+			 * point where another worker called for growth.
+			 *
+			 * In the common case that a worker timed out due to idleness, no
+			 * replacement needs to be started.  maybe_start_io_workers() will
+			 * figure that out.
+			 */
+			maybe_start_io_workers();
+
 			continue;
 		}
 
@@ -3262,7 +3276,7 @@ PostmasterStateMachine(void)
 		UpdatePMState(PM_STARTUP);
 
 		/* Make sure we can perform I/O while starting up. */
-		maybe_adjust_io_workers();
+		maybe_start_io_workers();
 
 		StartupPMChild = StartChildProcess(B_STARTUP);
 		Assert(StartupPMChild != NULL);
@@ -3336,7 +3350,7 @@ LaunchMissingBackgroundProcesses(void)
 	 * A config file change will always lead to this function being called, so
 	 * we always will process the config change in a timely manner.
 	 */
-	maybe_adjust_io_workers();
+	maybe_start_io_workers();
 
 	/*
 	 * The checkpointer and the background writer are active from the start,
@@ -3797,6 +3811,15 @@ process_pm_pmsignal(void)
 		StartWorkerNeeded = true;
 	}
 
+	/* Process IO worker start requests. */
+	if (CheckPostmasterSignal(PMSIGNAL_IO_WORKER_GROW))
+	{
+		/*
+		 * No local flag, as the state is exposed through pgaio_worker_*()
+		 * functions.  This signal is received on potentially actionable level
+		 * changes, so that maybe_start_io_workers() will run.
+		 */
+	}
 	/* Process background worker state changes. */
 	if (CheckPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE))
 	{
@@ -4399,44 +4422,106 @@ maybe_reap_io_worker(int pid)
 }
 
 /*
- * Start or stop IO workers, to close the gap between the number of running
- * workers and the number of configured workers.  Used to respond to change of
- * the io_workers GUC (by increasing and decreasing the number of workers), as
- * well as workers terminating in response to errors (by starting
- * "replacement" workers).
+ * Returns the next time at which maybe_start_io_workers() would start one or
+ * more I/O workers.  Any time in the past means ASAP, and 0 means no worker
+ * is currently scheduled.
+ *
+ * This is called by DetermineSleepTime() and also maybe_start_io_workers()
+ * itself, to make sure that they agree.
  */
-static void
-maybe_adjust_io_workers(void)
+static TimestampTz
+maybe_start_io_workers_scheduled_at(void)
 {
 	if (!pgaio_workers_enabled())
-		return;
+		return 0;
 
 	/*
 	 * If we're in final shutting down state, then we're just waiting for all
 	 * processes to exit.
 	 */
 	if (pmState >= PM_WAIT_IO_WORKERS)
-		return;
+		return 0;
 
 	/* Don't start new workers during an immediate shutdown either. */
 	if (Shutdown >= ImmediateShutdown)
-		return;
+		return 0;
 
 	/*
 	 * Don't start new workers if we're in the shutdown phase of a crash
 	 * restart. But we *do* need to start if we're already starting up again.
 	 */
 	if (FatalError && pmState >= PM_STOP_BACKENDS)
-		return;
+		return 0;
+
+	/*
+	 * Don't start a worker if we're at or above the maximum.  (Excess workers
+	 * exit when the GUC is lowered, but the count can be temporarily too high
+	 * until they are reaped.)
+	 */
+	if (io_worker_count >= io_max_workers)
+		return 0;
+
+	/* If we're under the minimum, start a worker as soon as possible. */
+	if (io_worker_count < io_min_workers)
+		return TIMESTAMP_MINUS_INFINITY;	/* start worker ASAP */
+
+	/* Only proceed if a "grow" request is pending from existing workers. */
+	if (!pgaio_worker_test_grow())
+		return 0;
 
-	Assert(pmState < PM_WAIT_IO_WORKERS);
+	/*
+	 * maybe_start_io_workers() should start a new I/O worker after this time,
+	 * or as soon as possible if it is already in the past.
+	 */
+	return io_worker_launch_next_time;
+}
+
+/*
+ * Start I/O workers if required.  Used at startup, to respond to change of
+ * the io_min_workers GUC, when asked to start a new one due to submission
+ * queue backlog, and after workers terminate in response to errors (by
+ * starting "replacement" workers).
+ */
+static void
+maybe_start_io_workers(void)
+{
+	TimestampTz scheduled_at;
 
-	/* Not enough running? */
-	while (io_worker_count < io_workers)
+	while ((scheduled_at = maybe_start_io_workers_scheduled_at()) != 0)
 	{
+		TimestampTz now = GetCurrentTimestamp();
 		PMChild    *child;
 		int			i;
 
+		Assert(pmState < PM_WAIT_IO_WORKERS);
+
+		/* Still waiting for the scheduled time? */
+		if (scheduled_at > now)
+			break;
+
+		/* Clear the grow request flag if it is set. */
+		pgaio_worker_clear_grow();
+
+		/*
+		 * Compute next launch time relative to the previous value, so that
+		 * time spent on the postmaster's other duties doesn't result in an
+		 * inaccurate launch interval.
+		 */
+		io_worker_launch_next_time =
+			TimestampTzPlusMilliseconds(io_worker_launch_next_time,
+										io_worker_launch_interval);
+
+		/*
+		 * If that's already in the past, the interval is either impossibly
+		 * short or we received no requests for new workers for a period.
+		 * Compute a new future time relative to the last launch time instead.
+		 */
+		if (io_worker_launch_next_time <= now)
+			io_worker_launch_next_time =
+				TimestampTzPlusMilliseconds(io_worker_launch_last_time,
+											io_worker_launch_interval);
+		io_worker_launch_last_time = now;
+
 		/* find unused entry in io_worker_children array */
 		for (i = 0; i < MAX_IO_WORKERS; ++i)
 		{
@@ -4454,20 +4539,14 @@ maybe_adjust_io_workers(void)
 			++io_worker_count;
 		}
 		else
-			break;				/* try again next time */
-	}
-
-	/* Too many running? */
-	if (io_worker_count > io_workers)
-	{
-		/* ask the IO worker in the highest slot to exit */
-		for (int i = MAX_IO_WORKERS - 1; i >= 0; --i)
 		{
-			if (io_worker_children[i] != NULL)
-			{
-				kill(io_worker_children[i]->pid, SIGUSR2);
-				break;
-			}
+			/*
+			 * Fork failure: we'll try again after the launch interval
+			 * expires, or be called again without delay if we don't yet have
+			 * io_min_workers.  Don't loop here though, the postmaster has
+			 * other duties.
+			 */
+			break;
 		}
 	}
 }
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index eb686cede1a..863c7dc0104 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -11,9 +11,8 @@
  * infrastructure for reopening the file, and must processed synchronously by
  * the client code when submitted.
  *
- * So that the submitter can make just one system call when submitting a batch
- * of IOs, wakeups "fan out"; each woken IO worker can wake two more. XXX This
- * could be improved by using futexes instead of latches to wake N waiters.
+ * The pool tries to stabilize at a size that can handle recently seen
+ * variation in demand, within the configured limits.
  *
  * This method of AIO is available in all builds on all operating systems, and
  * is the default.
@@ -29,6 +28,8 @@
 
 #include "postgres.h"
 
+#include <limits.h>
+
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
@@ -40,6 +41,8 @@
 #include "storage/io_worker.h"
 #include "storage/ipc.h"
 #include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "storage/shmem.h"
 #include "tcop/tcopprot.h"
@@ -48,10 +51,11 @@
 #include "utils/ps_status.h"
 #include "utils/wait_event.h"
 
+/* Saturation for counters used to estimate wakeup:work ratio. */
+#define PGAIO_WORKER_STATS_MAX 4
 
-/* How many workers should each worker wake up if needed? */
-#define IO_WORKER_WAKEUP_FANOUT 2
-
+/* Debugging support: show current IO and wakeups:ios statistics in ps. */
+/* #define PGAIO_WORKER_SHOW_PS_INFO */
 
 typedef struct PgAioWorkerSubmissionQueue
 {
@@ -63,13 +67,34 @@ typedef struct PgAioWorkerSubmissionQueue
 
 typedef struct PgAioWorkerSlot
 {
-	Latch	   *latch;
-	bool		in_use;
+	ProcNumber	proc_number;
 } PgAioWorkerSlot;
 
+/*
+ * Sets of worker IDs are held in a simple bitmap, accessed through functions
+ * that provide a more readable abstraction.  If we wanted more workers than
+ * fit in 64 bits, contention on the single queue would surely get too high,
+ * so we might want to consider multiple pools instead of widening this.
+ */
+typedef uint64 PgAioWorkerSet;
+
+#define PGAIO_WORKER_SET_BITS (sizeof(PgAioWorkerSet) * CHAR_BIT)
+
+static_assert(PGAIO_WORKER_SET_BITS >= MAX_IO_WORKERS, "too small");
+
 typedef struct PgAioWorkerControl
 {
-	uint64		idle_worker_mask;
+	/* Seen by postmaster */
+	volatile bool grow;
+
+	/* Protected by AioWorkerSubmissionQueueLock. */
+	PgAioWorkerSet idle_worker_set;
+
+	/* Protected by AioWorkerControlLock. */
+	PgAioWorkerSet worker_set;
+	int			nworkers;
+
+	/* Protected by AioWorkerControlLock. */
 	PgAioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER];
 } PgAioWorkerControl;
 
@@ -91,15 +116,103 @@ const IoMethodOps pgaio_worker_ops = {
 
 
 /* GUCs */
-int			io_workers = 3;
+int			io_min_workers = 1;
+int			io_max_workers = 8;
+int			io_worker_idle_timeout = 60000;
+int			io_worker_launch_interval = 100;
 
 
 static int	io_worker_queue_size = 64;
-static int	MyIoWorkerId;
+static int	MyIoWorkerId = -1;
 static PgAioWorkerSubmissionQueue *io_worker_submission_queue;
 static PgAioWorkerControl *io_worker_control;
 
 
+static void
+pgaio_worker_set_initialize(PgAioWorkerSet *set)
+{
+	*set = 0;
+}
+
+static bool
+pgaio_worker_set_is_empty(const PgAioWorkerSet *set)
+{
+	return *set == 0;			/* no bit set means no member */
+}
+
+static PgAioWorkerSet
+pgaio_worker_set_singleton(int worker)
+{
+	return UINT64_C(1) << worker;
+}
+
+static void
+pgaio_worker_set_fill(PgAioWorkerSet *set)
+{
+	*set = UINT64_MAX >> (PGAIO_WORKER_SET_BITS - MAX_IO_WORKERS);
+}
+
+static void
+pgaio_worker_set_subtract(PgAioWorkerSet *set1, const PgAioWorkerSet *set2)
+{
+	*set1 &= ~*set2;
+}
+
+static void
+pgaio_worker_set_insert(PgAioWorkerSet *set, int worker)
+{
+	*set |= pgaio_worker_set_singleton(worker);
+}
+
+static void
+pgaio_worker_set_remove(PgAioWorkerSet *set, int worker)
+{
+	*set &= ~pgaio_worker_set_singleton(worker);
+}
+
+static void
+pgaio_worker_set_remove_less_than(PgAioWorkerSet *set, int worker)
+{
+	*set &= ~(pgaio_worker_set_singleton(worker) - 1);
+}
+
+static int
+pgaio_worker_set_get_highest(PgAioWorkerSet *set)
+{
+	Assert(!pgaio_worker_set_is_empty(set));
+	return pg_leftmost_one_pos64(*set);
+}
+
+static int
+pgaio_worker_set_get_lowest(PgAioWorkerSet *set)
+{
+	Assert(!pgaio_worker_set_is_empty(set));
+	return pg_rightmost_one_pos64(*set);
+}
+
+static int
+pgaio_worker_set_pop_lowest(PgAioWorkerSet *set)
+{
+	int			worker = pgaio_worker_set_get_lowest(set);
+
+	pgaio_worker_set_remove(set, worker);
+	return worker;
+}
+
+#ifdef USE_ASSERT_CHECKING
+static bool
+pgaio_worker_set_contains(const PgAioWorkerSet *set, int worker)
+{
+	return (*set & pgaio_worker_set_singleton(worker)) != 0;	/* read-only */
+}
+
+static int
+pgaio_worker_set_count(const PgAioWorkerSet *set)
+{
+	return pg_popcount64(*set); /* number of members; set is not modified */
+}
+#endif
+
 static void
 pgaio_worker_shmem_request(void *arg)
 {
@@ -133,37 +246,107 @@ pgaio_worker_shmem_init(void *arg)
 	io_worker_submission_queue->size = queue_size;
 	io_worker_submission_queue->head = 0;
 	io_worker_submission_queue->tail = 0;
+	io_worker_control->grow = false;
+	pgaio_worker_set_initialize(&io_worker_control->worker_set);
+	pgaio_worker_set_initialize(&io_worker_control->idle_worker_set);
 
-	io_worker_control->idle_worker_mask = 0;
 	for (int i = 0; i < MAX_IO_WORKERS; ++i)
+		io_worker_control->workers[i].proc_number = INVALID_PROC_NUMBER;
+}
+
+static void
+pgaio_worker_grow(bool grow)
+{
+	/*
+	 * This is called from sites that don't hold AioWorkerControlLock, but
+	 * these values change infrequently and an up-to-date value is not
+	 * required for this heuristic purpose.
+	 */
+	if (!grow)
+	{
+		/* Avoid dirtying memory if not already set. */
+		if (io_worker_control->grow)
+			io_worker_control->grow = false;
+	}
+	else
 	{
-		io_worker_control->workers[i].latch = NULL;
-		io_worker_control->workers[i].in_use = false;
+		/* Do nothing if request already pending. */
+		if (!io_worker_control->grow)
+		{
+			io_worker_control->grow = true;
+			SendPostmasterSignal(PMSIGNAL_IO_WORKER_GROW);
+		}
 	}
 }
 
+/*
+ * Called by the postmaster to check if a new worker is needed.
+ */
+bool
+pgaio_worker_test_grow(void)
+{
+	return io_worker_control && io_worker_control->grow;
+}
+
+/*
+ * Called by the postmaster to clear the grow flag.
+ */
+void
+pgaio_worker_clear_grow(void)
+{
+	if (io_worker_control)
+		io_worker_control->grow = false;
+}
+
 static int
-pgaio_worker_choose_idle(void)
+pgaio_worker_choose_idle(int minimum_worker)
 {
+	PgAioWorkerSet worker_set;
 	int			worker;
 
-	if (io_worker_control->idle_worker_mask == 0)
+	Assert(LWLockHeldByMeInMode(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE));
+
+	worker_set = io_worker_control->idle_worker_set;
+	pgaio_worker_set_remove_less_than(&worker_set, minimum_worker);
+	if (pgaio_worker_set_is_empty(&worker_set))
 		return -1;
 
-	/* Find the lowest bit position, and clear it. */
-	worker = pg_rightmost_one_pos64(io_worker_control->idle_worker_mask);
-	io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << worker);
-	Assert(io_worker_control->workers[worker].in_use);
+	/* Find the lowest numbered idle worker and mark it not idle. */
+	worker = pgaio_worker_set_get_lowest(&worker_set);
+	pgaio_worker_set_remove(&io_worker_control->idle_worker_set, worker);
 
 	return worker;
 }
 
+/*
+ * Try to wake a worker by setting its latch, to tell it there are IOs to
+ * process in the submission queue.
+ */
+static void
+pgaio_worker_wake(int worker)
+{
+	ProcNumber	proc_number;
+
+	/*
+	 * If the selected worker is concurrently exiting, then pgaio_worker_die()
+	 * had not yet removed it as of when we saw it in idle_worker_set.  That's
+	 * OK, because it will wake all remaining workers to close wakeup-vs-exit
+	 * races: *someone* will see the queued IO.  If there are no workers
+	 * running, the postmaster will start a new one.
+	 */
+	proc_number = io_worker_control->workers[worker].proc_number;
+	if (proc_number != INVALID_PROC_NUMBER)
+		SetLatch(&GetPGProcByNumber(proc_number)->procLatch);
+}
+
 static bool
 pgaio_worker_submission_queue_insert(PgAioHandle *ioh)
 {
 	PgAioWorkerSubmissionQueue *queue;
 	uint32		new_head;
 
+	Assert(LWLockHeldByMeInMode(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE));
+
 	queue = io_worker_submission_queue;
 	new_head = (queue->head + 1) & (queue->size - 1);
 	if (new_head == queue->tail)
@@ -185,6 +368,8 @@ pgaio_worker_submission_queue_consume(void)
 	PgAioWorkerSubmissionQueue *queue;
 	int			result;
 
+	Assert(LWLockHeldByMeInMode(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE));
+
 	queue = io_worker_submission_queue;
 	if (queue->tail == queue->head)
 		return -1;				/* empty */
@@ -201,6 +386,8 @@ pgaio_worker_submission_queue_depth(void)
 	uint32		head;
 	uint32		tail;
 
+	Assert(LWLockHeldByMeInMode(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE));
+
 	head = io_worker_submission_queue->head;
 	tail = io_worker_submission_queue->tail;
 
@@ -226,8 +413,7 @@ pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
 {
 	PgAioHandle **synchronous_ios = NULL;
 	int			nsync = 0;
-	Latch	   *wakeup = NULL;
-	int			worker;
+	int			worker = -1;
 
 	Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
 
@@ -252,19 +438,15 @@ pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
 				break;
 			}
 
-			if (wakeup == NULL)
-			{
-				/* Choose an idle worker to wake up if we haven't already. */
-				worker = pgaio_worker_choose_idle();
-				if (worker >= 0)
-					wakeup = io_worker_control->workers[worker].latch;
-
-				pgaio_debug_io(DEBUG4, staged_ios[i],
-							   "choosing worker %d",
-							   worker);
-			}
+			/* Choose one worker to wake for this batch. */
+			if (worker == -1)
+				worker = pgaio_worker_choose_idle(0);
 		}
 		LWLockRelease(AioWorkerSubmissionQueueLock);
+
+		/* Wake up chosen worker.  It will wake peers if necessary. */
+		if (worker != -1)
+			pgaio_worker_wake(worker);
 	}
 	else
 	{
@@ -273,9 +455,6 @@ pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
 		nsync = num_staged_ios;
 	}
 
-	if (wakeup)
-		SetLatch(wakeup);
-
 	/* Run whatever is left synchronously. */
 	if (nsync > 0)
 	{
@@ -295,14 +474,27 @@ pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
 static void
 pgaio_worker_die(int code, Datum arg)
 {
-	LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE);
-	Assert(io_worker_control->workers[MyIoWorkerId].in_use);
-	Assert(io_worker_control->workers[MyIoWorkerId].latch == MyLatch);
+	PgAioWorkerSet notify_set;
 
-	io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << MyIoWorkerId);
-	io_worker_control->workers[MyIoWorkerId].in_use = false;
-	io_worker_control->workers[MyIoWorkerId].latch = NULL;
+	LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE);
+	pgaio_worker_set_remove(&io_worker_control->idle_worker_set, MyIoWorkerId);
 	LWLockRelease(AioWorkerSubmissionQueueLock);
+
+	LWLockAcquire(AioWorkerControlLock, LW_EXCLUSIVE);
+	Assert(io_worker_control->workers[MyIoWorkerId].proc_number == MyProcNumber);
+	io_worker_control->workers[MyIoWorkerId].proc_number = INVALID_PROC_NUMBER;
+	Assert(pgaio_worker_set_contains(&io_worker_control->worker_set, MyIoWorkerId));
+	pgaio_worker_set_remove(&io_worker_control->worker_set, MyIoWorkerId);
+	notify_set = io_worker_control->worker_set;
+	Assert(io_worker_control->nworkers > 0);
+	io_worker_control->nworkers--;
+	Assert(pgaio_worker_set_count(&io_worker_control->worker_set) ==
+		   io_worker_control->nworkers);
+	LWLockRelease(AioWorkerControlLock);
+
+	/* Notify other workers on pool change. */
+	while (!pgaio_worker_set_is_empty(&notify_set))
+		pgaio_worker_wake(pgaio_worker_set_pop_lowest(&notify_set));
 }
 
 /*
@@ -312,33 +504,34 @@ pgaio_worker_die(int code, Datum arg)
 static void
 pgaio_worker_register(void)
 {
+	PgAioWorkerSet free_worker_set;
+	PgAioWorkerSet old_worker_set;
+
 	MyIoWorkerId = -1;
 
-	/*
-	 * XXX: This could do with more fine-grained locking. But it's also not
-	 * very common for the number of workers to change at the moment...
-	 */
-	LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE);
+	LWLockAcquire(AioWorkerControlLock, LW_EXCLUSIVE);
+	pgaio_worker_set_fill(&free_worker_set);
+	pgaio_worker_set_subtract(&free_worker_set, &io_worker_control->worker_set);
+	if (!pgaio_worker_set_is_empty(&free_worker_set))
+		MyIoWorkerId = pgaio_worker_set_get_lowest(&free_worker_set);
+	if (MyIoWorkerId == -1)
+		elog(ERROR, "couldn't find a free worker ID");
 
-	for (int i = 0; i < MAX_IO_WORKERS; ++i)
-	{
-		if (!io_worker_control->workers[i].in_use)
-		{
-			Assert(io_worker_control->workers[i].latch == NULL);
-			io_worker_control->workers[i].in_use = true;
-			MyIoWorkerId = i;
-			break;
-		}
-		else
-			Assert(io_worker_control->workers[i].latch != NULL);
-	}
+	Assert(io_worker_control->workers[MyIoWorkerId].proc_number ==
+		   INVALID_PROC_NUMBER);
+	io_worker_control->workers[MyIoWorkerId].proc_number = MyProcNumber;
 
-	if (MyIoWorkerId == -1)
-		elog(ERROR, "couldn't find a free worker slot");
+	old_worker_set = io_worker_control->worker_set;
+	Assert(!pgaio_worker_set_contains(&old_worker_set, MyIoWorkerId));
+	pgaio_worker_set_insert(&io_worker_control->worker_set, MyIoWorkerId);
+	io_worker_control->nworkers++;
+	Assert(pgaio_worker_set_count(&io_worker_control->worker_set) ==
+		   io_worker_control->nworkers);
+	LWLockRelease(AioWorkerControlLock);
 
-	io_worker_control->idle_worker_mask |= (UINT64_C(1) << MyIoWorkerId);
-	io_worker_control->workers[MyIoWorkerId].latch = MyLatch;
-	LWLockRelease(AioWorkerSubmissionQueueLock);
+	/* Notify other workers on pool change. */
+	while (!pgaio_worker_set_is_empty(&old_worker_set))
+		pgaio_worker_wake(pgaio_worker_set_pop_lowest(&old_worker_set));
 
 	on_shmem_exit(pgaio_worker_die, 0);
 }
@@ -364,14 +557,48 @@ pgaio_worker_error_callback(void *arg)
 	errcontext("I/O worker executing I/O on behalf of process %d", owner_pid);
 }
 
+/*
+ * Check if this backend is allowed to time out, and thus should use a
+ * non-infinite sleep time.  Only the highest-numbered worker is allowed to
+ * time out, and only if the pool is above io_min_workers.  Serializing
+ * timeouts keeps IDs in a range 0..N without gaps, and avoids undershooting
+ * io_min_workers.
+ *
+ * The result is only instantaneously true and may be temporarily inconsistent
+ * in different workers around transitions, but all workers are woken up on
+ * pool size or GUC changes making the result eventually consistent.
+ */
+static bool
+pgaio_worker_can_timeout(void)
+{
+	PgAioWorkerSet worker_set;
+
+	/* Serialize against pool size changes. */
+	LWLockAcquire(AioWorkerControlLock, LW_SHARED);
+	worker_set = io_worker_control->worker_set;
+	LWLockRelease(AioWorkerControlLock);
+
+	if (MyIoWorkerId != pgaio_worker_set_get_highest(&worker_set))
+		return false;
+
+	if (MyIoWorkerId < io_min_workers)
+		return false;
+
+	return true;
+}
+
 void
 IoWorkerMain(const void *startup_data, size_t startup_data_len)
 {
 	sigjmp_buf	local_sigjmp_buf;
+	TimestampTz idle_timeout_abs = 0;
+	int			timeout_guc_used = 0;
 	PgAioHandle *volatile error_ioh = NULL;
 	ErrorContextCallback errcallback = {0};
 	volatile int error_errno = 0;
 	char		cmd[128];
+	int			ios = 0;
+	int			wakeups = 0;
 
 	AuxiliaryProcessMainCommon();
 
@@ -439,10 +666,9 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 	while (!ShutdownRequestPending)
 	{
 		uint32		io_index;
-		Latch	   *latches[IO_WORKER_WAKEUP_FANOUT];
-		int			nlatches = 0;
-		int			nwakeups = 0;
-		int			worker;
+		int			worker = -1;
+		int			queue_depth = 0;
+		bool		grow = false;
 
 		/*
 		 * Try to get a job to do.
@@ -453,38 +679,64 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 		LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE);
 		if ((io_index = pgaio_worker_submission_queue_consume()) == -1)
 		{
-			/*
-			 * Nothing to do.  Mark self idle.
-			 *
-			 * XXX: Invent some kind of back pressure to reduce useless
-			 * wakeups?
-			 */
-			io_worker_control->idle_worker_mask |= (UINT64_C(1) << MyIoWorkerId);
+			/* Nothing to do.  Mark self idle. */
+			pgaio_worker_set_insert(&io_worker_control->idle_worker_set,
+									MyIoWorkerId);
 		}
 		else
 		{
 			/* Got one.  Clear idle flag. */
-			io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << MyIoWorkerId);
+			pgaio_worker_set_remove(&io_worker_control->idle_worker_set,
+									MyIoWorkerId);
 
-			/* See if we can wake up some peers. */
-			nwakeups = Min(pgaio_worker_submission_queue_depth(),
-						   IO_WORKER_WAKEUP_FANOUT);
-			for (int i = 0; i < nwakeups; ++i)
+			/*
+			 * See if we should wake up a higher numbered peer.  Only do that
+			 * if this worker is not receiving spurious wakeups itself.
+			 *
+			 * This heuristic tries to discover the useful wakeup propagation
+			 * chain length when IOs are very fast and workers wake up to find
+			 * that all IOs have already been taken.
+			 *
+			 * If we chose not to wake a worker when we ideally should have,
+			 * the ratio will soon be corrected.
+			 */
+			if (wakeups <= ios)
 			{
-				if ((worker = pgaio_worker_choose_idle()) < 0)
-					break;
-				latches[nlatches++] = io_worker_control->workers[worker].latch;
+				queue_depth = pgaio_worker_submission_queue_depth();
+				if (queue_depth > 0)
+				{
+					worker = pgaio_worker_choose_idle(MyIoWorkerId + 1);
+
+					/*
+					 * If there were no idle higher numbered peers and there
+					 * are more than enough IOs queued for me and all lower
+					 * numbered peers, then try to start a new worker.
+					 */
+					if (worker == -1 && queue_depth > MyIoWorkerId)
+						grow = true;
+				}
 			}
 		}
 		LWLockRelease(AioWorkerSubmissionQueueLock);
 
-		for (int i = 0; i < nlatches; ++i)
-			SetLatch(latches[i]);
+		/* Propagate wakeups. */
+		if (worker != -1)
+			pgaio_worker_wake(worker);
+		else if (grow)
+			pgaio_worker_grow(true);
 
 		if (io_index != -1)
 		{
 			PgAioHandle *ioh = NULL;
 
+			/* Cancel timeout and update wakeup:work ratio. */
+			idle_timeout_abs = 0;
+			if (++ios == PGAIO_WORKER_STATS_MAX)
+			{
+				wakeups /= 2;
+				ios /= 2;
+			}
+
 			ioh = &pgaio_ctl->io_handles[io_index];
 			error_ioh = ioh;
 			errcallback.arg = ioh;
@@ -537,6 +789,14 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 			}
 #endif
 
+#ifdef PGAIO_WORKER_SHOW_PS_INFO
+			/* Truncate rather than overrun cmd if the description is long. */
+			snprintf(cmd, sizeof(cmd), "%d: [%s] %s",
+					 MyIoWorkerId, pgaio_io_get_op_name(ioh),
+					 pgaio_io_get_target_description(ioh));
+			set_ps_display(cmd);
+#endif
+
 			/*
 			 * We don't expect this to ever fail with ERROR or FATAL, no need
 			 * to keep error_ioh set to the IO.
@@ -550,8 +810,75 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 		}
 		else
 		{
-			WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1,
-					  WAIT_EVENT_IO_WORKER_MAIN);
+			int			timeout_ms;
+
+			/* Cancel new worker request if pending. */
+			pgaio_worker_grow(false);
+
+			/* Compute the remaining allowed idle time. */
+			if (io_worker_idle_timeout == -1)
+			{
+				/* Never time out. */
+				timeout_ms = -1;
+			}
+			else
+			{
+				TimestampTz now = GetCurrentTimestamp();
+
+				/* If the GUC changes, reset timer. */
+				if (idle_timeout_abs != 0 &&
+					io_worker_idle_timeout != timeout_guc_used)
+					idle_timeout_abs = 0;
+
+				/* On first sleep, compute absolute timeout. */
+				if (idle_timeout_abs == 0)
+				{
+					idle_timeout_abs =
+						TimestampTzPlusMilliseconds(now,
+													io_worker_idle_timeout);
+					timeout_guc_used = io_worker_idle_timeout;
+				}
+
+				/*
+				 * All workers maintain the absolute timeout value, but only
+				 * the highest worker can actually time out and only if
+				 * io_min_workers is satisfied.  All others wait only for
+				 * explicit wakeups caused by queue insertion, wakeup
+				 * propagation, change of pool size (possibly promoting one to
+				 * new highest) or GUC reload.
+				 */
+				if (pgaio_worker_can_timeout())
+					timeout_ms =
+						TimestampDifferenceMilliseconds(now,
+														idle_timeout_abs);
+				else
+					timeout_ms = -1;
+			}
+
+#ifdef PGAIO_WORKER_SHOW_PS_INFO
+			snprintf(cmd, sizeof(cmd), "%d: idle, wakeups:ios = %d:%d",
+					 MyIoWorkerId, wakeups, ios);
+			set_ps_display(cmd);
+#endif
+
+			if (WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
+						  timeout_ms,
+						  WAIT_EVENT_IO_WORKER_MAIN) == WL_TIMEOUT)
+			{
+				/* WL_TIMEOUT */
+				if (pgaio_worker_can_timeout())
+					if (GetCurrentTimestamp() >= idle_timeout_abs)
+						break;
+			}
+			else
+			{
+				/* WL_LATCH_SET */
+				if (++wakeups == PGAIO_WORKER_STATS_MAX)
+				{
+					wakeups /= 2;
+					ios /= 2;
+				}
+			}
 			ResetLatch(MyLatch);
 		}
 
@@ -561,6 +888,10 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 		{
 			ConfigReloadPending = false;
 			ProcessConfigFile(PGC_SIGHUP);
+
+			/* Exit if io_max_workers has been decreased to or below our ID. */
+			if (MyIoWorkerId >= io_max_workers)
+				break;
 		}
 	}
 
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 0a6d16f8154..4f9e88f1402 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -368,6 +368,7 @@ AioWorkerSubmissionQueue	"Waiting to access AIO worker submission queue."
 WaitLSN	"Waiting to read or update shared Wait-for-LSN state."
 LogicalDecodingControl	"Waiting to read or update logical decoding status information."
 DataChecksumsWorker	"Waiting for data checksums worker."
+AioWorkerControl	"Waiting to update AIO worker information."
 
 #
 # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE)
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 7a8a5d0764c..4b27856ea44 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -1382,6 +1382,14 @@
   check_hook => 'check_io_max_concurrency',
 },
 
+{ name => 'io_max_workers', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_IO',
+  short_desc => 'Maximum number of I/O worker processes, for io_method=worker.',
+  variable => 'io_max_workers',
+  boot_val => '8',
+  min => '1',
+  max => 'MAX_IO_WORKERS',
+},
+
 { name => 'io_method', type => 'enum', context => 'PGC_POSTMASTER', group => 'RESOURCES_IO',
   short_desc => 'Selects the method for executing asynchronous I/O.',
   variable => 'io_method',
@@ -1390,14 +1398,32 @@
   assign_hook => 'assign_io_method',
 },
 
-{ name => 'io_workers', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_IO',
-  short_desc => 'Number of IO worker processes, for io_method=worker.',
-  variable => 'io_workers',
-  boot_val => '3',
+{ name => 'io_min_workers', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_IO',
+  short_desc => 'Minimum number of I/O worker processes, for io_method=worker.',
+  variable => 'io_min_workers',
+  boot_val => '1',
   min => '1',
   max => 'MAX_IO_WORKERS',
 },
 
+{ name => 'io_worker_idle_timeout', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_IO',
+  short_desc => 'Maximum time before idle I/O worker processes time out, for io_method=worker.',
+  variable => 'io_worker_idle_timeout',
+  flags => 'GUC_UNIT_MS',
+  boot_val => '60000',
+  min => '-1',
+  max => 'INT_MAX',
+},
+
+{ name => 'io_worker_launch_interval', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_IO',
+  short_desc => 'Minimum time before launching a new I/O worker process, for io_method=worker.',
+  variable => 'io_worker_launch_interval',
+  flags => 'GUC_UNIT_MS',
+  boot_val => '100',
+  min => '0',
+  max => 'INT_MAX',
+},
+
 # Not for general use --- used by SET SESSION AUTHORIZATION and SET
 # ROLE
 { name => 'is_superuser', type => 'bool', context => 'PGC_INTERNAL', group => 'UNGROUPED',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 10a281dfd4b..4d6321029b3 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -218,7 +218,11 @@
                                         # can execute simultaneously
                                         # -1 sets based on shared_buffers
                                         # (change requires restart)
-#io_workers = 3                         # 1-32;
+
+#io_min_workers = 1                     # 1-32 (change requires pg_reload_conf())
+#io_max_workers = 8                     # 1-32
+#io_worker_idle_timeout = 60s
+#io_worker_launch_interval = 100ms
 
 # - Worker Processes -
 
diff --git a/src/include/storage/io_worker.h b/src/include/storage/io_worker.h
index f7d5998a138..78f49d6ccf0 100644
--- a/src/include/storage/io_worker.h
+++ b/src/include/storage/io_worker.h
@@ -17,6 +17,14 @@
 
 pg_noreturn extern void IoWorkerMain(const void *startup_data, size_t startup_data_len);
 
-extern PGDLLIMPORT int io_workers;
+/* Public GUCs. */
+extern PGDLLIMPORT int io_min_workers;
+extern PGDLLIMPORT int io_max_workers;
+extern PGDLLIMPORT int io_worker_idle_timeout;
+extern PGDLLIMPORT int io_worker_launch_interval;
+
+/* Interfaces visible to the postmaster. */
+extern bool pgaio_worker_test_grow(void);
+extern void pgaio_worker_clear_grow(void);
 
 #endif							/* IO_WORKER_H */
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index af8553bcb6c..d7eb648bd27 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -88,6 +88,7 @@ PG_LWLOCK(53, AioWorkerSubmissionQueue)
 PG_LWLOCK(54, WaitLSN)
 PG_LWLOCK(55, LogicalDecodingControl)
 PG_LWLOCK(56, DataChecksumsWorker)
+PG_LWLOCK(57, AioWorkerControl)
 
 /*
  * There also exist several built-in LWLock tranches.  As with the predefined
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index 001e6eea61c..bcce4011790 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -38,6 +38,7 @@ typedef enum
 	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */
 	PMSIGNAL_START_AUTOVAC_LAUNCHER,	/* start an autovacuum launcher */
 	PMSIGNAL_START_AUTOVAC_WORKER,	/* start an autovacuum worker */
+	PMSIGNAL_IO_WORKER_GROW,	/* I/O worker pool wants to grow */
 	PMSIGNAL_BACKGROUND_WORKER_CHANGE,	/* background worker state change */
 	PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */
 	PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */
diff --git a/src/test/modules/test_aio/t/002_io_workers.pl b/src/test/modules/test_aio/t/002_io_workers.pl
index 34bc132ea08..b9775811d4d 100644
--- a/src/test/modules/test_aio/t/002_io_workers.pl
+++ b/src/test/modules/test_aio/t/002_io_workers.pl
@@ -14,6 +14,9 @@ $node->init();
 $node->append_conf(
 	'postgresql.conf', qq(
 io_method=worker
+io_worker_idle_timeout=0ms
+io_worker_launch_interval=0ms
+io_max_workers=32
 ));
 
 $node->start();
@@ -31,7 +34,7 @@ sub test_number_of_io_workers_dynamic
 {
 	my $node = shift;
 
-	my $prev_worker_count = $node->safe_psql('postgres', 'SHOW io_workers');
+	my $prev_worker_count = $node->safe_psql('postgres', 'SHOW io_min_workers');
 
 	# Verify that worker count can't be set to 0
 	change_number_of_io_workers($node, 0, $prev_worker_count, 1);
@@ -62,24 +65,24 @@ sub change_number_of_io_workers
 	my ($result, $stdout, $stderr);
 
 	($result, $stdout, $stderr) =
-	  $node->psql('postgres', "ALTER SYSTEM SET io_workers = $worker_count");
+	  $node->psql('postgres', "ALTER SYSTEM SET io_min_workers = $worker_count");
 	$node->safe_psql('postgres', 'SELECT pg_reload_conf()');
 
 	if ($expect_failure)
 	{
 		like(
 			$stderr,
-			qr/$worker_count is outside the valid range for parameter "io_workers"/,
-			"updating number of io_workers to $worker_count failed, as expected"
+			qr/$worker_count is outside the valid range for parameter "io_min_workers"/,
+			"updating io_min_workers to $worker_count failed, as expected"
 		);
 
 		return $prev_worker_count;
 	}
 	else
 	{
-		is( $node->safe_psql('postgres', 'SHOW io_workers'),
+		is( $node->safe_psql('postgres', 'SHOW io_min_workers'),
 			$worker_count,
-			"updating number of io_workers from $prev_worker_count to $worker_count"
+			"updating number of io_min_workers from $prev_worker_count to $worker_count"
 		);
 
 		check_io_worker_count($node, $worker_count);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e9430e07b36..a0955420d35 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2265,6 +2265,7 @@ PgAioUringCaps
 PgAioUringContext
 PgAioWaitRef
 PgAioWorkerControl
+PgAioWorkerSet
 PgAioWorkerSlot
 PgAioWorkerSubmissionQueue
 PgArchData
-- 
2.47.3

Reply via email to