From 607b380d16ac7dba01642e4ea9ecdcc9ae048977 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 13 Mar 2019 11:43:23 +1300
Subject: [PATCH] Use condition variables to wait for checkpoints.

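Previously we used a polling/sleeping loop to wait for checkpoints
to begin and end.  Each of the two waits slept in 100ms steps, so a
caller could spend up to a couple hundred milliseconds in needless
thumb twiddling.  Use condition variables instead, and report the
waits as the new CheckpointStart and CheckpointDone wait events.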

Author: Thomas Munro
Reported-by: Andres Freund
Discussion:
---
 doc/src/sgml/monitoring.sgml          | 10 +++++++++-
 src/backend/postmaster/checkpointer.c | 21 +++++++++++++++++----
 src/backend/postmaster/pgstat.c       |  6 ++++++
 src/include/pgstat.h                  |  2 ++
 4 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index e2630fd3682..60b89356f70 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -1281,7 +1281,7 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          <entry>Waiting in an extension.</entry>
         </row>
         <row>
-         <entry morerows="34"><literal>IPC</literal></entry>
+         <entry morerows="36"><literal>IPC</literal></entry>
          <entry><literal>BgWorkerShutdown</literal></entry>
          <entry>Waiting for background worker to shut down.</entry>
         </row>
@@ -1293,6 +1293,14 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          <entry><literal>BtreePage</literal></entry>
          <entry>Waiting for the page number needed to continue a parallel B-tree scan to become available.</entry>
         </row>
+        <row>
+         <entry><literal>CheckpointDone</literal></entry>
+         <entry>Waiting for a checkpoint to complete.</entry>
+        </row>
+        <row>
+         <entry><literal>CheckpointStart</literal></entry>
+         <entry>Waiting for a checkpoint to start.</entry>
+        </row>
         <row>
          <entry><literal>ClogGroupUpdate</literal></entry>
          <entry>Waiting for group leader to update transaction status at transaction end.</entry>
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fe96c41359b..bb2866d98ec 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -124,6 +124,9 @@ typedef struct
 	int			ckpt_done;		/* advances when checkpoint done */
 	int			ckpt_failed;	/* advances when checkpoint fails */
 
+	ConditionVariable ckpt_started_cv;	/* fired when checkpoint starts */
+	ConditionVariable ckpt_done_cv;		/* fired when checkpoint done */
+
 	int			ckpt_flags;		/* checkpoint flags, as defined in xlog.h */
 
 	uint32		num_backend_writes; /* counts user backend buffer writes */
@@ -428,6 +431,8 @@ CheckpointerMain(void)
 			CheckpointerShmem->ckpt_started++;
 			SpinLockRelease(&CheckpointerShmem->ckpt_lck);
 
+			ConditionVariableBroadcast(&CheckpointerShmem->ckpt_started_cv);
+
 			/*
 			 * The end-of-recovery checkpoint is a real checkpoint that's
 			 * performed while we're still in recovery.
@@ -488,6 +493,8 @@ CheckpointerMain(void)
 			CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
 			SpinLockRelease(&CheckpointerShmem->ckpt_lck);
 
+			ConditionVariableBroadcast(&CheckpointerShmem->ckpt_done_cv);
+
 			if (ckpt_performed)
 			{
 				/*
@@ -915,6 +922,8 @@ CheckpointerShmemInit(void)
 		MemSet(CheckpointerShmem, 0, size);
 		SpinLockInit(&CheckpointerShmem->ckpt_lck);
 		CheckpointerShmem->max_requests = NBuffers;
+		ConditionVariableInit(&CheckpointerShmem->ckpt_started_cv);
+		ConditionVariableInit(&CheckpointerShmem->ckpt_done_cv);
 	}
 }
 
@@ -1023,6 +1032,7 @@ RequestCheckpoint(int flags)
 					new_failed;
 
 		/* Wait for a new checkpoint to start. */
+		ConditionVariablePrepareToSleep(&CheckpointerShmem->ckpt_started_cv);
 		for (;;)
 		{
 			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
@@ -1032,13 +1042,15 @@ RequestCheckpoint(int flags)
 			if (new_started != old_started)
 				break;
 
-			CHECK_FOR_INTERRUPTS();
-			pg_usleep(100000L);
+			ConditionVariableSleep(&CheckpointerShmem->ckpt_started_cv,
+								   WAIT_EVENT_CHECKPOINT_START);
 		}
+		ConditionVariableCancelSleep();
 
 		/*
 		 * We are waiting for ckpt_done >= new_started, in a modulo sense.
 		 */
+		ConditionVariablePrepareToSleep(&CheckpointerShmem->ckpt_done_cv);
 		for (;;)
 		{
 			int			new_done;
@@ -1051,9 +1063,10 @@ RequestCheckpoint(int flags)
 			if (new_done - new_started >= 0)
 				break;
 
-			CHECK_FOR_INTERRUPTS();
-			pg_usleep(100000L);
+			ConditionVariableSleep(&CheckpointerShmem->ckpt_done_cv,
+								   WAIT_EVENT_CHECKPOINT_DONE);
 		}
+		ConditionVariableCancelSleep();
 
 		if (new_failed != old_failed)
 			ereport(ERROR,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index ba31f532ea4..2fbfadd9f0c 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3623,6 +3623,12 @@ pgstat_get_wait_ipc(WaitEventIPC w)
 		case WAIT_EVENT_BTREE_PAGE:
 			event_name = "BtreePage";
 			break;
+		case WAIT_EVENT_CHECKPOINT_DONE:
+			event_name = "CheckpointDone";
+			break;
+		case WAIT_EVENT_CHECKPOINT_START:
+			event_name = "CheckpointStart";
+			break;
 		case WAIT_EVENT_CLOG_GROUP_UPDATE:
 			event_name = "ClogGroupUpdate";
 			break;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 725c8b0d64a..ea6cc8b560f 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -817,6 +817,8 @@ typedef enum
 	WAIT_EVENT_BGWORKER_STARTUP,
 	WAIT_EVENT_BTREE_PAGE,
+	WAIT_EVENT_CHECKPOINT_DONE,
+	WAIT_EVENT_CHECKPOINT_START,
 	WAIT_EVENT_CLOG_GROUP_UPDATE,
 	WAIT_EVENT_EXECUTE_GATHER,
 	WAIT_EVENT_HASH_BATCH_ALLOCATING,
 	WAIT_EVENT_HASH_BATCH_ELECTING,
-- 
2.20.1

