diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 354165b..fb67cc9 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -815,6 +815,12 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
       <entry>Number of buffers allocated</entry>
      </row>
      <row>
+      <entry><structfield>buffers_backend_clocksweep</></entry>
+      <entry><type>bigint</type></entry>
+      <entry>Number of buffer allocations not satisfied from the
+      freelist</entry>
+     </row>
+     <row>
       <entry><structfield>stats_reset</></entry>
       <entry><type>timestamp with time zone</type></entry>
       <entry>Time at which these statistics were last reset</entry>
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 4a542e6..38698b0 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -27,6 +27,7 @@
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "pg_getopt.h"
+#include "postmaster/bgreclaimer.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
 #include "postmaster/walwriter.h"
@@ -179,7 +180,8 @@ static IndexList *ILHead = NULL;
  *	 AuxiliaryProcessMain
  *
  *	 The main entry point for auxiliary processes, such as the bgwriter,
- *	 walwriter, walreceiver, bootstrapper and the shared memory checker code.
+ *	 walwriter, walreceiver, bgreclaimer, bootstrapper and the shared
+ *	 memory checker code.
  *
  *	 This code is here just because of historical reasons.
  */
@@ -323,6 +325,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			case WalReceiverProcess:
 				statmsg = "wal receiver process";
 				break;
+			case BgReclaimerProcess:
+				statmsg = "reclaimer process";
+				break;
 			default:
 				statmsg = "??? process";
 				break;
@@ -437,6 +442,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			WalReceiverMain();
 			proc_exit(1);		/* should never return */
 
+		case BgReclaimerProcess:
+			/* don't set signals, bgreclaimer has its own agenda */
+			BackgroundReclaimerMain();
+			proc_exit(1);		/* should never return */
+
 		default:
 			elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType);
 			proc_exit(1);
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 22663c3..97c23c9 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -712,6 +712,7 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_buf_written_backend() AS buffers_backend,
         pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
         pg_stat_get_buf_alloc() AS buffers_alloc,
+        pg_stat_get_buf_clocksweep_backend() AS buffers_backend_clocksweep,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
 CREATE VIEW pg_user_mappings AS
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 71c2321..168d0d8 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -12,7 +12,8 @@ subdir = src/backend/postmaster
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = autovacuum.o bgworker.o bgwriter.o checkpointer.o fork_process.o \
-	pgarch.o pgstat.o postmaster.o startup.o syslogger.o walwriter.o
+OBJS = autovacuum.o bgreclaimer.o bgworker.o bgwriter.o checkpointer.o \
+	fork_process.o pgarch.o pgstat.o postmaster.o startup.o syslogger.o \
+	walwriter.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/postmaster/bgreclaimer.c b/src/backend/postmaster/bgreclaimer.c
new file mode 100644
index 0000000..1c64900
--- /dev/null
+++ b/src/backend/postmaster/bgreclaimer.c
@@ -0,0 +1,306 @@
+/*-------------------------------------------------------------------------
+ *
+ * bgreclaimer.c
+ *
+ * The background reclaimer (bgreclaimer) is new as of Postgres 9.5.  It
+ * attempts to keep regular backends from having to run the clock sweep (which
+ * they need to do only when they can't find the next candidate buffer on
+ * the freelist).  In the best case, all requests for shared buffers are
+ * satisfied from the freelist, because the background reclaimer continually
+ * works to keep buffers on it.  However, regular backends are still
+ * empowered to run the clock sweep to find a usable buffer if the bgreclaimer
+ * fails to maintain enough buffers on the freelist.
+ *
+ * The bgreclaimer is started by the postmaster as soon as the startup subprocess
+ * finishes, or as soon as recovery begins if we are doing archive recovery.
+ * It remains alive until the postmaster commands it to terminate.
+ * Normal termination is by SIGTERM, which instructs the bgreclaimer to exit(0).
+ * Emergency termination is by SIGQUIT; like any backend, the bgreclaimer will
+ * simply abort and exit on SIGQUIT.
+ *
+ * If the bgreclaimer exits unexpectedly, the postmaster treats that the same
+ * as a backend crash: shared memory may be corrupted, so remaining backends
+ * should be killed by SIGQUIT and then a recovery cycle started.
+ *
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/postmaster/bgreclaimer.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "postmaster/bgreclaimer.h"
+#include "storage/bufmgr.h"
+#include "storage/buf_internals.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t shutdown_requested = false;
+
+/* Signal handlers */
+
+static void ReclaimQuickDieHandler(SIGNAL_ARGS);
+static void ReclaimSigHupHandler(SIGNAL_ARGS);
+static void ReclaimShutdownHandler(SIGNAL_ARGS);
+static void ReclaimSigUsr1Handler(SIGNAL_ARGS);
+
+
+/*
+ * Main entry point for bgreclaim process
+ *
+ * This is invoked from AuxiliaryProcessMain, which has already created the
+ * basic execution environment, but not enabled signals yet.
+ */
+void
+BackgroundReclaimerMain(void)
+{
+	sigjmp_buf	local_sigjmp_buf;
+	MemoryContext bgreclaim_context;
+
+	/*
+	 * If possible, make this process a group leader, so that the postmaster
+	 * can signal any child processes too.  (bgreclaim probably never has any
+	 * child processes, but for consistency we make all postmaster child
+	 * processes do this.)
+	 */
+#ifdef HAVE_SETSID
+	if (setsid() < 0)
+		elog(FATAL, "setsid() failed: %m");
+#endif
+
+	/*
+	 * Properly accept or ignore signals the postmaster might send us.
+	 *
+	 * bgreclaim doesn't participate in ProcSignal signalling, but a SIGUSR1
+	 * handler is still needed for latch wakeups.
+	 */
+	pqsignal(SIGHUP, ReclaimSigHupHandler);	/* set flag to read config file */
+	pqsignal(SIGINT, SIG_IGN);
+	pqsignal(SIGTERM, ReclaimShutdownHandler);		/* shutdown */
+	pqsignal(SIGQUIT, ReclaimQuickDieHandler);		/* hard crash time */
+	pqsignal(SIGALRM, SIG_IGN);
+	pqsignal(SIGPIPE, SIG_IGN);
+	pqsignal(SIGUSR1, ReclaimSigUsr1Handler);
+	pqsignal(SIGUSR2, SIG_IGN);
+
+	/*
+	 * Reset some signals that are accepted by postmaster but not here
+	 */
+	pqsignal(SIGCHLD, SIG_DFL);
+	pqsignal(SIGTTIN, SIG_DFL);
+	pqsignal(SIGTTOU, SIG_DFL);
+	pqsignal(SIGCONT, SIG_DFL);
+	pqsignal(SIGWINCH, SIG_DFL);
+
+	/* We allow SIGQUIT (quickdie) at all times */
+	sigdelset(&BlockSig, SIGQUIT);
+
+
+	/*
+	 * Create a memory context that we will do all our work in.  We do this so
+	 * that we can reset the context during error recovery and thereby avoid
+	 * possible memory leaks.  As of now, the memory allocation can be done
+	 * possible memory leaks.  At present, memory is allocated in this
+	 * context only while processing a SIGHUP (configuration reload).
+	bgreclaim_context = AllocSetContextCreate(TopMemoryContext,
+											 "Background Reclaim",
+											 ALLOCSET_DEFAULT_MINSIZE,
+											 ALLOCSET_DEFAULT_INITSIZE,
+											 ALLOCSET_DEFAULT_MAXSIZE);
+	MemoryContextSwitchTo(bgreclaim_context);
+
+	/*
+	 * If an exception is encountered, processing resumes here.
+	 *
+	 * See notes in postgres.c about the design of this coding.
+	 */
+	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+	{
+		/* Since not using PG_TRY, must reset error stack by hand */
+		error_context_stack = NULL;
+
+		/* Prevent interrupts while cleaning up */
+		HOLD_INTERRUPTS();
+
+		/* Report the error to the server log */
+		EmitErrorReport();
+
+		/*
+		 * These operations are really just a minimal subset of
+		 * AbortTransaction().  We don't have very many resources to worry
+		 * about in bgreclaim, but we do have buffers and file descriptors.
+		 * Currently bgreclaimer itself doesn't take LWLocks, but the config
+		 * reload path (or future bgreclaimer code) might, and there is
+		 * nothing to be saved by not releasing them here.
+		 */
+		LWLockReleaseAll();
+		UnlockBuffers();
+		AtEOXact_Buffers(false);
+		AtEOXact_Files();
+
+		/*
+		 * Now return to normal top-level context and clear ErrorContext for
+		 * next time.
+		 */
+		MemoryContextSwitchTo(bgreclaim_context);
+		FlushErrorState();
+
+		/* Flush any leaked data in the top-level context */
+		MemoryContextResetAndDeleteChildren(bgreclaim_context);
+
+		/* Now we can allow interrupts again */
+		RESUME_INTERRUPTS();
+
+		/*
+		 * Sleep at least 1 second after any error.  We don't want to be
+		 * filling the error logs as fast as we can.
+		 */
+		pg_usleep(1000000L);
+	}
+
+	/* We can now handle ereport(ERROR) */
+	PG_exception_stack = &local_sigjmp_buf;
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	StrategyInitBgReclaimerLatch(&MyProc->procLatch);
+
+	/*
+	 * Loop forever
+	 */
+	for (;;)
+	{
+		int			rc;
+
+		/* Clear any already-pending wakeups */
+		ResetLatch(&MyProc->procLatch);
+
+		if (got_SIGHUP)
+		{
+			got_SIGHUP = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+		if (shutdown_requested)
+		{
+			/*
+			 * From here on, elog(ERROR) should end with exit(1), not send
+			 * control back to the sigsetjmp block above
+			 */
+			ExitOnAnyError = true;
+			/* Normal exit from the bgreclaimer is here */
+			proc_exit(0);		/* done */
+		}
+
+		/*
+		 * Backends will signal the bgreclaimer when the number of buffers
+		 * on the freelist falls below its low water mark.
+		 */
+		rc = WaitLatch(&MyProc->procLatch,
+					   WL_LATCH_SET | WL_POSTMASTER_DEATH,
+					   -1);
+
+		if (rc & WL_LATCH_SET)
+			BgMoveBuffersToFreelist();
+
+		/*
+		 * Emergency bailout if postmaster has died.  This is to avoid the
+		 * necessity for manual cleanup of all postmaster children.
+		 */
+		if (rc & WL_POSTMASTER_DEATH)
+			exit(1);
+	}
+}
+
+
+/* --------------------------------
+ *		signal handler routines
+ * --------------------------------
+ */
+
+/*
+ * ReclaimQuickDieHandler() occurs when signalled SIGQUIT by the postmaster.
+ *
+ * Some backend has bought the farm,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+ReclaimQuickDieHandler(SIGNAL_ARGS)
+{
+	PG_SETMASK(&BlockSig);
+
+	/*
+	 * We DO NOT want to run proc_exit() callbacks -- we're here because
+	 * shared memory may be corrupted, so we don't want to try to clean up our
+	 * transaction.  Just nail the windows shut and get out of town.  Now that
+	 * there's an atexit callback to prevent third-party code from breaking
+	 * things by calling exit() directly, we have to reset the callbacks
+	 * explicitly to make this work as intended.
+	 */
+	on_exit_reset();
+
+	/*
+	 * Note we do exit(2) not exit(0).  This is to force the postmaster into a
+	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+	 * backend.  This is necessary precisely because we don't clean up our
+	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
+	 * should ensure the postmaster sees this as a crash, too, but no harm in
+	 * being doubly sure.)
+	 */
+	exit(2);
+}
+
+/* SIGHUP: set flag to re-read config file at next convenient time */
+static void
+ReclaimSigHupHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	got_SIGHUP = true;
+	if (MyProc)
+		SetLatch(&MyProc->procLatch);
+
+	errno = save_errno;
+}
+
+/* SIGTERM: set flag to shutdown and exit */
+static void
+ReclaimShutdownHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	shutdown_requested = true;
+	if (MyProc)
+		SetLatch(&MyProc->procLatch);
+
+	errno = save_errno;
+}
+
+/* SIGUSR1: used for latch wakeups */
+static void
+ReclaimSigUsr1Handler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	latch_sigusr1_handler();
+
+	errno = save_errno;
+}
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index c7f41a5..7475e5a 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -5021,6 +5021,7 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 	globalStats.buf_written_backend += msg->m_buf_written_backend;
 	globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
 	globalStats.buf_alloc += msg->m_buf_alloc;
+	globalStats.buf_backend_clocksweep += msg->m_buf_backend_clocksweep;
 }
 
 /* ----------
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 14535c8..565cf4b 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -143,13 +143,13 @@
  * authorization phase).  This is used mainly to keep track of how many
  * children we have and send them appropriate signals when necessary.
  *
- * "Special" children such as the startup, bgwriter and autovacuum launcher
- * tasks are not in this list.  Autovacuum worker and walsender are in it.
- * Also, "dead_end" children are in it: these are children launched just for
- * the purpose of sending a friendly rejection message to a would-be client.
- * We must track them because they are attached to shared memory, but we know
- * they will never become live backends.  dead_end children are not assigned a
- * PMChildSlot.
+ * "Special" children such as the startup, bgwriter, bgreclaimer and
+ * autovacuum launcher tasks are not in this list.  Autovacuum worker and
+ * walsender are in it.  Also, "dead_end" children are in it: these are
+ * children launched just for the purpose of sending a friendly rejection
+ * message to a would-be client.  We must track them because they are attached
+ * to shared memory, but we know they will never become live backends.
+ * dead_end children are not assigned a PMChildSlot.
  *
  * Background workers that request shared memory access during registration are
  * in this list, too.
@@ -243,7 +243,8 @@ static pid_t StartupPID = 0,
 			AutoVacPID = 0,
 			PgArchPID = 0,
 			PgStatPID = 0,
-			SysLoggerPID = 0;
+			SysLoggerPID = 0,
+			BgReclaimerPID = 0;
 
 /* Startup/shutdown state */
 #define			NoShutdown		0
@@ -269,13 +270,13 @@ static bool RecoveryError = false;		/* T if WAL recovery failed */
  * hot standby during archive recovery.
  *
  * When the startup process is ready to start archive recovery, it signals the
- * postmaster, and we switch to PM_RECOVERY state. The background writer and
- * checkpointer are launched, while the startup process continues applying WAL.
- * If Hot Standby is enabled, then, after reaching a consistent point in WAL
- * redo, startup process signals us again, and we switch to PM_HOT_STANDBY
- * state and begin accepting connections to perform read-only queries.  When
- * archive recovery is finished, the startup process exits with exit code 0
- * and we switch to PM_RUN state.
+ * postmaster, and we switch to PM_RECOVERY state. The background writer,
+ * background reclaimer and checkpointer are launched, while the startup
+ * process continues applying WAL.  If Hot Standby is enabled, then, after
+ * reaching a consistent point in WAL redo, startup process signals us again,
+ * and we switch to PM_HOT_STANDBY state and begin accepting connections to
+ * perform read-only queries.  When archive recovery is finished, the startup
+ * process exits with exit code 0 and we switch to PM_RUN state.
  *
  * Normal child backends can only be launched when we are in PM_RUN or
  * PM_HOT_STANDBY state.  (We also allow launch of normal
@@ -505,6 +506,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartCheckpointer()		StartChildProcess(CheckpointerProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()		StartChildProcess(WalReceiverProcess)
+#define StartBackgroundReclaimer() StartChildProcess(BgReclaimerProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -568,8 +570,8 @@ PostmasterMain(int argc, char *argv[])
 	 * handling setup of child processes.  See tcop/postgres.c,
 	 * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c,
 	 * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c,
-	 * postmaster/syslogger.c, postmaster/bgworker.c and
-	 * postmaster/checkpointer.c.
+	 * postmaster/syslogger.c, postmaster/bgworker.c, postmaster/bgreclaimer.c
+	 * and postmaster/checkpointer.c.
 	 */
 	pqinitmask();
 	PG_SETMASK(&BlockSig);
@@ -1583,7 +1585,8 @@ ServerLoop(void)
 		/*
 		 * If no background writer process is running, and we are not in a
 		 * state that prevents it, start one.  It doesn't matter if this
-		 * fails, we'll just try again later.  Likewise for the checkpointer.
+		 * fails, we'll just try again later.  Likewise for the checkpointer
+		 * and bgreclaimer.
 		 */
 		if (pmState == PM_RUN || pmState == PM_RECOVERY ||
 			pmState == PM_HOT_STANDBY)
@@ -1592,6 +1595,8 @@ ServerLoop(void)
 				CheckpointerPID = StartCheckpointer();
 			if (BgWriterPID == 0)
 				BgWriterPID = StartBackgroundWriter();
+			if (BgReclaimerPID == 0)
+				BgReclaimerPID = StartBackgroundReclaimer();
 		}
 
 		/*
@@ -2330,6 +2335,8 @@ SIGHUP_handler(SIGNAL_ARGS)
 			signal_child(SysLoggerPID, SIGHUP);
 		if (PgStatPID != 0)
 			signal_child(PgStatPID, SIGHUP);
+		if (BgReclaimerPID != 0)
+			signal_child(BgReclaimerPID, SIGHUP);
 
 		/* Reload authentication config files too */
 		if (!load_hba())
@@ -2398,6 +2405,9 @@ pmdie(SIGNAL_ARGS)
 				/* and the walwriter too */
 				if (WalWriterPID != 0)
 					signal_child(WalWriterPID, SIGTERM);
+				/* and the bgreclaimer too */
+				if (BgReclaimerPID != 0)
+					signal_child(BgReclaimerPID, SIGTERM);
 
 				/*
 				 * If we're in recovery, we can't kill the startup process
@@ -2440,14 +2450,16 @@ pmdie(SIGNAL_ARGS)
 				signal_child(BgWriterPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (BgReclaimerPID != 0)
+				signal_child(BgReclaimerPID, SIGTERM);
 			SignalUnconnectedWorkers(SIGTERM);
 			if (pmState == PM_RECOVERY)
 			{
 				/*
-				 * Only startup, bgwriter, walreceiver, unconnected bgworkers,
-				 * and/or checkpointer should be active in this state; we just
-				 * signaled the first four, and we don't want to kill
-				 * checkpointer yet.
+				 * Only startup, bgwriter, walreceiver, bgreclaimer,
+				 * unconnected bgworkers, and/or checkpointer should be
+				 * active in this state; we just signaled the first five,
+				 * and we don't want to kill checkpointer yet.
 				 */
 				pmState = PM_WAIT_BACKENDS;
 			}
@@ -2600,6 +2612,8 @@ reaper(SIGNAL_ARGS)
 				BgWriterPID = StartBackgroundWriter();
 			if (WalWriterPID == 0)
 				WalWriterPID = StartWalWriter();
+			if (BgReclaimerPID == 0)
+				BgReclaimerPID = StartBackgroundReclaimer();
 
 			/*
 			 * Likewise, start other special children as needed.  In a restart
@@ -2625,7 +2639,8 @@ reaper(SIGNAL_ARGS)
 		/*
 		 * Was it the bgwriter?  Normal exit can be ignored; we'll start a new
 		 * one at the next iteration of the postmaster's main loop, if
-		 * necessary.  Any other exit condition is treated as a crash.
+		 * necessary.  Any other exit condition is treated as a crash.  Likewise
+		 * for bgreclaimer.
 		 */
 		if (pid == BgWriterPID)
 		{
@@ -2636,6 +2651,17 @@ reaper(SIGNAL_ARGS)
 			continue;
 		}
 
+		if (pid == BgReclaimerPID)
+		{
+			BgReclaimerPID = 0;
+			if (!EXIT_STATUS_0(exitstatus))
+				HandleChildCrash(pid, exitstatus,
+								 _("background reclaimer process"));
+			continue;
+		}
+
 		/*
 		 * Was it the checkpointer?
 		 */
@@ -2997,7 +3023,7 @@ CleanupBackend(int pid,
 
 /*
  * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
- * walwriter, autovacuum, or background worker.
+ * walwriter, autovacuum, bgreclaimer or background worker.
  *
  * The objectives here are to clean up our local state about the child
  * process, and to signal all other remaining children to quickdie.
@@ -3201,6 +3227,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
 	}
 
+	/* Take care of the bgreclaimer too */
+	if (pid == BgReclaimerPID)
+		BgReclaimerPID = 0;
+	else if (BgReclaimerPID != 0 && take_action)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) BgReclaimerPID)));
+		signal_child(BgReclaimerPID, (SendStop ? SIGSTOP : SIGQUIT));
+	}
+
 	/*
 	 * Force a power-cycle of the pgarch process too.  (This isn't absolutely
 	 * necessary, but it seems like a good idea for robustness, and it
@@ -3371,14 +3409,14 @@ PostmasterStateMachine(void)
 		/*
 		 * PM_WAIT_BACKENDS state ends when we have no regular backends
 		 * (including autovac workers), no bgworkers (including unconnected
-		 * ones), and no walwriter, autovac launcher or bgwriter.  If we are
-		 * doing crash recovery or an immediate shutdown then we expect the
-		 * checkpointer to exit as well, otherwise not. The archiver, stats,
-		 * and syslogger processes are disregarded since they are not
-		 * connected to shared memory; we also disregard dead_end children
-		 * here. Walsenders are also disregarded, they will be terminated
-		 * later after writing the checkpoint record, like the archiver
-		 * process.
+		 * ones), and no walwriter, autovac launcher, bgwriter or bgreclaimer.
+		 * If we are doing crash recovery or an immediate shutdown then we
+		 * expect the checkpointer to exit as well, otherwise not. The
+		 * archiver, stats, and syslogger processes are disregarded since they
+		 * are not connected to shared memory; we also disregard dead_end
+		 * children here. Walsenders are also disregarded, they will be
+		 * terminated later after writing the checkpoint record, like the
+		 * archiver process.
 		 */
 		if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_WORKER) == 0 &&
 			CountUnconnectedWorkers() == 0 &&
@@ -3388,7 +3426,8 @@ PostmasterStateMachine(void)
 			(CheckpointerPID == 0 ||
 			 (!FatalError && Shutdown < ImmediateShutdown)) &&
 			WalWriterPID == 0 &&
-			AutoVacPID == 0)
+			AutoVacPID == 0 &&
+			BgReclaimerPID == 0)
 		{
 			if (Shutdown >= ImmediateShutdown || FatalError)
 			{
@@ -3486,6 +3525,7 @@ PostmasterStateMachine(void)
 			Assert(CheckpointerPID == 0);
 			Assert(WalWriterPID == 0);
 			Assert(AutoVacPID == 0);
+			Assert(BgReclaimerPID == 0);
 			/* syslogger is not considered here */
 			pmState = PM_NO_CHILDREN;
 		}
@@ -3698,6 +3738,8 @@ TerminateChildren(int signal)
 		signal_child(WalReceiverPID, signal);
 	if (AutoVacPID != 0)
 		signal_child(AutoVacPID, signal);
+	if (BgReclaimerPID != 0)
+		signal_child(BgReclaimerPID, signal);
 	if (PgArchPID != 0)
 		signal_child(PgArchPID, signal);
 	if (PgStatPID != 0)
@@ -4778,6 +4820,8 @@ sigusr1_handler(SIGNAL_ARGS)
 		CheckpointerPID = StartCheckpointer();
 		Assert(BgWriterPID == 0);
 		BgWriterPID = StartBackgroundWriter();
+		Assert(BgReclaimerPID == 0);
+		BgReclaimerPID = StartBackgroundReclaimer();
 
 		pmState = PM_RECOVERY;
 	}
@@ -5122,6 +5166,10 @@ StartChildProcess(AuxProcType type)
 				ereport(LOG,
 						(errmsg("could not fork WAL receiver process: %m")));
 				break;
+			case BgReclaimerProcess:
+				ereport(LOG,
+				   (errmsg("could not fork background reclaimer process: %m")));
+				break;
 			default:
 				ereport(LOG,
 						(errmsg("could not fork process: %m")));
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index 1fd38d0..e671d75 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -125,14 +125,10 @@ bits of the tag's hash value.  The rules stated above apply to each partition
 independently.  If it is necessary to lock more than one partition at a time,
 they must be locked in partition-number order to avoid risk of deadlock.
 
-* A separate system-wide LWLock, the BufFreelistLock, provides mutual
-exclusion for operations that access the buffer free list or select
-buffers for replacement.  This is always taken in exclusive mode since
-there are no read-only operations on those data structures.  The buffer
-management policy is designed so that BufFreelistLock need not be taken
-except in paths that will require I/O, and thus will be slow anyway.
-(Details appear below.)  It is never necessary to hold the BufMappingLock
-and the BufFreelistLock at the same time.
+* BufferStrategyControl contains a spinlock, freelist_lck, that provides
+mutual exclusion for operations that access the buffer freelist or select
+buffers for replacement.  It also contains a second spinlock, victimbuf_lck,
+that protects the current clock sweep position.
 
 * Each buffer header contains a spinlock that must be taken when examining
 or changing fields of that buffer header.  This allows operations such as
@@ -160,16 +156,20 @@ Normal Buffer Replacement Strategy
 
 There is a "free list" of buffers that are prime candidates for replacement.
 In particular, buffers that are completely free (contain no valid page) are
-always in this list.  We could also throw buffers into this list if we
-consider their pages unlikely to be needed soon; however, the current
-algorithm never does that.  The list is singly-linked using fields in the
+always in this list.  Allocating pages from this list is much cheaper than
+running the "clock sweep" algorithm, which may encounter many buffers
+that are poor candidates for eviction before finding a good candidate.
+Therefore, we have a background process called bgreclaimer which works
+to keep this list populated.  The list is singly-linked using fields in the
 buffer headers; we maintain head and tail pointers in global variables.
 (Note: although the list links are in the buffer headers, they are
-considered to be protected by the BufFreelistLock, not the buffer-header
+considered to be protected by the freelist_lck, not the buffer-header
 spinlocks.)  To choose a victim buffer to recycle when there are no free
 buffers available, we use a simple clock-sweep algorithm, which avoids the
-need to take system-wide locks during common operations.  It works like
-this:
+need to take system-wide locks during common operations.  The background
+reclaimer attempts to keep regular backends from having to run the clock
+sweep by maintaining buffers on the freelist, but backends remain able to
+run the clock sweep themselves.  The clock sweep works like this:
 
 Each buffer header contains a usage counter, which is incremented (up to a
 small limit value) whenever the buffer is pinned.  (This requires only the
@@ -178,25 +178,28 @@ buffer reference count, so it's nearly free.)
 
 The "clock hand" is a buffer index, nextVictimBuffer, that moves circularly
 through all the available buffers.  nextVictimBuffer is protected by the
-BufFreelistLock.
+victimbuf_lck spinlock.
 
 The algorithm for a process that needs to obtain a victim buffer is:
 
-1. Obtain BufFreelistLock.
+1. Obtain spinlock freelist_lck.
 
-2. If buffer free list is nonempty, remove its head buffer.  If the buffer
-is pinned or has a nonzero usage count, it cannot be used; ignore it and
-return to the start of step 2.  Otherwise, pin the buffer, release
-BufFreelistLock, and return the buffer.
+2. If the buffer free list is nonempty, remove its head buffer and release
+freelist_lck.  Then set the bgwriter or bgreclaimer latch, if required.
 
-3. Otherwise, select the buffer pointed to by nextVictimBuffer, and
+3. If we got a buffer, check whether it is pinned or has a nonzero
+usage count.  If neither, pin the buffer and return it.  Otherwise,
+try again to take a buffer from the freelist, and return to the
+start of step 3.
+
+4. Otherwise, select the buffer pointed to by nextVictimBuffer, and
 circularly advance nextVictimBuffer for next time.
 
-4. If the selected buffer is pinned or has a nonzero usage count, it cannot
-be used.  Decrement its usage count (if nonzero) and return to step 3 to
+5. If the selected buffer is pinned or has a nonzero usage count, it cannot
+be used.  Decrement its usage count (if nonzero) and return to step 4 to
 examine the next buffer.
 
-5. Pin the selected buffer, release BufFreelistLock, and return the buffer.
+6. Pin the selected buffer, and return the buffer.
 
 (Note that if the selected buffer is dirty, we will have to write it out
 before we can recycle it; if someone else pins the buffer meanwhile we will
@@ -259,7 +262,7 @@ dirty and not pinned nor marked with a positive usage count.  It pins,
 writes, and releases any such buffer.
 
 If we can assume that reading nextVictimBuffer is an atomic action, then
-the writer doesn't even need to take the BufFreelistLock in order to look
+the writer doesn't even need to take the spinlock in order to look
 for buffers to write; it needs only to spinlock each buffer header for long
 enough to check the dirtybit.  Even without that assumption, the writer
 only needs to take the lock long enough to read the variable value, not
@@ -281,3 +284,38 @@ As of 8.4, background writer starts during recovery mode when there is
 some form of potentially extended recovery to perform. It performs an
 identical service to normal processing, except that checkpoints it
 writes are technically restartpoints.
+
+
+Background Reclaimer's Processing
+---------------------------------
+
+The background reclaimer runs the clock sweep to identify buffers that
+are good candidates for eviction and puts them on the freelist.  This
+makes buffer allocation much faster, since removing a buffer from the
+head of a linked list is much cheaper than linearly scanning the whole
+buffer pool until a promising candidate is found.  It's possible that
+a buffer we add to the freelist may be accessed or even pinned before
+it's evicted; if that happens, the backend that would have evicted it
+will simply disregard it and take the next buffer instead (or run the
+clock sweep itself, if necessary).  However, to make sure that doesn't
+happen too often, we need to keep the freelist as short as possible,
+so that there won't be many other buffer accesses between the time
+a buffer is added to the freelist and the time when it's actually
+evicted.
+
+We use two water marks to control the activity of the bgreclaimer
+process.  Each time bgreclaimer is awoken, it will move buffers to the
+freelist until the length of the free list reaches the high water
+mark.  It will then sleep.  When the number of buffers on the freelist
+reaches the low water mark, backends attempting to allocate new
+buffers will set the bgreclaimer's latch, waking it up again.  While
+it's important for the high water mark to be small (for the reasons
+described above), we also need to ensure adequate separation between
+the low and high water marks, so that the bgreclaimer isn't constantly
+being awoken to find just a handful of additional candidate buffers,
+and we need to ensure that the low water mark is adequate to keep the
+freelist from becoming completely empty before bgreclaimer has time to
+wake up and begin refilling it again.
+
+To execute the clock sweep, bgreclaimer advances the strategy point
+(nextVictimBuffer), whereas bgwriter always scans ahead of the strategy point.
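+
+In outline, each cycle of the bgreclaimer loop works like this (a
+simplified sketch of BgMoveBuffersToFreelist):
+
+1. Ask the freelist how many buffers are needed to reach the high water
+mark, and collect the buffer allocation statistics accumulated since the
+last check.
+
+2. If no buffers are needed, stop and go back to sleep.
+
+3. Otherwise, repeatedly advance nextVictimBuffer: if the buffer it
+pointed to is unpinned and has zero usage count, move it to the tail of
+the freelist; if it is unpinned but has a nonzero usage count, decrement
+that count; pinned buffers are simply skipped.  Once enough buffers have
+been moved, report the statistics to pgstat and return to step 1.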
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3240432..559d393 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -889,15 +889,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	/* Loop here in case we have to try another victim buffer */
 	for (;;)
 	{
-		bool		lock_held;
-
 		/*
 		 * Select a victim buffer.  The buffer is returned with its header
-		 * spinlock still held!  Also (in most cases) the BufFreelistLock is
-		 * still held, since it would be bad to hold the spinlock while
-		 * possibly waking up other processes.
+		 * spinlock still held!
 		 */
-		buf = StrategyGetBuffer(strategy, &lock_held);
+		buf = StrategyGetBuffer(strategy);
 
 		Assert(buf->refcount == 0);
 
@@ -907,10 +903,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		/* Pin the buffer and then release the buffer spinlock */
 		PinBuffer_Locked(buf);
 
-		/* Now it's safe to release the freelist lock */
-		if (lock_held)
-			LWLockRelease(BufFreelistLock);
-
 		/*
 		 * If the buffer was dirty, try to write it out.  There is a race
 		 * condition here, in that someone might dirty it after we released it
@@ -1933,6 +1925,92 @@ BgBufferSync(void)
 }
 
 /*
+ * Move buffers with a refcount and usage_count of zero to the freelist. By
+ * maintaining enough buffers in the freelist (up to the list's high water
+ * mark), we drastically reduce the likelihood of individual backends
+ * having to perform the clock sweep themselves.
+ *
+ * This is called by the background reclaimer process when the number of
+ * buffers on the freelist falls below its low water mark.
+ */
+void
+BgMoveBuffersToFreelist(void)
+{
+	uint32	num_needed_on_freelist = 0;
+	uint32	recent_alloc = 0;
+	uint32  recent_backend_clocksweep = 0;
+	volatile uint32	next_victim = 0;
+
+	/* Execute the clock sweep */
+	for (;;)
+	{
+		uint32	tmp_num_needed_on_freelist;
+		uint32	tmp_recent_alloc;
+		uint32  tmp_recent_backend_clocksweep;
+
+		StrategyGetFreelistAccessInfo(&tmp_num_needed_on_freelist,
+									  &tmp_recent_alloc,
+									  &tmp_recent_backend_clocksweep);
+
+		num_needed_on_freelist += tmp_num_needed_on_freelist;
+		recent_alloc += tmp_recent_alloc;
+		recent_backend_clocksweep += tmp_recent_backend_clocksweep;
+
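+		/* Freelist is already at its high water mark; nothing to do. */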
+		if (tmp_num_needed_on_freelist == 0)
+			break;
+
+		while (tmp_num_needed_on_freelist > 0)
+		{
+			volatile BufferDesc *bufHdr;
+			bool	add_to_freelist = false;
+
+			/*
+			 * Choose the next victim buffer and check whether it can be
+			 * moved to the freelist.
+			 */
+			StrategySyncNextVictimBuffer(&next_victim);
+
+			bufHdr = &BufferDescriptors[next_victim];
+
+			/*
+			 * If the buffer is pinned or has a nonzero usage_count, we cannot
+			 * move it to freelist; decrement the usage_count (unless pinned)
+			 * and keep scanning.
+			 */
+			LockBufHdr(bufHdr);
+			if (bufHdr->refcount == 0)
+			{
+				if (bufHdr->usage_count > 0)
+					bufHdr->usage_count--;
+				else
+					add_to_freelist = true;
+			}
+			UnlockBufHdr(bufHdr);
+
+			if (add_to_freelist && StrategyMoveBufferToFreeListTail(bufHdr))
+				tmp_num_needed_on_freelist--;
+		}
+
+		/*
+		 * Report the buffer allocation count and the count of allocation
+		 * requests not satisfied from the freelist to pgstat.  Use the
+		 * per-iteration values so each allocation is reported only once.
+		 */
+		BgWriterStats.m_buf_alloc += tmp_recent_alloc;
+		BgWriterStats.m_buf_backend_clocksweep += tmp_recent_backend_clocksweep;
+
+		/*
+		 * Send off activity statistics to the stats collector
+		 */
+		pgstat_send_bgwriter();
+	}
+
+#ifdef BGW_DEBUG
+	elog(DEBUG1, "bgreclaimer: recent_alloc=%u recent_backend_clocksweep=%u next_victim=%u num_freed=%u",
+		 recent_alloc, recent_backend_clocksweep, next_victim, num_needed_on_freelist);
+#endif
+}
+
+/*
  * SyncOneBuffer -- process a single buffer during syncing.
  *
  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 4befab0..e1d8445 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -29,6 +29,7 @@ typedef struct
 
 	int			firstFreeBuffer;	/* Head of list of unused buffers */
 	int			lastFreeBuffer; /* Tail of list of unused buffers */
+	int			numFreeListBuffers; /* number of buffers on freelist */
 
 	/*
 	 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
@@ -37,19 +38,48 @@ typedef struct
 
 	/*
 	 * Statistics.  These counters should be wide enough that they can't
-	 * overflow during a single bgwriter cycle.
+	 * overflow during a single bgwriter cycle.  completePasses is
+	 * consumed only by bgwriter and numBufferBackendClocksweep only by
+	 * bgreclaimer, while numBufferAllocs is consumed by both bgwriter
+	 * and bgreclaimer.
 	 */
 	uint32		completePasses; /* Complete cycles of the clock sweep */
 	uint32		numBufferAllocs;	/* Buffers allocated since last reset */
 
+	/* Buffer allocations not satisfied from freelist since last reset */
+	uint32		numBufferBackendClocksweep;
+
+	/*
+	 * protects freelist and related variables (firstFreeBuffer,
+	 * lastFreeBuffer, numBufferAllocs, numBufferBackendClocksweep,
+	 * numFreeListBuffers, BufferDesc->freeNext).
+	 */
+	slock_t	     freelist_lck;
+
 	/*
-	 * Notification latch, or NULL if none.  See StrategyNotifyBgWriter.
+	 * Protects nextVictimBuffer and completePasses.  We use a separate
+	 * lock for these so that the clock sweep of one backend doesn't
+	 * contend with another backend that is taking a buffer from the
+	 * freelist.  We could place victimbuf_lck and freelist_lck on
+	 * separate cache lines by keeping them apart in the structure and
+	 * adding padding bytes, but at the moment there is no evidence that
+	 * having them on the same cache line hurts performance in any
+	 * scenario.
+	 */
+	slock_t	     victimbuf_lck;
+
+	/*
+	 * Latch to wake bgwriter.
 	 */
 	Latch	   *bgwriterLatch;
+	/*
+	 * Latch to wake bgreclaimer.
+	 */
+	Latch	   *bgreclaimerLatch;
 } BufferStrategyControl;
 
 /* Pointers to shared state */
-static BufferStrategyControl *StrategyControl = NULL;
+static volatile BufferStrategyControl *StrategyControl = NULL;
 
 /*
  * Private (non-shared) state for managing a ring of shared buffers to re-use.
@@ -84,8 +114,28 @@ typedef struct BufferAccessStrategyData
 	Buffer		buffers[1];		/* VARIABLE SIZE ARRAY */
 }	BufferAccessStrategyData;
 
+/*
+ * Water mark indicators for maintaining buffers on freelist.  When the
+ * number of buffers on freelist drops below the low water mark, the
+ * allocating backend sets the latch and bgreclaimer wakes up and begins
+ * adding buffers to the freelist until it reaches the high water mark and
+ * then again goes back to sleep.
+ */
+int freelistLowWaterMark;
+int freelistHighWaterMark;
+
+/*
+ * Percentage indicators for maintaining buffers on freelist.  The high
+ * water mark is a percentage of the total number of buffers (NBuffers),
+ * and the low water mark is a percentage of the high water mark.
+ */
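+
+/*
+ * The computed high water mark is clamped to the range
+ * [MIN_HIGH_WATER_MARK, MAX_HIGH_WATER_MARK], so that very small or very
+ * large settings of shared_buffers still yield a sane freelist target.
+ */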
+#define HIGH_WATER_MARK_FREELIST_BUFFERS_PERCENT	0.005
+#define LOW_WATER_MARK_FREELIST_BUFFERS_PERCENT	0.2
+#define MIN_HIGH_WATER_MARK	5
+#define MAX_HIGH_WATER_MARK	2000
 
 /* Prototypes for internal functions */
+static volatile BufferDesc *GetBufferFromFreelist(BufferAccessStrategy strategy);
 static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
 static void AddBufferToRing(BufferAccessStrategy strategy,
 				volatile BufferDesc *buf);
@@ -101,103 +151,40 @@ static void AddBufferToRing(BufferAccessStrategy strategy,
  *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
  *
  *	To ensure that no one else can pin the buffer before we do, we must
- *	return the buffer with the buffer header spinlock still held.  If
- *	*lock_held is set on exit, we have returned with the BufFreelistLock
- *	still held, as well; the caller must release that lock once the spinlock
- *	is dropped.  We do it that way because releasing the BufFreelistLock
- *	might awaken other processes, and it would be bad to do the associated
- *	kernel calls while holding the buffer header spinlock.
+ *	return the buffer with the buffer header spinlock still held.
  */
 volatile BufferDesc *
-StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
+StrategyGetBuffer(BufferAccessStrategy strategy)
 {
-	volatile BufferDesc *buf;
-	Latch	   *bgwriterLatch;
+	volatile BufferDesc *buf = NULL;
 	int			trycounter;
 
 	/*
 	 * If given a strategy object, see whether it can select a buffer. We
-	 * assume strategy objects don't need the BufFreelistLock.
+	 * assume strategy objects don't need the freelist_lck.
 	 */
 	if (strategy != NULL)
 	{
 		buf = GetBufferFromRing(strategy);
 		if (buf != NULL)
-		{
-			*lock_held = false;
 			return buf;
-		}
 	}
 
-	/* Nope, so lock the freelist */
-	*lock_held = true;
-	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
-
-	/*
-	 * We count buffer allocation requests so that the bgwriter can estimate
-	 * the rate of buffer consumption.  Note that buffers recycled by a
-	 * strategy object are intentionally not counted here.
-	 */
-	StrategyControl->numBufferAllocs++;
-
-	/*
-	 * If bgwriterLatch is set, we need to waken the bgwriter, but we should
-	 * not do so while holding BufFreelistLock; so release and re-grab.  This
-	 * is annoyingly tedious, but it happens at most once per bgwriter cycle,
-	 * so the performance hit is minimal.
-	 */
-	bgwriterLatch = StrategyControl->bgwriterLatch;
-	if (bgwriterLatch)
-	{
-		StrategyControl->bgwriterLatch = NULL;
-		LWLockRelease(BufFreelistLock);
-		SetLatch(bgwriterLatch);
-		LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
-	}
-
-	/*
-	 * Try to get a buffer from the freelist.  Note that the freeNext fields
-	 * are considered to be protected by the BufFreelistLock not the
-	 * individual buffer spinlocks, so it's OK to manipulate them without
-	 * holding the spinlock.
-	 */
-	while (StrategyControl->firstFreeBuffer >= 0)
-	{
-		buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
-		Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
-
-		/* Unconditionally remove buffer from freelist */
-		StrategyControl->firstFreeBuffer = buf->freeNext;
-		buf->freeNext = FREENEXT_NOT_IN_LIST;
-
-		/*
-		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
-		 * it; discard it and retry.  (This can only happen if VACUUM put a
-		 * valid buffer in the freelist and then someone else used it before
-		 * we got to it.  It's probably impossible altogether as of 8.3, but
-		 * we'd better check anyway.)
-		 */
-		LockBufHdr(buf);
-		if (buf->refcount == 0 && buf->usage_count == 0)
-		{
-			if (strategy != NULL)
-				AddBufferToRing(strategy, buf);
-			return buf;
-		}
-		UnlockBufHdr(buf);
-	}
+	/* Nope, so get the buffer from freelist */
+	buf = GetBufferFromFreelist(strategy);
+	if (buf != NULL)
+		return buf;
 
 	/* Nothing on the freelist, so run the "clock sweep" algorithm */
 	trycounter = NBuffers;
+
 	for (;;)
 	{
-		buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];
+		volatile uint32	next_victim;
 
-		if (++StrategyControl->nextVictimBuffer >= NBuffers)
-		{
-			StrategyControl->nextVictimBuffer = 0;
-			StrategyControl->completePasses++;
-		}
+		StrategySyncNextVictimBuffer(&next_victim);
+
+		buf = &BufferDescriptors[next_victim];
 
 		/*
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
@@ -241,7 +228,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
 void
 StrategyFreeBuffer(volatile BufferDesc *buf)
 {
-	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&StrategyControl->freelist_lck);
 
 	/*
 	 * It is possible that we are told to put something in the freelist that
@@ -253,12 +240,51 @@ StrategyFreeBuffer(volatile BufferDesc *buf)
 		if (buf->freeNext < 0)
 			StrategyControl->lastFreeBuffer = buf->buf_id;
 		StrategyControl->firstFreeBuffer = buf->buf_id;
+		++StrategyControl->numFreeListBuffers;
 	}
 
-	LWLockRelease(BufFreelistLock);
+	SpinLockRelease(&StrategyControl->freelist_lck);
 }
 
 /*
+ * StrategyMoveBufferToFreeListTail: put a buffer on the end of freelist
+ */
+bool
+StrategyMoveBufferToFreeListTail(volatile BufferDesc *buf)
+{
+	bool		freed = false;
+
+	SpinLockAcquire(&StrategyControl->freelist_lck);
+
+	/*
+	 * It is possible that we are told to put something in the freelist that
+	 * is already in it; don't screw up the list if so.
+	 */
+	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
+	{
+		++StrategyControl->numFreeListBuffers;
+		freed = true;
+		/*
+		 * Put the buffer at the end of the list; if the list is empty,
+		 * point both the head and the tail at this buffer.
+		 */
+		buf->freeNext = FREENEXT_END_OF_LIST;
+		if (StrategyControl->firstFreeBuffer < 0)
+		{
+			StrategyControl->firstFreeBuffer = buf->buf_id;
+			StrategyControl->lastFreeBuffer = buf->buf_id;
+			SpinLockRelease(&StrategyControl->freelist_lck);
+			return freed;
+		}
+		BufferDescriptors[StrategyControl->lastFreeBuffer].freeNext = buf->buf_id;
+		StrategyControl->lastFreeBuffer = buf->buf_id;
+	}
+	SpinLockRelease(&StrategyControl->freelist_lck);
+
+	return freed;
+}
+
+
+/*
  * StrategySyncStart -- tell BufferSync where to start syncing
  *
  * The result is the buffer index of the best buffer to sync first.
@@ -274,20 +300,73 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 {
 	int			result;
 
-	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&StrategyControl->victimbuf_lck);
 	result = StrategyControl->nextVictimBuffer;
+
 	if (complete_passes)
 		*complete_passes = StrategyControl->completePasses;
+	SpinLockRelease(&StrategyControl->victimbuf_lck);
+
 	if (num_buf_alloc)
 	{
+		SpinLockAcquire(&StrategyControl->freelist_lck);
 		*num_buf_alloc = StrategyControl->numBufferAllocs;
 		StrategyControl->numBufferAllocs = 0;
+		SpinLockRelease(&StrategyControl->freelist_lck);
 	}
-	LWLockRelease(BufFreelistLock);
 	return result;
 }
 
 /*
+ * StrategyGetFreelistAccessInfo -- get information required by bgreclaimer
+ * to move unused buffers to freelist.
+ *
+ * The results are the number of buffers that need to be moved to the
+ * freelist, the count of recent buffer allocations, and the count of
+ * allocations not satisfied from the freelist.
+ */
+void
+StrategyGetFreelistAccessInfo(uint32 *num_buf_to_free, uint32 *num_buf_alloc,
+							  uint32 *num_buf_backend_clocksweep)
+{
+	int			curfreebuffers;
+
+	SpinLockAcquire(&StrategyControl->freelist_lck);
+	curfreebuffers = StrategyControl->numFreeListBuffers;
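+
+	/*
+	 * Ask for enough buffers to bring the freelist back up to its high
+	 * water mark.
+	 */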
+	if (curfreebuffers < freelistHighWaterMark)
+		*num_buf_to_free = freelistHighWaterMark - curfreebuffers;
+	else
+		*num_buf_to_free = 0;
+
+	*num_buf_alloc = StrategyControl->numBufferAllocs;
+	StrategyControl->numBufferAllocs = 0;
+
+	*num_buf_backend_clocksweep = StrategyControl->numBufferBackendClocksweep;
+	StrategyControl->numBufferBackendClocksweep = 0;
+
+	SpinLockRelease(&StrategyControl->freelist_lck);
+
+	return;
+}
+
+/*
+ * StrategySyncNextVictimBuffer -- advance nextVictimBuffer and report its
+ * previous position, telling the caller which buffer to examine next.
+ */
+void
+StrategySyncNextVictimBuffer(volatile uint32 *next_victim_buffer)
+{
+	SpinLockAcquire(&StrategyControl->victimbuf_lck);
+	*next_victim_buffer = StrategyControl->nextVictimBuffer;
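+
+	/* Advance the clock hand; when it wraps around, count a complete pass. */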
+	if (++StrategyControl->nextVictimBuffer >= NBuffers)
+	{
+		StrategyControl->nextVictimBuffer = 0;
+		StrategyControl->completePasses++;
+	}
+	SpinLockRelease(&StrategyControl->victimbuf_lck);
+}
+
+/*
  * StrategyNotifyBgWriter -- set or clear allocation notification latch
  *
  * If bgwriterLatch isn't NULL, the next invocation of StrategyGetBuffer will
@@ -299,15 +378,27 @@ void
 StrategyNotifyBgWriter(Latch *bgwriterLatch)
 {
 	/*
-	 * We acquire the BufFreelistLock just to ensure that the store appears
+	 * We acquire the freelist_lck just to ensure that the store appears
 	 * atomic to StrategyGetBuffer.  The bgwriter should call this rather
 	 * infrequently, so there's no performance penalty from being safe.
 	 */
-	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&StrategyControl->freelist_lck);
 	StrategyControl->bgwriterLatch = bgwriterLatch;
-	LWLockRelease(BufFreelistLock);
+	SpinLockRelease(&StrategyControl->freelist_lck);
 }
 
+/*
+ * StrategyInitBgReclaimerLatch -- register the bgreclaimer's latch, so
+ * that backends can set it to wake the bgreclaimer when the freelist
+ * runs low.
+ */
+void
+StrategyInitBgReclaimerLatch(Latch *bgreclaimerLatch)
+{
+	SpinLockAcquire(&StrategyControl->freelist_lck);
+	StrategyControl->bgreclaimerLatch = bgreclaimerLatch;
+	SpinLockRelease(&StrategyControl->freelist_lck);
+}
 
 /*
  * StrategyShmemSize
@@ -376,6 +467,7 @@ StrategyInitialize(bool init)
 		 */
 		StrategyControl->firstFreeBuffer = 0;
 		StrategyControl->lastFreeBuffer = NBuffers - 1;
+		StrategyControl->numFreeListBuffers = NBuffers;
 
 		/* Initialize the clock sweep pointer */
 		StrategyControl->nextVictimBuffer = 0;
@@ -383,12 +475,33 @@ StrategyInitialize(bool init)
 		/* Clear statistics */
 		StrategyControl->completePasses = 0;
 		StrategyControl->numBufferAllocs = 0;
+		StrategyControl->numBufferBackendClocksweep = 0;
 
 		/* No pending notification */
 		StrategyControl->bgwriterLatch = NULL;
+		StrategyControl->bgreclaimerLatch = NULL;
+		SpinLockInit(&StrategyControl->freelist_lck);
+		SpinLockInit(&StrategyControl->victimbuf_lck);
 	}
 	else
 		Assert(!init);
+
+	/*
+	 * Initialize the low and high water marks for the number of buffers
+	 * on the freelist.  These are used to keep enough buffers on the
+	 * freelist that backends seldom need to run the clock sweep
+	 * themselves to find a usable buffer.  The specific numbers are
+	 * based on the results of benchmarks on various workloads.
+	 */
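+	/*
+	 * For example, with 16384 shared buffers (128MB at the default 8kB
+	 * block size), the high water mark works out to 81 buffers and the
+	 * low water mark to 16.
+	 */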
+	freelistHighWaterMark = HIGH_WATER_MARK_FREELIST_BUFFERS_PERCENT * NBuffers;
+	if (freelistHighWaterMark < MIN_HIGH_WATER_MARK)
+		freelistHighWaterMark = MIN_HIGH_WATER_MARK;
+	else if (freelistHighWaterMark > MAX_HIGH_WATER_MARK)
+		freelistHighWaterMark = MAX_HIGH_WATER_MARK;
+
+	freelistLowWaterMark = LOW_WATER_MARK_FREELIST_BUFFERS_PERCENT *
+						   freelistHighWaterMark;
 }
 
 
@@ -467,6 +580,118 @@ FreeAccessStrategy(BufferAccessStrategy strategy)
 }
 
 /*
+ * GetBufferFromFreelist -- returns a buffer from the freelist, or NULL if the
+ *		freelist is empty.
+ *
+ * The buffer header spinlock is held on the returned buffer.
+ */
+static volatile BufferDesc *
+GetBufferFromFreelist(BufferAccessStrategy strategy)
+{
+	volatile BufferDesc *buf = NULL;
+	Latch	   *bgwriterLatch;
+	Latch	   *bgreclaimerLatch;
+	int			numFreeListBuffers;
+
+	SpinLockAcquire(&StrategyControl->freelist_lck);
+
+	/*
+	 * We count buffer allocation requests so that the bgwriter or bgreclaimer
+	 * can know the rate of buffer consumption and report it as stats.  Note
+	 * that buffers recycled by a strategy object are intentionally not counted
+	 * here.
+	 */
+	StrategyControl->numBufferAllocs++;
+
+	/*
+	 * Remember the bgwriter and bgreclaimer latches so that they can be set
+	 * after releasing the spinlock; then try to get a buffer from the freelist.
+	 */
+	bgreclaimerLatch = StrategyControl->bgreclaimerLatch;
+	bgwriterLatch = StrategyControl->bgwriterLatch;
+	if (bgwriterLatch)
+		StrategyControl->bgwriterLatch = NULL;
+
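+	/*
+	 * Also note the current freelist length while holding the lock, so
+	 * that the low water mark check below can be done after the lock is
+	 * released.
+	 */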
+	numFreeListBuffers = StrategyControl->numFreeListBuffers;
+
+	if (StrategyControl->firstFreeBuffer >= 0)
+	{
+		buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
+		Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+
+		/* Unconditionally remove buffer from freelist */
+		StrategyControl->firstFreeBuffer = buf->freeNext;
+		buf->freeNext = FREENEXT_NOT_IN_LIST;
+		--StrategyControl->numFreeListBuffers;
+	}
+	else
+		StrategyControl->numBufferBackendClocksweep++;
+
+	SpinLockRelease(&StrategyControl->freelist_lck);
+
+	/* If bgwriterLatch is set, we need to waken the bgwriter */
+	if (bgwriterLatch)
+		SetLatch(bgwriterLatch);
+
+	/*
+	 * If the number of free buffers has fallen below the low water mark,
+	 * awaken the bgreclaimer to repopulate it.  bgreclaimerLatch is
+	 * initialized early during bgreclaimer startup, but we still check it
+	 * before use in case we get here before it has been initialized.
+	 */
+	if (numFreeListBuffers < freelistLowWaterMark && bgreclaimerLatch)
+		SetLatch(bgreclaimerLatch);
+
+	if (buf != NULL)
+	{
+		/*
+		 * Check whether the buffer we took from the freelist is usable.
+		 * Note that the freeNext fields are considered to be protected by
+		 * freelist_lck, not the individual buffer spinlocks, so it's OK to
+		 * manipulate them without holding the buffer spinlock.
+		 */
+		for (;;)
+		{
+			/*
+			 * If the buffer is pinned or has a nonzero usage_count, we cannot use
+			 * it; discard it and retry.
+			 */
+			LockBufHdr(buf);
+			if (buf->refcount == 0 && buf->usage_count == 0)
+			{
+				if (strategy != NULL)
+					AddBufferToRing(strategy, buf);
+				return buf;
+			}
+			UnlockBufHdr(buf);
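+
+			/* The buffer was in use; try to take another from the freelist. */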
+
+			SpinLockAcquire(&StrategyControl->freelist_lck);
+
+			if (StrategyControl->firstFreeBuffer >= 0)
+			{
+				buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
+				Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+
+				/* Unconditionally remove buffer from freelist */
+				StrategyControl->firstFreeBuffer = buf->freeNext;
+				buf->freeNext = FREENEXT_NOT_IN_LIST;
+				--StrategyControl->numFreeListBuffers;
+
+				SpinLockRelease(&StrategyControl->freelist_lck);
+			}
+			else
+			{
+				StrategyControl->numBufferBackendClocksweep++;
+				SpinLockRelease(&StrategyControl->freelist_lck);
+				break;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/*
  * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
  *		ring is empty.
  *
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 44ccd37..00d815f 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -100,6 +100,7 @@ extern Datum pg_stat_get_bgwriter_stat_reset_time(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_buf_written_backend(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_buf_fsync_backend(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_buf_alloc(PG_FUNCTION_ARGS);
+extern Datum pg_stat_get_buf_clocksweep_backend(PG_FUNCTION_ARGS);
 
 extern Datum pg_stat_get_xact_numscans(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_xact_tuples_returned(PG_FUNCTION_ARGS);
@@ -1496,6 +1497,12 @@ pg_stat_get_buf_alloc(PG_FUNCTION_ARGS)
 }
 
 Datum
+pg_stat_get_buf_clocksweep_backend(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_INT64(pgstat_fetch_global()->buf_backend_clocksweep);
+}
+
+Datum
 pg_stat_get_xact_numscans(PG_FUNCTION_ARGS)
 {
 	Oid			relid = PG_GETARG_OID(0);
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index e66430d..b7efb3d 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -885,7 +885,7 @@ DATA(insert OID = 2334 (  array_agg_finalfn   PGNSP PGUID 12 1 0 0 0 f f f f f f
 DESCR("aggregate final function");
 DATA(insert OID = 2335 (  array_agg		   PGNSP PGUID 12 1 0 0 0 t f f f f f i 1 0 2277 "2283" _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
 DESCR("concatenate aggregate input into an array");
-DATA(insert OID = 3218 ( width_bucket	   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "2283 2277" _null_ _null_ _null_ _null_ width_bucket_array _null_ _null_ _null_ ));
+DATA(insert OID = 3154 ( width_bucket	   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "2283 2277" _null_ _null_ _null_ _null_ width_bucket_array _null_ _null_ _null_ ));
 DESCR("bucket number of operand given a sorted array of bucket lower bounds");
 DATA(insert OID = 3816 (  array_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ array_typanalyze _null_ _null_ _null_ ));
 DESCR("array typanalyze");
@@ -2783,6 +2783,8 @@ DATA(insert OID = 3063 ( pg_stat_get_buf_fsync_backend PGNSP PGUID 12 1 0 0 0 f
 DESCR("statistics: number of backend buffer writes that did their own fsync");
 DATA(insert OID = 2859 ( pg_stat_get_buf_alloc			PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 20 "" _null_ _null_ _null_ _null_ pg_stat_get_buf_alloc _null_ _null_ _null_ ));
 DESCR("statistics: number of buffer allocations");
+DATA(insert OID = 3218 ( pg_stat_get_buf_clocksweep_backend			PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 20 "" _null_ _null_ _null_ _null_ pg_stat_get_buf_clocksweep_backend _null_ _null_ _null_ ));
+DESCR("statistics: number of buffer allocations not satisfied from freelist");
 
 DATA(insert OID = 2978 (  pg_stat_get_function_calls		PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_function_calls _null_ _null_ _null_ ));
 DESCR("statistics: number of function calls");
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 3807955..8e58fb4 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -366,6 +366,7 @@ typedef enum
 	CheckpointerProcess,
 	WalWriterProcess,
 	WalReceiverProcess,
+	BgReclaimerProcess,
 
 	NUM_AUXPROCTYPES			/* Must be last! */
 } AuxProcType;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 0892533..51a2023 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -397,6 +397,7 @@ typedef struct PgStat_MsgBgWriter
 	PgStat_Counter m_buf_written_backend;
 	PgStat_Counter m_buf_fsync_backend;
 	PgStat_Counter m_buf_alloc;
+	PgStat_Counter m_buf_backend_clocksweep;
 	PgStat_Counter m_checkpoint_write_time;		/* times in milliseconds */
 	PgStat_Counter m_checkpoint_sync_time;
 } PgStat_MsgBgWriter;
@@ -545,7 +546,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9C
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9D
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -670,6 +671,7 @@ typedef struct PgStat_GlobalStats
 	PgStat_Counter buf_written_backend;
 	PgStat_Counter buf_fsync_backend;
 	PgStat_Counter buf_alloc;
+	PgStat_Counter buf_backend_clocksweep;
 	TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
diff --git a/src/include/postmaster/bgreclaimer.h b/src/include/postmaster/bgreclaimer.h
new file mode 100644
index 0000000..bbd6943
--- /dev/null
+++ b/src/include/postmaster/bgreclaimer.h
@@ -0,0 +1,18 @@
+/*-------------------------------------------------------------------------
+ *
+ * bgreclaimer.h
+ *	  POSTGRES buffer reclaimer definitions.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ *
+ * src/include/postmaster/bgreclaimer.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _BGRECLAIMER_H
+#define _BGRECLAIMER_H
+
+extern void BackgroundReclaimerMain(void) __attribute__((noreturn));
+
+
+#endif   /* _BGRECLAIMER_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index c019013..b57d95a 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -115,9 +115,8 @@ typedef struct buftag
  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
  * changes after initialization, so does not need locking.  freeNext is
- * protected by the BufFreelistLock not buf_hdr_lock.  The LWLocks can take
- * care of themselves.  The buf_hdr_lock is *not* used to control access to
- * the data in the buffer!
+ * protected by the freelist_lck not buf_hdr_lock.  The buf_hdr_lock is
+ * *not* used to control access to the data in the buffer!
  *
  * An exception is that if we have the buffer pinned, its tag can't change
  * underneath us, so we can examine the tag without locking the spinlock.
@@ -185,14 +184,19 @@ extern BufferDesc *LocalBufferDescriptors;
  */
 
 /* freelist.c */
-extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
-				  bool *lock_held);
+extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
 extern void StrategyFreeBuffer(volatile BufferDesc *buf);
+extern bool StrategyMoveBufferToFreeListTail(volatile BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 					 volatile BufferDesc *buf);
 
 extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
+extern void StrategyGetFreelistAccessInfo(uint32 *num_buf_to_free,
+										  uint32 *num_buf_alloc,
+										  uint32 *num_buf_backend_clocksweep);
+extern void StrategySyncNextVictimBuffer(volatile uint32 *next_victim_buffer);
 extern void StrategyNotifyBgWriter(Latch *bgwriterLatch);
+extern void StrategyInitBgReclaimerLatch(Latch *bgreclaimerLatch);
 
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 42d9120..da4f837 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -200,6 +200,7 @@ extern void AbortBufferIO(void);
 
 extern void BufmgrCommit(void);
 extern bool BgBufferSync(void);
+extern void BgMoveBuffersToFreelist(void);
 
 extern void AtProcExit_LocalBuffers(void);
 
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 1d90b9f..754a838 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -89,7 +89,6 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
  * if you remove a lock, consider leaving a gap in the numbering sequence for
  * the benefit of DTrace and other external debugging scripts.
  */
-#define BufFreelistLock				(&MainLWLockArray[0].lock)
 #define ShmemIndexLock				(&MainLWLockArray[1].lock)
 #define OidGenLock					(&MainLWLockArray[2].lock)
 #define XidGenLock					(&MainLWLockArray[3].lock)
@@ -136,7 +135,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
  */
 
 /* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS  16
+#define NUM_BUFFER_PARTITIONS  128
 
 /* Number of partitions the shared lock tables are divided into */
 #define LOG2_NUM_LOCK_PARTITIONS  4
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index c23f4da..b0688a8 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -215,11 +215,12 @@ extern PGPROC *PreparedXactProcs;
  * We set aside some extra PGPROC structures for auxiliary processes,
  * ie things that aren't full-fledged backends but need shmem access.
  *
- * Background writer, checkpointer and WAL writer run during normal operation.
- * Startup process and WAL receiver also consume 2 slots, but WAL writer is
- * launched only after startup has exited, so we only need 4 slots.
+ * Background writer, background reclaimer, checkpointer and WAL writer run
+ * during normal operation.  Startup process and WAL receiver also consume 2
+ * slots, but WAL writer is launched only after startup has exited, so we only
+ * need 5 slots.
  */
-#define NUM_AUXILIARY_PROCS		4
+#define NUM_AUXILIARY_PROCS		5
 
 
 /* configurable options */
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index ca56b47..939075e 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1671,6 +1671,7 @@ pg_stat_bgwriter| SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints
     pg_stat_get_buf_written_backend() AS buffers_backend,
     pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
     pg_stat_get_buf_alloc() AS buffers_alloc,
+    pg_stat_get_buf_clocksweep_backend() AS buffers_backend_clocksweep,
     pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 pg_stat_database| SELECT d.oid AS datid,
     d.datname,
