diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 5e9e735..48d642a 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -815,6 +815,12 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
       <entry>Number of buffers allocated</entry>
      </row>
      <row>
+      <entry><structfield>buffers_backend_clocksweep</></entry>
+      <entry><type>bigint</type></entry>
+      <entry>Number of buffer allocations that were not satisfied from
+      the freelist</entry>
+     </row>
+     <row>
       <entry><structfield>stats_reset</></entry>
       <entry><type>timestamp with time zone</type></entry>
       <entry>Time at which these statistics were last reset</entry>
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 4a542e6..38698b0 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -27,6 +27,7 @@
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "pg_getopt.h"
+#include "postmaster/bgreclaimer.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
 #include "postmaster/walwriter.h"
@@ -179,7 +180,8 @@ static IndexList *ILHead = NULL;
  *	 AuxiliaryProcessMain
  *
  *	 The main entry point for auxiliary processes, such as the bgwriter,
- *	 walwriter, walreceiver, bootstrapper and the shared memory checker code.
+ *	 walwriter, walreceiver, bgreclaimer, bootstrapper and the shared
+ *	 memory checker code.
  *
  *	 This code is here just because of historical reasons.
  */
@@ -323,6 +325,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			case WalReceiverProcess:
 				statmsg = "wal receiver process";
 				break;
+			case BgReclaimerProcess:
+				statmsg = "reclaimer process";
+				break;
 			default:
 				statmsg = "??? process";
 				break;
@@ -437,6 +442,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			WalReceiverMain();
 			proc_exit(1);		/* should never return */
 
+		case BgReclaimerProcess:
+			/* don't set signals, bgreclaimer has its own agenda */
+			BackgroundReclaimerMain();
+			proc_exit(1);		/* should never return */
+
 		default:
 			elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType);
 			proc_exit(1);
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 1bde175..f4717c6 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -712,6 +712,7 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_buf_written_backend() AS buffers_backend,
         pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
         pg_stat_get_buf_alloc() AS buffers_alloc,
+        pg_stat_get_buf_clocksweep_backend() AS buffers_backend_clocksweep,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
 CREATE VIEW pg_user_mappings AS
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 71c2321..168d0d8 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -12,7 +12,8 @@ subdir = src/backend/postmaster
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = autovacuum.o bgworker.o bgwriter.o checkpointer.o fork_process.o \
-	pgarch.o pgstat.o postmaster.o startup.o syslogger.o walwriter.o
+OBJS = autovacuum.o bgreclaimer.o bgworker.o bgwriter.o checkpointer.o \
+	fork_process.o pgarch.o pgstat.o postmaster.o startup.o syslogger.o \
+	walwriter.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/postmaster/bgreclaimer.c b/src/backend/postmaster/bgreclaimer.c
new file mode 100644
index 0000000..3df2337
--- /dev/null
+++ b/src/backend/postmaster/bgreclaimer.c
@@ -0,0 +1,302 @@
+/*-------------------------------------------------------------------------
+ *
+ * bgreclaimer.c
+ *
+ * The background reclaimer (bgreclaimer) is new as of Postgres 9.5.  It
+ * attempts to keep regular backends from having to run the clock sweep
+ * (which they would otherwise do whenever they cannot find a usable shared
+ * buffer on the freelist to read in another page).  In the best case all
+ * requests for shared buffers are fulfilled from the freelist, as the
+ * background reclaimer process continually tries to keep buffers on the
+ * freelist.  However, regular backends are still able to run the clock
+ * sweep themselves if the bgreclaimer fails to keep enough buffers on the
+ * freelist.
+ *
+ * The bgreclaimer is started by the postmaster as soon as the startup subprocess
+ * finishes, or as soon as recovery begins if we are doing archive recovery.
+ * It remains alive until the postmaster commands it to terminate.
+ * Normal termination is by SIGTERM, which instructs the bgreclaimer to exit(0).
+ * Emergency termination is by SIGQUIT; like any backend, the bgreclaimer will
+ * simply abort and exit on SIGQUIT.
+ *
+ * If the bgreclaimer exits unexpectedly, the postmaster treats that the same
+ * as a backend crash: shared memory may be corrupted, so remaining backends
+ * should be killed by SIGQUIT and then a recovery cycle started.
+ *
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/postmaster/bgreclaimer.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgreclaimer.h"
+#include "storage/bufmgr.h"
+#include "storage/buf_internals.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t shutdown_requested = false;
+
+/* Signal handlers */
+
+static void bgreclaim_quickdie(SIGNAL_ARGS);
+static void BgreclaimSigHupHandler(SIGNAL_ARGS);
+static void ReqShutdownHandler(SIGNAL_ARGS);
+static void bgreclaim_sigusr1_handler(SIGNAL_ARGS);
+
+
+/*
+ * Main entry point for bgreclaim process
+ *
+ * This is invoked from AuxiliaryProcessMain, which has already created the
+ * basic execution environment, but not enabled signals yet.
+ */
+void
+BackgroundReclaimerMain(void)
+{
+	sigjmp_buf	local_sigjmp_buf;
+	MemoryContext bgreclaim_context;
+
+	/*
+	 * If possible, make this process a group leader, so that the postmaster
+	 * can signal any child processes too.  (bgreclaim probably never has any
+	 * child processes, but for consistency we make all postmaster child
+	 * processes do this.)
+	 */
+#ifdef HAVE_SETSID
+	if (setsid() < 0)
+		elog(FATAL, "setsid() failed: %m");
+#endif
+
+	/*
+	 * Properly accept or ignore signals the postmaster might send us.
+	 *
+	 * bgreclaim doesn't participate in ProcSignal signalling, but a SIGUSR1
+	 * handler is still needed for latch wakeups.
+	 */
+	pqsignal(SIGHUP, BgreclaimSigHupHandler);	/* set flag to read config file */
+	pqsignal(SIGINT, SIG_IGN);
+	pqsignal(SIGTERM, ReqShutdownHandler);		/* shutdown */
+	pqsignal(SIGQUIT, bgreclaim_quickdie);		/* hard crash time */
+	pqsignal(SIGALRM, SIG_IGN);
+	pqsignal(SIGPIPE, SIG_IGN);
+	pqsignal(SIGUSR1, bgreclaim_sigusr1_handler);
+	pqsignal(SIGUSR2, SIG_IGN);
+
+	/*
+	 * Reset some signals that are accepted by postmaster but not here
+	 */
+	pqsignal(SIGCHLD, SIG_DFL);
+	pqsignal(SIGTTIN, SIG_DFL);
+	pqsignal(SIGTTOU, SIG_DFL);
+	pqsignal(SIGCONT, SIG_DFL);
+	pqsignal(SIGWINCH, SIG_DFL);
+
+	/* We allow SIGQUIT (quickdie) at all times */
+	sigdelset(&BlockSig, SIGQUIT);
+
+
+	/*
+	 * Create a memory context that we will do all our work in.  We do this so
+	 * that we can reset the context during error recovery and thereby avoid
+	 * possible memory leaks.  Currently, memory is allocated in this context
+	 * only while processing a SIGHUP.
+	 */
+	bgreclaim_context = AllocSetContextCreate(TopMemoryContext,
+											 "Background Reclaim",
+											 ALLOCSET_DEFAULT_MINSIZE,
+											 ALLOCSET_DEFAULT_INITSIZE,
+											 ALLOCSET_DEFAULT_MAXSIZE);
+	MemoryContextSwitchTo(bgreclaim_context);
+
+	/*
+	 * If an exception is encountered, processing resumes here.
+	 *
+	 * See notes in postgres.c about the design of this coding.
+	 */
+	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+	{
+		/* Since not using PG_TRY, must reset error stack by hand */
+		error_context_stack = NULL;
+
+		/* Prevent interrupts while cleaning up */
+		HOLD_INTERRUPTS();
+
+		/* Report the error to the server log */
+		EmitErrorReport();
+
+		/*
+		 * These operations are really just a minimal subset of
+		 * AbortTransaction().  We don't have very many resources to worry
+		 * about in bgreclaim, but we do have buffers and file descriptors.
+		 */
+		UnlockBuffers();
+		AtEOXact_Buffers(false);
+		AtEOXact_Files();
+
+		/*
+		 * Now return to normal top-level context and clear ErrorContext for
+		 * next time.
+		 */
+		MemoryContextSwitchTo(bgreclaim_context);
+		FlushErrorState();
+
+		/* Flush any leaked data in the top-level context */
+		MemoryContextResetAndDeleteChildren(bgreclaim_context);
+
+		/* Now we can allow interrupts again */
+		RESUME_INTERRUPTS();
+	}
+
+	/* We can now handle ereport(ERROR) */
+	PG_exception_stack = &local_sigjmp_buf;
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	StrategyInitBgReclaimerLatch(&MyProc->procLatch);
+
+	/*
+	 * Loop forever
+	 */
+	for (;;)
+	{
+		int			rc;
+
+		/* Clear any already-pending wakeups */
+		ResetLatch(&MyProc->procLatch);
+
+		if (got_SIGHUP)
+		{
+			got_SIGHUP = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+		if (shutdown_requested)
+		{
+			/*
+			 * From here on, elog(ERROR) should end with exit(1), not send
+			 * control back to the sigsetjmp block above
+			 */
+			ExitOnAnyError = true;
+			/* Normal exit from the bgreclaimer is here */
+			proc_exit(0);		/* done */
+		}
+
+		/*
+		 * Backends signal the bgreclaimer when the number of buffers on
+		 * the freelist falls below its low water mark.
+		 */
+		rc = WaitLatch(&MyProc->procLatch,
+					   WL_LATCH_SET | WL_POSTMASTER_DEATH,
+					   -1);
+
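+		/*
+		 * If the latch was set (normally because the freelist fell below
+		 * its low water mark), refill the freelist up to its high water
+		 * mark before sleeping again.
+		 */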
+		if (rc & WL_LATCH_SET)
+			BgMoveBuffersToFreelist();
+
+		/*
+		 * Send off activity statistics to the stats collector
+		 */
+		pgstat_send_bgwriter();
+
+		/*
+		 * Emergency bailout if postmaster has died.  This is to avoid the
+		 * necessity for manual cleanup of all postmaster children.
+		 */
+		if (rc & WL_POSTMASTER_DEATH)
+			exit(1);
+	}
+}
+
+
+/* --------------------------------
+ *		signal handler routines
+ * --------------------------------
+ */
+
+/*
+ * bgreclaim_quickdie() occurs when signalled SIGQUIT by the postmaster.
+ *
+ * Some backend has bought the farm,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+bgreclaim_quickdie(SIGNAL_ARGS)
+{
+	PG_SETMASK(&BlockSig);
+
+	/*
+	 * We DO NOT want to run proc_exit() callbacks -- we're here because
+	 * shared memory may be corrupted, so we don't want to try to clean up our
+	 * transaction.  Just nail the windows shut and get out of town.  Now that
+	 * there's an atexit callback to prevent third-party code from breaking
+	 * things by calling exit() directly, we have to reset the callbacks
+	 * explicitly to make this work as intended.
+	 */
+	on_exit_reset();
+
+	/*
+	 * Note we do exit(2) not exit(0).  This is to force the postmaster into a
+	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+	 * backend.  This is necessary precisely because we don't clean up our
+	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
+	 * should ensure the postmaster sees this as a crash, too, but no harm in
+	 * being doubly sure.)
+	 */
+	exit(2);
+}
+
+/* SIGHUP: set flag to re-read config file at next convenient time */
+static void
+BgreclaimSigHupHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	got_SIGHUP = true;
+	if (MyProc)
+		SetLatch(&MyProc->procLatch);
+
+	errno = save_errno;
+}
+
+/* SIGTERM: set flag to shutdown and exit */
+static void
+ReqShutdownHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	shutdown_requested = true;
+	if (MyProc)
+		SetLatch(&MyProc->procLatch);
+
+	errno = save_errno;
+}
+
+/* SIGUSR1: used for latch wakeups */
+static void
+bgreclaim_sigusr1_handler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	latch_sigusr1_handler();
+
+	errno = save_errno;
+}
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index c7f41a5..7475e5a 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -5021,6 +5021,7 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 	globalStats.buf_written_backend += msg->m_buf_written_backend;
 	globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
 	globalStats.buf_alloc += msg->m_buf_alloc;
+	globalStats.buf_backend_clocksweep += msg->m_buf_backend_clocksweep;
 }
 
 /* ----------
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 14535c8..565cf4b 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -143,13 +143,13 @@
  * authorization phase).  This is used mainly to keep track of how many
  * children we have and send them appropriate signals when necessary.
  *
- * "Special" children such as the startup, bgwriter and autovacuum launcher
- * tasks are not in this list.  Autovacuum worker and walsender are in it.
- * Also, "dead_end" children are in it: these are children launched just for
- * the purpose of sending a friendly rejection message to a would-be client.
- * We must track them because they are attached to shared memory, but we know
- * they will never become live backends.  dead_end children are not assigned a
- * PMChildSlot.
+ * "Special" children such as the startup, bgwriter, bgreclaimer and
+ * autovacuum launcher tasks are not in this list.  Autovacuum worker and
+ * walsender are in it.  Also, "dead_end" children are in it: these are
+ * children launched just for the purpose of sending a friendly rejection
+ * message to a would-be client.  We must track them because they are attached
+ * to shared memory, but we know they will never become live backends.
+ * dead_end children are not assigned a PMChildSlot.
  *
  * Background workers that request shared memory access during registration are
  * in this list, too.
@@ -243,7 +243,8 @@ static pid_t StartupPID = 0,
 			AutoVacPID = 0,
 			PgArchPID = 0,
 			PgStatPID = 0,
-			SysLoggerPID = 0;
+			SysLoggerPID = 0,
+			BgReclaimerPID = 0;
 
 /* Startup/shutdown state */
 #define			NoShutdown		0
@@ -269,13 +270,13 @@ static bool RecoveryError = false;		/* T if WAL recovery failed */
  * hot standby during archive recovery.
  *
  * When the startup process is ready to start archive recovery, it signals the
- * postmaster, and we switch to PM_RECOVERY state. The background writer and
- * checkpointer are launched, while the startup process continues applying WAL.
- * If Hot Standby is enabled, then, after reaching a consistent point in WAL
- * redo, startup process signals us again, and we switch to PM_HOT_STANDBY
- * state and begin accepting connections to perform read-only queries.  When
- * archive recovery is finished, the startup process exits with exit code 0
- * and we switch to PM_RUN state.
+ * postmaster, and we switch to PM_RECOVERY state. The background writer,
+ * background reclaimer and checkpointer are launched, while the startup
+ * process continues applying WAL.  If Hot Standby is enabled, then, after
+ * reaching a consistent point in WAL redo, startup process signals us again,
+ * and we switch to PM_HOT_STANDBY state and begin accepting connections to
+ * perform read-only queries.  When archive recovery is finished, the startup
+ * process exits with exit code 0 and we switch to PM_RUN state.
  *
  * Normal child backends can only be launched when we are in PM_RUN or
  * PM_HOT_STANDBY state.  (We also allow launch of normal
@@ -505,6 +506,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartCheckpointer()		StartChildProcess(CheckpointerProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()		StartChildProcess(WalReceiverProcess)
+#define StartBackgroundReclaimer() StartChildProcess(BgReclaimerProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -568,8 +570,8 @@ PostmasterMain(int argc, char *argv[])
 	 * handling setup of child processes.  See tcop/postgres.c,
 	 * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c,
 	 * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c,
-	 * postmaster/syslogger.c, postmaster/bgworker.c and
-	 * postmaster/checkpointer.c.
+	 * postmaster/syslogger.c, postmaster/bgworker.c, postmaster/bgreclaimer.c
+	 * and postmaster/checkpointer.c.
 	 */
 	pqinitmask();
 	PG_SETMASK(&BlockSig);
@@ -1583,7 +1585,8 @@ ServerLoop(void)
 		/*
 		 * If no background writer process is running, and we are not in a
 		 * state that prevents it, start one.  It doesn't matter if this
-		 * fails, we'll just try again later.  Likewise for the checkpointer.
+		 * fails, we'll just try again later.  Likewise for the checkpointer
+		 * and bgreclaimer.
 		 */
 		if (pmState == PM_RUN || pmState == PM_RECOVERY ||
 			pmState == PM_HOT_STANDBY)
@@ -1592,6 +1595,8 @@ ServerLoop(void)
 				CheckpointerPID = StartCheckpointer();
 			if (BgWriterPID == 0)
 				BgWriterPID = StartBackgroundWriter();
+			if (BgReclaimerPID == 0)
+				BgReclaimerPID = StartBackgroundReclaimer();
 		}
 
 		/*
@@ -2330,6 +2335,8 @@ SIGHUP_handler(SIGNAL_ARGS)
 			signal_child(SysLoggerPID, SIGHUP);
 		if (PgStatPID != 0)
 			signal_child(PgStatPID, SIGHUP);
+		if (BgReclaimerPID != 0)
+			signal_child(BgReclaimerPID, SIGHUP);
 
 		/* Reload authentication config files too */
 		if (!load_hba())
@@ -2398,6 +2405,9 @@ pmdie(SIGNAL_ARGS)
 				/* and the walwriter too */
 				if (WalWriterPID != 0)
 					signal_child(WalWriterPID, SIGTERM);
+				/* and the bgreclaimer too */
+				if (BgReclaimerPID != 0)
+					signal_child(BgReclaimerPID, SIGTERM);
 
 				/*
 				 * If we're in recovery, we can't kill the startup process
@@ -2440,14 +2450,16 @@ pmdie(SIGNAL_ARGS)
 				signal_child(BgWriterPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (BgReclaimerPID != 0)
+				signal_child(BgReclaimerPID, SIGTERM);
 			SignalUnconnectedWorkers(SIGTERM);
 			if (pmState == PM_RECOVERY)
 			{
 				/*
-				 * Only startup, bgwriter, walreceiver, unconnected bgworkers,
-				 * and/or checkpointer should be active in this state; we just
-				 * signaled the first four, and we don't want to kill
-				 * checkpointer yet.
+				 * Only startup, bgwriter, walreceiver, bgreclaimer,
+				 * unconnected bgworkers, and/or checkpointer should be
+				 * active in this state; we just signaled the first five,
+				 * and we don't want to kill checkpointer yet.
 				 */
 				pmState = PM_WAIT_BACKENDS;
 			}
@@ -2600,6 +2612,8 @@ reaper(SIGNAL_ARGS)
 				BgWriterPID = StartBackgroundWriter();
 			if (WalWriterPID == 0)
 				WalWriterPID = StartWalWriter();
+			if (BgReclaimerPID == 0)
+				BgReclaimerPID = StartBackgroundReclaimer();
 
 			/*
 			 * Likewise, start other special children as needed.  In a restart
@@ -2625,7 +2639,8 @@ reaper(SIGNAL_ARGS)
 		/*
 		 * Was it the bgwriter?  Normal exit can be ignored; we'll start a new
 		 * one at the next iteration of the postmaster's main loop, if
-		 * necessary.  Any other exit condition is treated as a crash.
+		 * necessary.  Any other exit condition is treated as a crash.  Likewise
+		 * for bgreclaimer.
 		 */
 		if (pid == BgWriterPID)
 		{
@@ -2636,6 +2651,17 @@ reaper(SIGNAL_ARGS)
 			continue;
 		}
 
+		if (pid == BgReclaimerPID)
+		{
+			BgReclaimerPID = 0;
+			if (!EXIT_STATUS_0(exitstatus))
+				HandleChildCrash(pid, exitstatus,
+								 _("background reclaimer process"));
+			continue;
+		}
+
 		/*
 		 * Was it the checkpointer?
 		 */
@@ -2997,7 +3023,7 @@ CleanupBackend(int pid,
 
 /*
  * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
- * walwriter, autovacuum, or background worker.
+ * walwriter, autovacuum, bgreclaimer, or background worker.
  *
  * The objectives here are to clean up our local state about the child
  * process, and to signal all other remaining children to quickdie.
@@ -3201,6 +3227,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
 	}
 
+	/* Take care of the bgreclaimer too */
+	if (pid == BgReclaimerPID)
+		BgReclaimerPID = 0;
+	else if (BgReclaimerPID != 0 && take_action)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) BgReclaimerPID)));
+		signal_child(BgReclaimerPID, (SendStop ? SIGSTOP : SIGQUIT));
+	}
+
 	/*
 	 * Force a power-cycle of the pgarch process too.  (This isn't absolutely
 	 * necessary, but it seems like a good idea for robustness, and it
@@ -3371,14 +3409,14 @@ PostmasterStateMachine(void)
 		/*
 		 * PM_WAIT_BACKENDS state ends when we have no regular backends
 		 * (including autovac workers), no bgworkers (including unconnected
-		 * ones), and no walwriter, autovac launcher or bgwriter.  If we are
-		 * doing crash recovery or an immediate shutdown then we expect the
-		 * checkpointer to exit as well, otherwise not. The archiver, stats,
-		 * and syslogger processes are disregarded since they are not
-		 * connected to shared memory; we also disregard dead_end children
-		 * here. Walsenders are also disregarded, they will be terminated
-		 * later after writing the checkpoint record, like the archiver
-		 * process.
+		 * ones), and no walwriter, autovac launcher, bgwriter or bgreclaimer.
+		 * If we are doing crash recovery or an immediate shutdown then we
+		 * expect the checkpointer to exit as well, otherwise not. The
+		 * archiver, stats, and syslogger processes are disregarded since they
+		 * are not connected to shared memory; we also disregard dead_end
+		 * children here. Walsenders are also disregarded, they will be
+		 * terminated later after writing the checkpoint record, like the
+		 * archiver process.
 		 */
 		if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_WORKER) == 0 &&
 			CountUnconnectedWorkers() == 0 &&
@@ -3388,7 +3426,8 @@ PostmasterStateMachine(void)
 			(CheckpointerPID == 0 ||
 			 (!FatalError && Shutdown < ImmediateShutdown)) &&
 			WalWriterPID == 0 &&
-			AutoVacPID == 0)
+			AutoVacPID == 0 &&
+			BgReclaimerPID == 0)
 		{
 			if (Shutdown >= ImmediateShutdown || FatalError)
 			{
@@ -3486,6 +3525,7 @@ PostmasterStateMachine(void)
 			Assert(CheckpointerPID == 0);
 			Assert(WalWriterPID == 0);
 			Assert(AutoVacPID == 0);
+			Assert(BgReclaimerPID == 0);
 			/* syslogger is not considered here */
 			pmState = PM_NO_CHILDREN;
 		}
@@ -3698,6 +3738,8 @@ TerminateChildren(int signal)
 		signal_child(WalReceiverPID, signal);
 	if (AutoVacPID != 0)
 		signal_child(AutoVacPID, signal);
+	if (BgReclaimerPID != 0)
+		signal_child(BgReclaimerPID, signal);
 	if (PgArchPID != 0)
 		signal_child(PgArchPID, signal);
 	if (PgStatPID != 0)
@@ -4778,6 +4820,8 @@ sigusr1_handler(SIGNAL_ARGS)
 		CheckpointerPID = StartCheckpointer();
 		Assert(BgWriterPID == 0);
 		BgWriterPID = StartBackgroundWriter();
+		Assert(BgReclaimerPID == 0);
+		BgReclaimerPID = StartBackgroundReclaimer();
 
 		pmState = PM_RECOVERY;
 	}
@@ -5122,6 +5166,10 @@ StartChildProcess(AuxProcType type)
 				ereport(LOG,
 						(errmsg("could not fork WAL receiver process: %m")));
 				break;
+			case BgReclaimerProcess:
+				ereport(LOG,
+				   (errmsg("could not fork background writer process: %m")));
+				break;
 			default:
 				ereport(LOG,
 						(errmsg("could not fork process: %m")));
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index 1fd38d0..dfb9cb5 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -125,14 +125,10 @@ bits of the tag's hash value.  The rules stated above apply to each partition
 independently.  If it is necessary to lock more than one partition at a time,
 they must be locked in partition-number order to avoid risk of deadlock.
 
-* A separate system-wide LWLock, the BufFreelistLock, provides mutual
-exclusion for operations that access the buffer free list or select
-buffers for replacement.  This is always taken in exclusive mode since
-there are no read-only operations on those data structures.  The buffer
-management policy is designed so that BufFreelistLock need not be taken
-except in paths that will require I/O, and thus will be slow anyway.
-(Details appear below.)  It is never necessary to hold the BufMappingLock
-and the BufFreelistLock at the same time.
+* BufferStrategyControl contains a spinlock, freelist_lck, that provides
+mutual exclusion for operations that access the buffer freelist.  It also
+contains a second spinlock, victimbuf_lck, that protects the clock sweep's
+current position (nextVictimBuffer and completePasses).
 
 * Each buffer header contains a spinlock that must be taken when examining
 or changing fields of that buffer header.  This allows operations such as
@@ -160,16 +156,20 @@ Normal Buffer Replacement Strategy
 
 There is a "free list" of buffers that are prime candidates for replacement.
 In particular, buffers that are completely free (contain no valid page) are
-always in this list.  We could also throw buffers into this list if we
-consider their pages unlikely to be needed soon; however, the current
-algorithm never does that.  The list is singly-linked using fields in the
+always in this list.  Allocating pages from this list is much cheaper than
+running the "clock sweep" algorithm, which may encounter many buffers
+that are poor candidates for eviction before finding a good candidate.
+Therefore, we have a background process called bgreclaimer which works
+to keep this list populated.  The list is singly-linked using fields in the
 buffer headers; we maintain head and tail pointers in global variables.
 (Note: although the list links are in the buffer headers, they are
-considered to be protected by the BufFreelistLock, not the buffer-header
+considered to be protected by the freelist_lck, not the buffer-header
 spinlocks.)  To choose a victim buffer to recycle when there are no free
 buffers available, we use a simple clock-sweep algorithm, which avoids the
-need to take system-wide locks during common operations.  It works like
-this:
+need to take system-wide locks during common operations.  The background
+reclaimer attempts to keep regular backends from having to run the clock
+sweep by maintaining buffers on the freelist; however, backends can still
+run the clock sweep themselves.  The clock sweep works like this:
 
 Each buffer header contains a usage counter, which is incremented (up to a
 small limit value) whenever the buffer is pinned.  (This requires only the
@@ -178,25 +178,28 @@ buffer reference count, so it's nearly free.)
 
 The "clock hand" is a buffer index, nextVictimBuffer, that moves circularly
 through all the available buffers.  nextVictimBuffer is protected by the
-BufFreelistLock.
+victimbuf_lck spinlock.
 
 The algorithm for a process that needs to obtain a victim buffer is:
 
-1. Obtain BufFreelistLock.
+1. Obtain spinlock freelist_lck.
 
-2. If buffer free list is nonempty, remove its head buffer.  If the buffer
-is pinned or has a nonzero usage count, it cannot be used; ignore it and
-return to the start of step 2.  Otherwise, pin the buffer, release
-BufFreelistLock, and return the buffer.
+2. If the buffer free list is nonempty, remove its head buffer and release
+freelist_lck.  Then set the bgwriter or bgreclaimer latch if required.
 
-3. Otherwise, select the buffer pointed to by nextVictimBuffer, and
+3. If we got a buffer and it is neither pinned nor has a nonzero usage
+count, pin it and return it.  Otherwise, take another buffer from the
+freelist as in step 2 and repeat this step; if the freelist is empty,
+proceed to step 4.
+
+4. Otherwise, select the buffer pointed to by nextVictimBuffer, and
 circularly advance nextVictimBuffer for next time.
 
-4. If the selected buffer is pinned or has a nonzero usage count, it cannot
-be used.  Decrement its usage count (if nonzero) and return to step 3 to
+5. If the selected buffer is pinned or has a nonzero usage count, it cannot
+be used.  Decrement its usage count (if nonzero) and return to step 4 to
 examine the next buffer.
 
-5. Pin the selected buffer, release BufFreelistLock, and return the buffer.
+6. Pin the selected buffer and return it.
 
 (Note that if the selected buffer is dirty, we will have to write it out
 before we can recycle it; if someone else pins the buffer meanwhile we will
@@ -259,7 +262,7 @@ dirty and not pinned nor marked with a positive usage count.  It pins,
 writes, and releases any such buffer.
 
 If we can assume that reading nextVictimBuffer is an atomic action, then
-the writer doesn't even need to take the BufFreelistLock in order to look
+the writer doesn't even need to take victimbuf_lck in order to look
 for buffers to write; it needs only to spinlock each buffer header for long
 enough to check the dirtybit.  Even without that assumption, the writer
 only needs to take the lock long enough to read the variable value, not
@@ -281,3 +284,19 @@ As of 8.4, background writer starts during recovery mode when there is
 some form of potentially extended recovery to perform. It performs an
 identical service to normal processing, except that checkpoints it
 writes are technically restartpoints.
+
+
+Background Reclaimer's Processing
+---------------------------------
+
+The background reclaimer is designed to move buffers that are good
+candidates for reuse onto the freelist, thereby offloading clock sweep
+work from active backends.  To do this, it runs the clock sweep and moves
+unpinned buffers with zero usage count to the freelist.  It keeps doing
+this until the number of buffers on the freelist reaches the high water
+mark.
+
+Two water marks are used to maintain a sufficient number of buffers on the
+freelist.  When the number of buffers on the freelist falls below the low
+water mark, the allocating backend wakes the bgreclaimer.  The bgreclaimer
+then moves buffers to the freelist until it reaches the high water mark.
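+
+For example, with the current hard-coded fractions in freelist.c (the high
+water mark is 0.005 of NBuffers, clamped to lie between 5 and 2000, and the
+low water mark is 0.2 of the high water mark), shared_buffers = 128MB gives
+NBuffers = 16384, a high water mark of 81 buffers, and a low water mark of
+16 buffers.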
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3240432..7df657c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -889,15 +889,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	/* Loop here in case we have to try another victim buffer */
 	for (;;)
 	{
-		bool		lock_held;
-
 		/*
 		 * Select a victim buffer.  The buffer is returned with its header
-		 * spinlock still held!  Also (in most cases) the BufFreelistLock is
-		 * still held, since it would be bad to hold the spinlock while
-		 * possibly waking up other processes.
+		 * spinlock still held!
 		 */
-		buf = StrategyGetBuffer(strategy, &lock_held);
+		buf = StrategyGetBuffer(strategy);
 
 		Assert(buf->refcount == 0);
 
@@ -907,10 +903,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		/* Pin the buffer and then release the buffer spinlock */
 		PinBuffer_Locked(buf);
 
-		/* Now it's safe to release the freelist lock */
-		if (lock_held)
-			LWLockRelease(BufFreelistLock);
-
 		/*
 		 * If the buffer was dirty, try to write it out.  There is a race
 		 * condition here, in that someone might dirty it after we released it
@@ -1933,6 +1925,93 @@ BgBufferSync(void)
 }
 
 /*
+ * Move buffers with zero reference count and zero usage_count to the
+ * freelist.  By keeping enough buffers on the freelist (up to its high
+ * water mark), we drastically reduce the odds that backends have to run
+ * the clock sweep themselves.
+ *
+ * This is called by the background reclaimer process when the number of
+ * buffers on the freelist falls below the low water mark.
+ */
+void
+BgMoveBuffersToFreelist(void)
+{
+	volatile uint32	next_to_clean;
+	uint32	tmp_num_to_free;
+	uint32	num_to_free = 0;
+	uint32  tmp_next_to_clean;
+	volatile BufferDesc *bufHdr;
+	uint32	tmp_recent_alloc;
+	uint32	recent_alloc = 0;
+	uint32  tmp_recent_backend_clocksweep;
+	uint32  recent_backend_clocksweep = 0;
+
+	/* Execute the clock sweep */
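+	/*
+	 * Each outer iteration calls StrategySyncStartAndEnd to learn where to
+	 * resume the sweep and how many buffers are still needed to reach the
+	 * high water mark; we keep sweeping until no more buffers are needed.
+	 */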
+	for (;;)
+	{
+		StrategySyncStartAndEnd(&tmp_next_to_clean,
+								&tmp_num_to_free,
+								&tmp_recent_alloc,
+								&tmp_recent_backend_clocksweep);
+
+		num_to_free += tmp_num_to_free;
+		recent_alloc += tmp_recent_alloc;
+		recent_backend_clocksweep += tmp_recent_backend_clocksweep;
+		next_to_clean = tmp_next_to_clean;
+
+		if (tmp_num_to_free == 0)
+			break;
+
+		while (tmp_num_to_free > 0)
+		{
+			bufHdr = &BufferDescriptors[next_to_clean];
+
+			LockBufHdr(bufHdr);
+
+			if (bufHdr->refcount == 0)
+			{
+				if (bufHdr->usage_count > 0)
+				{
+					/*
+					 * Reduce usage count so that we can find the reusable
+					 * buffers in later cycles.
+					 */
+					bufHdr->usage_count--;
+					UnlockBufHdr(bufHdr);
+				}
+				else
+				{
+					UnlockBufHdr(bufHdr);
+					if (StrategyMoveBufferToFreeListEnd(bufHdr))
+						tmp_num_to_free--;
+				}
+			}
+			else
+				UnlockBufHdr(bufHdr);
+
+			/*
+			 * Advance to the next victim buffer and check whether it can
+			 * be moved to the freelist.
+			 */
+			StrategySyncNextVictimBuffer(&next_to_clean);
+		}
+	}
+
+	/*
+	 * Report the counts of buffer allocations and of buffer requests not
+	 * satisfied from the freelist to pgstat.
+	 */
+	BgWriterStats.m_buf_alloc += recent_alloc;
+	BgWriterStats.m_buf_backend_clocksweep += recent_backend_clocksweep;
+
+#ifdef BGW_DEBUG
+	elog(DEBUG1, "bgreclaimer: recent_alloc=%u recent_backend_clocksweep =%d next_to_clean=%d num_freed=%u",
+		 recent_alloc, recent_backend_clocksweep, next_to_clean, num_to_free);
+#endif
+}
+
+/*
  * SyncOneBuffer -- process a single buffer during syncing.
  *
  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 4befab0..c4ee126 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -29,6 +29,7 @@ typedef struct
 
 	int			firstFreeBuffer;	/* Head of list of unused buffers */
 	int			lastFreeBuffer; /* Tail of list of unused buffers */
+	int			numFreeListBuffers; /* number of buffers on freelist */
 
 	/*
 	 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
@@ -37,19 +38,48 @@ typedef struct
 
 	/*
 	 * Statistics.  These counters should be wide enough that they can't
-	 * overflow during a single bgwriter cycle.
+	 * overflow during a single bgwriter cycle.  completePasses is reported
+	 * only to the bgwriter, numBufferBackendClocksweep only to the
+	 * bgreclaimer, while numBufferAllocs is reported to both the bgwriter
+	 * and the bgreclaimer.
 	 */
 	uint32		completePasses; /* Complete cycles of the clock sweep */
 	uint32		numBufferAllocs;	/* Buffers allocated since last reset */
 
+	/* Buffer allocations not satisfied from freelist since last reset */
+	uint32		numBufferBackendClocksweep;
+
+	/*
+	 * Protects the freelist and related variables (firstFreeBuffer,
+	 * lastFreeBuffer, numBufferAllocs, numBufferBackendClocksweep,
+	 * numFreeListBuffers, BufferDesc->freeNext).
+	 */
+	slock_t	     freelist_lck;
+
 	/*
-	 * Notification latch, or NULL if none.  See StrategyNotifyBgWriter.
+	 * Protects nextVictimBuffer and completePasses.  We use a separate
+	 * lock for these so that one backend's clock sweep doesn't contend
+	 * with another backend taking a buffer off the freelist.  We could
+	 * place victimbuf_lck and freelist_lck in separate cache lines by
+	 * keeping them apart in the structure and adding padding bytes;
+	 * however, at the moment there is no evidence that having them in the
+	 * same cache line hurts performance in any scenario.
+	 */
+	slock_t	     victimbuf_lck;
+
+	/*
+	 * Latch to wake the bgwriter, or NULL if none.  See
+	 * StrategyNotifyBgWriter.
+	 */
 	Latch	   *bgwriterLatch;
+
+	/* Latch to wake the bgreclaimer.  See StrategyInitBgReclaimerLatch. */
+	Latch	   *bgreclaimerLatch;
 } BufferStrategyControl;
 
 /* Pointers to shared state */
-static BufferStrategyControl *StrategyControl = NULL;
+static volatile BufferStrategyControl *StrategyControl = NULL;
 
 /*
  * Private (non-shared) state for managing a ring of shared buffers to re-use.
@@ -84,6 +114,19 @@ typedef struct BufferAccessStrategyData
 	Buffer		buffers[1];		/* VARIABLE SIZE ARRAY */
 }	BufferAccessStrategyData;
 
+/*
+ * Water marks for maintaining buffers on the freelist.  When the number
+ * of buffers on the freelist drops below the low water mark, the
+ * allocating backend sets the latch; the bgreclaimer wakes up, adds
+ * buffers to the freelist until it reaches the high water mark, and then
+ * goes back to sleep.
+ */
+int freelistLowWaterMark;
+int freelistHighWaterMark;
+
+/* Fractions used to compute the freelist water marks */
+#define HIGH_WATER_MARK_FREELIST_BUFFERS_PERCENT	0.005
+#define LOW_WATER_MARK_FREELIST_BUFFERS_PERCENT	0.2
 
 /* Prototypes for internal functions */
 static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
@@ -101,67 +144,51 @@ static void AddBufferToRing(BufferAccessStrategy strategy,
  *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
  *
  *	To ensure that no one else can pin the buffer before we do, we must
- *	return the buffer with the buffer header spinlock still held.  If
- *	*lock_held is set on exit, we have returned with the BufFreelistLock
- *	still held, as well; the caller must release that lock once the spinlock
- *	is dropped.  We do it that way because releasing the BufFreelistLock
- *	might awaken other processes, and it would be bad to do the associated
- *	kernel calls while holding the buffer header spinlock.
+ *	return the buffer with the buffer header spinlock still held.
  */
 volatile BufferDesc *
-StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
+StrategyGetBuffer(BufferAccessStrategy strategy)
 {
-	volatile BufferDesc *buf;
+	volatile BufferDesc *buf = NULL;
 	Latch	   *bgwriterLatch;
+	Latch	   *bgreclaimerLatch;
+	int			numFreeListBuffers;
 	int			trycounter;
 
 	/*
 	 * If given a strategy object, see whether it can select a buffer. We
-	 * assume strategy objects don't need the BufFreelistLock.
+	 * assume strategy objects don't need the freelist_lck.
 	 */
 	if (strategy != NULL)
 	{
 		buf = GetBufferFromRing(strategy);
 		if (buf != NULL)
-		{
-			*lock_held = false;
 			return buf;
-		}
 	}
 
 	/* Nope, so lock the freelist */
-	*lock_held = true;
-	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&StrategyControl->freelist_lck);
 
 	/*
-	 * We count buffer allocation requests so that the bgwriter can estimate
-	 * the rate of buffer consumption.  Note that buffers recycled by a
-	 * strategy object are intentionally not counted here.
+	 * We count buffer allocation requests so that the bgwriter and bgreclaimer
+	 * can estimate the rate of buffer consumption and report it as stats.  Note
+	 * that buffers recycled by a strategy object are intentionally not counted
+	 * here.
 	 */
 	StrategyControl->numBufferAllocs++;
 
 	/*
-	 * If bgwriterLatch is set, we need to waken the bgwriter, but we should
-	 * not do so while holding BufFreelistLock; so release and re-grab.  This
-	 * is annoyingly tedious, but it happens at most once per bgwriter cycle,
-	 * so the performance hit is minimal.
+	 * Remember the bgwriter and bgreclaimer latches so that they can be set
+	 * after the spinlock is released, then try to get a buffer from the
+	 * freelist.
 	 */
+	bgreclaimerLatch = StrategyControl->bgreclaimerLatch;
 	bgwriterLatch = StrategyControl->bgwriterLatch;
 	if (bgwriterLatch)
-	{
 		StrategyControl->bgwriterLatch = NULL;
-		LWLockRelease(BufFreelistLock);
-		SetLatch(bgwriterLatch);
-		LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
-	}
 
-	/*
-	 * Try to get a buffer from the freelist.  Note that the freeNext fields
-	 * are considered to be protected by the BufFreelistLock not the
-	 * individual buffer spinlocks, so it's OK to manipulate them without
-	 * holding the spinlock.
-	 */
-	while (StrategyControl->firstFreeBuffer >= 0)
+	numFreeListBuffers = StrategyControl->numFreeListBuffers;
+
+	if (StrategyControl->firstFreeBuffer >= 0)
 	{
 		buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
 		Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
@@ -169,28 +196,86 @@ StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
 		/* Unconditionally remove buffer from freelist */
 		StrategyControl->firstFreeBuffer = buf->freeNext;
 		buf->freeNext = FREENEXT_NOT_IN_LIST;
+		--StrategyControl->numFreeListBuffers;
+	}
+	else
+		StrategyControl->numBufferBackendClocksweep++;
+
+	SpinLockRelease(&StrategyControl->freelist_lck);
+
+	/*
+	 * If bgwriterLatch is set, we need to waken the bgwriter, but we should
+	 * not do so while holding freelist_lck; so set it after releasing the
+	 * freelist_lck.  This is annoyingly tedious, but it happens at most once
+	 * per bgwriter cycle, so the performance hit is minimal.
+	 */
+	if (bgwriterLatch)
+		SetLatch(bgwriterLatch);
+
+	/*
+	 * numFreeListBuffers was read under the freelist spinlock, but it may be
+	 * stale by the time we test it here; since we only need an approximate
+	 * count to decide whether the freelist wants refilling, that is not a
+	 * problem.  bgreclaimerLatch is initialized early during bgreclaimer
+	 * startup, but we still check it before use in case we reach here before
+	 * its initialization.
+	 */
+	if (numFreeListBuffers < freelistLowWaterMark && bgreclaimerLatch)
+		SetLatch(bgreclaimerLatch);
 
+	if (buf != NULL)
+	{
 		/*
-		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
-		 * it; discard it and retry.  (This can only happen if VACUUM put a
-		 * valid buffer in the freelist and then someone else used it before
-		 * we got to it.  It's probably impossible altogether as of 8.3, but
-		 * we'd better check anyway.)
+		 * We pulled a buffer off the freelist above; check whether it is
+		 * usable, and if not keep pulling buffers off the freelist until we
+		 * find one that is, or the list becomes empty.  Note that the
+		 * freeNext fields are considered to be protected by freelist_lck,
+		 * not the individual buffer spinlocks, so it's OK to manipulate them
+		 * without holding the buffer spinlock.
 		 */
-		LockBufHdr(buf);
-		if (buf->refcount == 0 && buf->usage_count == 0)
+		for(;;)
 		{
-			if (strategy != NULL)
-				AddBufferToRing(strategy, buf);
-			return buf;
+			/*
+			 * If the buffer is pinned or has a nonzero usage_count, we cannot use
+			 * it; discard it and retry.
+			 */
+			LockBufHdr(buf);
+			if (buf->refcount == 0 && buf->usage_count == 0)
+			{
+				if (strategy != NULL)
+					AddBufferToRing(strategy, buf);
+				return buf;
+			}
+			UnlockBufHdr(buf);
+
+			SpinLockAcquire(&StrategyControl->freelist_lck);
+
+			if (StrategyControl->firstFreeBuffer >= 0)
+			{
+				buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
+				Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+
+				/* Unconditionally remove buffer from freelist */
+				StrategyControl->firstFreeBuffer = buf->freeNext;
+				buf->freeNext = FREENEXT_NOT_IN_LIST;
+				--StrategyControl->numFreeListBuffers;
+
+				SpinLockRelease(&StrategyControl->freelist_lck);
+			}
+			else
+			{
+				StrategyControl->numBufferBackendClocksweep++;
+				SpinLockRelease(&StrategyControl->freelist_lck);
+				break;
+			}
 		}
-		UnlockBufHdr(buf);
 	}
 
 	/* Nothing on the freelist, so run the "clock sweep" algorithm */
 	trycounter = NBuffers;
+
 	for (;;)
 	{
+		SpinLockAcquire(&StrategyControl->victimbuf_lck);
+
 		buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];
 
 		if (++StrategyControl->nextVictimBuffer >= NBuffers)
@@ -199,6 +284,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
 			StrategyControl->completePasses++;
 		}
 
+		SpinLockRelease(&StrategyControl->victimbuf_lck);
+
 		/*
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
@@ -241,7 +328,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
 void
 StrategyFreeBuffer(volatile BufferDesc *buf)
 {
-	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&StrategyControl->freelist_lck);
 
 	/*
 	 * It is possible that we are told to put something in the freelist that
@@ -253,12 +340,51 @@ StrategyFreeBuffer(volatile BufferDesc *buf)
 		if (buf->freeNext < 0)
 			StrategyControl->lastFreeBuffer = buf->buf_id;
 		StrategyControl->firstFreeBuffer = buf->buf_id;
+		++StrategyControl->numFreeListBuffers;
 	}
 
-	LWLockRelease(BufFreelistLock);
+	SpinLockRelease(&StrategyControl->freelist_lck);
 }
 
 /*
+ * StrategyMoveBufferToFreeListEnd: put a buffer at the end of the freelist
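+ *
+ * Returns true if the buffer was added to the freelist, false if it was
+ * already on it.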
+ */
+bool
+StrategyMoveBufferToFreeListEnd(volatile BufferDesc *buf)
+{
+	bool		freed = false;
+
+	SpinLockAcquire(&StrategyControl->freelist_lck);
+
+	/*
+	 * It is possible that we are told to put something in the freelist that
+	 * is already in it; don't screw up the list if so.
+	 */
+	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
+	{
+		++StrategyControl->numFreeListBuffers;
+		freed = true;
+		/*
+		 * Put the buffer at the end of the list; if the list was empty,
+		 * point both the first and last free buffer at this buffer's id.
+		 */
+		buf->freeNext = FREENEXT_END_OF_LIST;
+		if (StrategyControl->firstFreeBuffer < 0)
+		{
+			StrategyControl->firstFreeBuffer = buf->buf_id;
+			StrategyControl->lastFreeBuffer = buf->buf_id;
+			SpinLockRelease(&StrategyControl->freelist_lck);
+			return freed;
+		}
+		BufferDescriptors[StrategyControl->lastFreeBuffer].freeNext = buf->buf_id;
+		StrategyControl->lastFreeBuffer = buf->buf_id;
+	}
+	SpinLockRelease(&StrategyControl->freelist_lck);
+
+	return freed;
+}
+
+/*
  * StrategySyncStart -- tell BufferSync where to start syncing
  *
  * The result is the buffer index of the best buffer to sync first.
@@ -274,20 +400,79 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 {
 	int			result;
 
-	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&StrategyControl->victimbuf_lck);
 	result = StrategyControl->nextVictimBuffer;
+
 	if (complete_passes)
 		*complete_passes = StrategyControl->completePasses;
+	SpinLockRelease(&StrategyControl->victimbuf_lck);
+
 	if (num_buf_alloc)
 	{
+		SpinLockAcquire(&StrategyControl->freelist_lck);
 		*num_buf_alloc = StrategyControl->numBufferAllocs;
 		StrategyControl->numBufferAllocs = 0;
+		SpinLockRelease(&StrategyControl->freelist_lck);
 	}
-	LWLockRelease(BufFreelistLock);
 	return result;
 }
 
 /*
+ * StrategySyncStartAndEnd -- tell bgreclaimer where to start looking
+ * for unused buffers.
+ *
+ * Returns, via out parameters, the buffer index at which to start looking
+ * for unused buffers, the number of buffers that still need to be moved to
+ * the freelist to reach its high water mark, and counts of recent buffer
+ * allocations and of allocations not satisfied from the freelist.
+ */
+void
+StrategySyncStartAndEnd(uint32 *start, uint32 *end, uint32 *num_buf_alloc,
+						uint32 *num_buf_backend_clocksweep)
+{
+	int			curfreebuffers;
+
+	SpinLockAcquire(&StrategyControl->victimbuf_lck);
+	*start = StrategyControl->nextVictimBuffer;
+	SpinLockRelease(&StrategyControl->victimbuf_lck);
+
+	SpinLockAcquire(&StrategyControl->freelist_lck);
+	curfreebuffers = StrategyControl->numFreeListBuffers;
+	if (curfreebuffers < freelistHighWaterMark)
+		*end = freelistHighWaterMark - curfreebuffers;
+	else
+		*end = 0;
+
+	if (num_buf_alloc)
+	{
+		*num_buf_alloc = StrategyControl->numBufferAllocs;
+		StrategyControl->numBufferAllocs = 0;
+	}
+	if (num_buf_backend_clocksweep)
+	{
+		*num_buf_backend_clocksweep = StrategyControl->numBufferBackendClocksweep;
+		StrategyControl->numBufferBackendClocksweep = 0;
+	}
+	SpinLockRelease(&StrategyControl->freelist_lck);
+
+	return;
+}
+
+/*
+ * StrategySyncNextVictimBuffer -- advance the clock-sweep hand and report,
+ * via out parameter, the next buffer for bgreclaimer to examine.
+ */
+void
+StrategySyncNextVictimBuffer(volatile uint32 *next_victim_buffer)
+{
+	SpinLockAcquire(&StrategyControl->victimbuf_lck);
+	if (++StrategyControl->nextVictimBuffer >= NBuffers)
+		StrategyControl->nextVictimBuffer = 0;
+	*next_victim_buffer = StrategyControl->nextVictimBuffer;
+	SpinLockRelease(&StrategyControl->victimbuf_lck);
+}
+
+/*
  * StrategyNotifyBgWriter -- set or clear allocation notification latch
  *
  * If bgwriterLatch isn't NULL, the next invocation of StrategyGetBuffer will
@@ -299,15 +484,27 @@ void
 StrategyNotifyBgWriter(Latch *bgwriterLatch)
 {
 	/*
-	 * We acquire the BufFreelistLock just to ensure that the store appears
+	 * We acquire the freelist_lck just to ensure that the store appears
 	 * atomic to StrategyGetBuffer.  The bgwriter should call this rather
 	 * infrequently, so there's no performance penalty from being safe.
 	 */
-	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&StrategyControl->freelist_lck);
 	StrategyControl->bgwriterLatch = bgwriterLatch;
-	LWLockRelease(BufFreelistLock);
+	SpinLockRelease(&StrategyControl->freelist_lck);
 }
 
+/*
+ * StrategyInitBgReclaimerLatch -- register the bgreclaimer's latch.
+ * Backends set this latch to wake the bgreclaimer when the freelist
+ * needs refilling.
+ */
+void
+StrategyInitBgReclaimerLatch(Latch *bgreclaimerLatch)
+{
+	SpinLockAcquire(&StrategyControl->freelist_lck);
+	StrategyControl->bgreclaimerLatch = bgreclaimerLatch;
+	SpinLockRelease(&StrategyControl->freelist_lck);
+}
 
 /*
  * StrategyShmemSize
@@ -376,6 +573,7 @@ StrategyInitialize(bool init)
 		 */
 		StrategyControl->firstFreeBuffer = 0;
 		StrategyControl->lastFreeBuffer = NBuffers - 1;
+		StrategyControl->numFreeListBuffers = NBuffers;
 
 		/* Initialize the clock sweep pointer */
 		StrategyControl->nextVictimBuffer = 0;
@@ -383,12 +581,35 @@ StrategyInitialize(bool init)
 		/* Clear statistics */
 		StrategyControl->completePasses = 0;
 		StrategyControl->numBufferAllocs = 0;
+		StrategyControl->numBufferBackendClocksweep = 0;
 
 		/* No pending notification */
 		StrategyControl->bgwriterLatch = NULL;
+		StrategyControl->bgreclaimerLatch = NULL;
+		SpinLockInit(&StrategyControl->freelist_lck);
+		SpinLockInit(&StrategyControl->victimbuf_lck);
 	}
 	else
 		Assert(!init);
+
+	/*
+	 * Initialize the low and high water marks for the freelist.  These are
+	 * used to keep enough buffers on the freelist that backends can usually
+	 * satisfy allocation requests from it instead of running the clock
+	 * sweep.  If the computed high water mark comes out above 2000 or below
+	 * 5, it is clamped to those hard-coded limits.  These numbers are based
+	 * on benchmark results across various workloads.
+	 */
+	freelistHighWaterMark = HIGH_WATER_MARK_FREELIST_BUFFERS_PERCENT * NBuffers;
+	if (freelistHighWaterMark < 5)
+		freelistHighWaterMark = 5;
+	else if (freelistHighWaterMark > 2000)
+		freelistHighWaterMark = 2000;
+
+	freelistLowWaterMark = LOW_WATER_MARK_FREELIST_BUFFERS_PERCENT *
+						   freelistHighWaterMark;
 }
 
 
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 44ccd37..00d815f 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -100,6 +100,7 @@ extern Datum pg_stat_get_bgwriter_stat_reset_time(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_buf_written_backend(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_buf_fsync_backend(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_buf_alloc(PG_FUNCTION_ARGS);
+extern Datum pg_stat_get_buf_clocksweep_backend(PG_FUNCTION_ARGS);
 
 extern Datum pg_stat_get_xact_numscans(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_xact_tuples_returned(PG_FUNCTION_ARGS);
@@ -1496,6 +1497,12 @@ pg_stat_get_buf_alloc(PG_FUNCTION_ARGS)
 }
 
 Datum
+pg_stat_get_buf_clocksweep_backend(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_INT64(pgstat_fetch_global()->buf_backend_clocksweep);
+}
+
+Datum
 pg_stat_get_xact_numscans(PG_FUNCTION_ARGS)
 {
 	Oid			relid = PG_GETARG_OID(0);
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 5176ed0..1265bb1 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2779,6 +2779,8 @@ DATA(insert OID = 3063 ( pg_stat_get_buf_fsync_backend PGNSP PGUID 12 1 0 0 0 f
 DESCR("statistics: number of backend buffer writes that did their own fsync");
 DATA(insert OID = 2859 ( pg_stat_get_buf_alloc			PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 20 "" _null_ _null_ _null_ _null_ pg_stat_get_buf_alloc _null_ _null_ _null_ ));
 DESCR("statistics: number of buffer allocations");
+DATA(insert OID = 3218 ( pg_stat_get_buf_clocksweep_backend			PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 20 "" _null_ _null_ _null_ _null_ pg_stat_get_buf_clocksweep_backend _null_ _null_ _null_ ));
+DESCR("statistics: number of buffer allocations not satisfied from freelsit");
 
 DATA(insert OID = 2978 (  pg_stat_get_function_calls		PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_function_calls _null_ _null_ _null_ ));
 DESCR("statistics: number of function calls");
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 3807955..8e58fb4 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -366,6 +366,7 @@ typedef enum
 	CheckpointerProcess,
 	WalWriterProcess,
 	WalReceiverProcess,
+	BgReclaimerProcess,
 
 	NUM_AUXPROCTYPES			/* Must be last! */
 } AuxProcType;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 0892533..51a2023 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -397,6 +397,7 @@ typedef struct PgStat_MsgBgWriter
 	PgStat_Counter m_buf_written_backend;
 	PgStat_Counter m_buf_fsync_backend;
 	PgStat_Counter m_buf_alloc;
+	PgStat_Counter m_buf_backend_clocksweep;
 	PgStat_Counter m_checkpoint_write_time;		/* times in milliseconds */
 	PgStat_Counter m_checkpoint_sync_time;
 } PgStat_MsgBgWriter;
@@ -545,7 +546,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9C
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9D
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -670,6 +671,7 @@ typedef struct PgStat_GlobalStats
 	PgStat_Counter buf_written_backend;
 	PgStat_Counter buf_fsync_backend;
 	PgStat_Counter buf_alloc;
+	PgStat_Counter buf_backend_clocksweep;
 	TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
diff --git a/src/include/postmaster/bgreclaimer.h b/src/include/postmaster/bgreclaimer.h
new file mode 100644
index 0000000..bbd6943
--- /dev/null
+++ b/src/include/postmaster/bgreclaimer.h
@@ -0,0 +1,18 @@
+/*-------------------------------------------------------------------------
+ *
+ * bgreclaimer.h
+ *	  POSTGRES buffer reclaimer definitions.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ *
+ * src/include/postmaster/bgreclaimer.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _BGRECLAIMER_H
+#define _BGRECLAIMER_H
+
+extern void BackgroundReclaimerMain(void) __attribute__((noreturn));
+
+
+#endif   /* _BGRECLAIMER_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index c019013..5c30ec7 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -115,9 +115,8 @@ typedef struct buftag
  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
  * changes after initialization, so does not need locking.  freeNext is
- * protected by the BufFreelistLock not buf_hdr_lock.  The LWLocks can take
- * care of themselves.  The buf_hdr_lock is *not* used to control access to
- * the data in the buffer!
+ * protected by the freelist_lck not buf_hdr_lock.  The buf_hdr_lock is
+ * *not* used to control access to the data in the buffer!
  *
  * An exception is that if we have the buffer pinned, its tag can't change
  * underneath us, so we can examine the tag without locking the spinlock.
@@ -185,14 +184,19 @@ extern BufferDesc *LocalBufferDescriptors;
  */
 
 /* freelist.c */
-extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
-				  bool *lock_held);
+extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
 extern void StrategyFreeBuffer(volatile BufferDesc *buf);
+extern bool StrategyMoveBufferToFreeListEnd(volatile BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 					 volatile BufferDesc *buf);
 
 extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
+extern void StrategySyncStartAndEnd(uint32 *start, uint32 *end,
+									uint32 *num_buf_alloc,
+									uint32 *num_buf_backend_clocksweep);
+extern void StrategySyncNextVictimBuffer(volatile uint32 *next_victim_buffer);
 extern void StrategyNotifyBgWriter(Latch *bgwriterLatch);
+extern void StrategyInitBgReclaimerLatch(Latch *bgreclaimerLatch);
 
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 42d9120..da4f837 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -200,6 +200,7 @@ extern void AbortBufferIO(void);
 
 extern void BufmgrCommit(void);
 extern bool BgBufferSync(void);
+extern void BgMoveBuffersToFreelist(void);
 
 extern void AtProcExit_LocalBuffers(void);
 
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 1d90b9f..754a838 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -89,7 +89,6 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
  * if you remove a lock, consider leaving a gap in the numbering sequence for
  * the benefit of DTrace and other external debugging scripts.
  */
-#define BufFreelistLock				(&MainLWLockArray[0].lock)
 #define ShmemIndexLock				(&MainLWLockArray[1].lock)
 #define OidGenLock					(&MainLWLockArray[2].lock)
 #define XidGenLock					(&MainLWLockArray[3].lock)
@@ -136,7 +135,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
  */
 
 /* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS  16
+#define NUM_BUFFER_PARTITIONS  128
 
 /* Number of partitions the shared lock tables are divided into */
 #define LOG2_NUM_LOCK_PARTITIONS  4
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index c23f4da..b0688a8 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -215,11 +215,12 @@ extern PGPROC *PreparedXactProcs;
  * We set aside some extra PGPROC structures for auxiliary processes,
  * ie things that aren't full-fledged backends but need shmem access.
  *
- * Background writer, checkpointer and WAL writer run during normal operation.
- * Startup process and WAL receiver also consume 2 slots, but WAL writer is
- * launched only after startup has exited, so we only need 4 slots.
+ * Background writer, background reclaimer, checkpointer and WAL writer run
+ * during normal operation.  Startup process and WAL receiver also consume 2
+ * slots, but WAL writer is launched only after startup has exited, so we only
+ * need 5 slots.
  */
-#define NUM_AUXILIARY_PROCS		4
+#define NUM_AUXILIARY_PROCS		5
 
 
 /* configurable options */
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index ca56b47..24ac85a 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1671,6 +1671,7 @@ pg_stat_bgwriter| SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints
     pg_stat_get_buf_written_backend() AS buffers_backend,
     pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
     pg_stat_get_buf_alloc() AS buffers_alloc,
+    pg_stat_get_buf_clocksweep_backend() AS buffers_backend_clocksweep,
     pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 pg_stat_database| SELECT d.oid AS datid,
     d.datname,
