A few years ago I started working on a concurrent hash table for
PostgreSQL.  The hash table part of it worked, but I never really did
anything with it.  Amit mentioned to me earlier this week that
he was seeing contention inside the dynahash machinery, which inspired
me to go back and update the patch.  I took the basic infrastructure
from before and used it to replace the buffer table.  Patch is
attached.

The key idea here is that lookups are done without any locks, only
memory barriers; and inserts and deletes are done using atomic ops.
The algorithm is not strictly lock-free for reasons explained in the
comments in chash.c, but it's a lot less lock-heavy than what we have
now, so in theory that ought to be a good thing.
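
To give a rough idea of the shape of the thing, here's a simplified
sketch (not the actual API from the patch -- it ignores the ordered
bucket chains, delete-marking, hazard pointers, and garbage lists that
chash.c actually uses): a lookup just chases next-pointers with only a
data-dependency barrier, and an insert links in a new node with
compare-and-swap, retrying if somebody else got there first.

/* Simplified sketch only; assumes postgres.h and storage/barrier.h. */
typedef struct sketch_node
{
	struct sketch_node *next;	/* next node in this bucket's chain */
	uint32		key;
	uint32		val;
} sketch_node;

static sketch_node *
sketch_lookup(sketch_node **bucket, uint32 key)
{
	sketch_node *n = *bucket;

	while (n != NULL)
	{
		/* order the dependent load before dereferencing the node */
		pg_read_barrier_depends();
		if (n->key == key)
			return n;
		n = n->next;
	}
	return NULL;
}

static void
sketch_insert(sketch_node **bucket, sketch_node *new_node)
{
	sketch_node *head;

	/* push onto the head of the chain; retry if the head moved */
	do
	{
		head = *bucket;
		new_node->next = head;
	} while (!__sync_bool_compare_and_swap(bucket, head, new_node));
}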

I haven't had time to do much performance testing yet, but it looks
like this may be slower at low client counts and faster at high client
counts.  However, my results weren't very reproducible, and I haven't
done comprehensive testing yet.  What was really bizarre is that I
couldn't pin down the cause of the slowness at low client
counts; a quick perf profile showed overhead concentrated in
CHashBucketScan, basically memory access latency for walking the
bucket chain.  But the table is built to have a load factor of 1, so I
can't see why that should amount to much, or why it should be
significantly worse than for dynahash.

This patch contains assorted leftovers and is grotty in various ways,
but I'm sharing it anyway just to get it out there.

git branch also available at:
http://git.postgresql.org/gitweb/?p=users/rhaas/postgres.git;a=shortlog;h=refs/heads/chash2014

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/contrib/Makefile b/contrib/Makefile
index b37d0dd..0b91ac1 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -20,6 +20,7 @@ SUBDIRS = \
 		earthdistance	\
 		file_fdw	\
 		fuzzystrmatch	\
+		hashtest	\
 		hstore		\
 		intagg		\
 		intarray	\
diff --git a/contrib/hashtest/Makefile b/contrib/hashtest/Makefile
new file mode 100644
index 0000000..3ee42f8
--- /dev/null
+++ b/contrib/hashtest/Makefile
@@ -0,0 +1,18 @@
+# contrib/hashtest/Makefile
+
+MODULE_big = hashtest
+OBJS = hashtest.o
+
+EXTENSION = hashtest
+DATA = hashtest--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/hashtest
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/hashtest/hashtest--1.0.sql b/contrib/hashtest/hashtest--1.0.sql
new file mode 100644
index 0000000..e271baf
--- /dev/null
+++ b/contrib/hashtest/hashtest--1.0.sql
@@ -0,0 +1,52 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION hashtest" to load this file. \quit
+
+CREATE FUNCTION chash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_collision_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_collision_test'
+LANGUAGE C;
diff --git a/contrib/hashtest/hashtest.c b/contrib/hashtest/hashtest.c
new file mode 100644
index 0000000..172a5bb
--- /dev/null
+++ b/contrib/hashtest/hashtest.c
@@ -0,0 +1,527 @@
+/*-------------------------------------------------------------------------
+ * hashtest.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "libpq/auth.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/ipc.h"
+#include "utils/chash.h"
+
+PG_MODULE_MAGIC;
+
+void		_PG_init(void);
+Datum		chash_insert_test(PG_FUNCTION_ARGS);
+Datum		chash_search_test(PG_FUNCTION_ARGS);
+Datum		chash_delete_test(PG_FUNCTION_ARGS);
+Datum		chash_concurrent_test(PG_FUNCTION_ARGS);
+Datum		chash_collision_test(PG_FUNCTION_ARGS);
+Datum		dynahash_insert_test(PG_FUNCTION_ARGS);
+Datum		dynahash_search_test(PG_FUNCTION_ARGS);
+Datum		dynahash_delete_test(PG_FUNCTION_ARGS);
+Datum		dynahash_concurrent_test(PG_FUNCTION_ARGS);
+Datum		dynahash_collision_test(PG_FUNCTION_ARGS);
+static void hashtest_shmem_startup(void);
+
+PG_FUNCTION_INFO_V1(chash_insert_test);
+PG_FUNCTION_INFO_V1(chash_search_test);
+PG_FUNCTION_INFO_V1(chash_delete_test);
+PG_FUNCTION_INFO_V1(chash_concurrent_test);
+PG_FUNCTION_INFO_V1(chash_collision_test);
+PG_FUNCTION_INFO_V1(dynahash_insert_test);
+PG_FUNCTION_INFO_V1(dynahash_search_test);
+PG_FUNCTION_INFO_V1(dynahash_delete_test);
+PG_FUNCTION_INFO_V1(dynahash_concurrent_test);
+PG_FUNCTION_INFO_V1(dynahash_collision_test);
+
+typedef struct
+{
+	uint32	key;
+	uint32	val;
+} hentry;
+
+static CHashDescriptor cdesc = {
+	"hashtest-chash",	/* name */
+	1048576,			/* capacity */
+	sizeof(hentry),		/* element size */
+	sizeof(uint32)		/* key size */
+};
+
+#define DYNAHASH_PARTITIONS		16
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+static CHashTable chash;
+static HTAB *dynahash;
+static LWLockId dynahash_lock[DYNAHASH_PARTITIONS];
+static ClientAuthentication_hook_type original_client_auth_hook = NULL;
+
+static void hashtest_client_auth_hook(Port *port, int status);
+static void chash_write_stats_to_log(int code, Datum dummy);
+
+#define dynahash_get_lock(hashcode) \
+	(dynahash_lock[(hashcode) % DYNAHASH_PARTITIONS])
+
+void
+_PG_init(void)
+{
+	Size	cs;
+	Size	ds;
+
+	if (!process_shared_preload_libraries_in_progress)
+		return;
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = hashtest_shmem_startup;
+	chash = CHashBootstrap(&cdesc);
+	cs = CHashEstimateSize(chash);
+	RequestAddinShmemSpace(cs);
+	ds = hash_estimate_size(cdesc.capacity, cdesc.element_size);
+	RequestAddinShmemSpace(ds);
+	elog(LOG, "chash: %u bytes; dynahash: %u bytes", (unsigned) cs,
+		 (unsigned) ds);
+	RequestAddinLWLocks(DYNAHASH_PARTITIONS);
+	original_client_auth_hook = ClientAuthentication_hook;
+	ClientAuthentication_hook = hashtest_client_auth_hook;
+	
+}
+
+static void
+hashtest_client_auth_hook(Port *port, int status)
+{
+	if (original_client_auth_hook)
+		original_client_auth_hook(port, status);
+	on_proc_exit(chash_write_stats_to_log, (Datum) 0);
+}
+
+static void
+chash_write_stats_to_log(int code, Datum dummy)
+{
+	uint64	stats[CHS_NumberOfStatistics];
+	CHashStatisticsType i;
+	StringInfoData	buf;
+
+	CHashStatistics(chash, stats);
+	initStringInfo(&buf);
+
+	for (i = 0; i < CHS_NumberOfStatistics; ++i)
+	{
+		if (stats[i] == 0)
+			continue;
+		appendStringInfo(&buf, UINT64_FORMAT " %s; ", stats[i],
+						 CHashStatisticsNames[i]);
+	}
+
+	if (buf.len > 1)
+	{
+		buf.data[buf.len-2] = '\0';
+		elog(LOG, "chash statistics: %s", buf.data);
+	}
+}
+
+static void
+hashtest_shmem_startup(void)
+{
+	HASHCTL		info;
+	uint32		i;
+
+	if (prev_shmem_startup_hook)
+		prev_shmem_startup_hook();
+
+	/* Initialize concurrent hash table. */
+	chash = CHashInitialize(chash, &cdesc);
+
+	/* Initialize shared dynahash table. */
+	info.keysize = cdesc.key_size;
+	info.entrysize = cdesc.element_size;
+	info.hash = tag_hash;
+	info.num_partitions = DYNAHASH_PARTITIONS;
+
+	dynahash = ShmemInitHash("hashtest-dynahash",
+							 cdesc.capacity, cdesc.capacity,
+							 &info,
+							 HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+	for (i = 0; i < DYNAHASH_PARTITIONS; ++i)
+		dynahash_lock[i] = LWLockAssign();
+}
+
+Datum
+chash_insert_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+	hentry	e;
+
+	for (i = 0; i < 1000000; ++i)
+	{
+		bool ok;
+
+		e.key = i;
+		e.val = i * 31;
+		ok = CHashInsert(chash, &e);
+		if (!ok)
+			elog(LOG, "insert %u: failed", i);
+		ok = CHashInsert(chash, &e);
+		if (ok)
+			elog(LOG, "insert %u: worked twice", i);
+	}
+
+	PG_RETURN_VOID();
+}
+
+Datum
+chash_search_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+	hentry	e;
+
+	for (i = 0; i < 1000000; ++i)
+	{
+		bool ok;
+
+		e.key = i;
+		ok = CHashSearch(chash, &e);
+		if (!ok)
+			elog(LOG, "search %u: not found", i);
+		else if (e.val != e.key * 31)
+			elog(LOG, "search %u: found %u", i, e.val);
+	}
+
+	PG_RETURN_VOID();
+}
+
+Datum
+chash_delete_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+	hentry	e;
+
+	for (i = 0; i < 1000000; ++i)
+	{
+		bool ok;
+
+		e.key = i;
+		ok = CHashDelete(chash, &e);
+		if (!ok)
+			elog(LOG, "delete %u: not found", i);
+		ok = CHashDelete(chash, &e);
+		if (ok)
+			elog(LOG, "delete %u: found twice", i);
+	}
+
+	PG_RETURN_VOID();
+}
+
+Datum
+chash_concurrent_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+	hentry	e;
+	uint32	seed = MyProcPid << 16;
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		e.key = seed | i;
+		e.val = MyProcPid;
+		ok = CHashInsert(chash, &e);
+		if (!ok)
+			elog(LOG, "insert %u: found", i);
+	}
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		e.key = seed | i;
+		e.val = 0;
+		ok = CHashSearch(chash, &e);
+		if (!ok)
+		{
+			uint64	retry = 1;
+			elog(LOG, "search %u: not found", i);
+			while (!CHashSearch(chash, &e))
+				++retry;
+			elog(LOG, "search %u: eventually found it after "
+				UINT64_FORMAT " retries", i, retry);
+		}
+		if (e.val != MyProcPid)
+			elog(LOG, "search %u: expected %u found %u", i, (unsigned) MyProcPid, e.val);
+	}
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		e.key = seed | i;
+		ok = CHashDelete(chash, &e);
+		if (!ok)
+		{
+			uint64	retry = 1;
+			elog(LOG, "delete %u: not found", i);
+			while (!CHashDelete(chash, &e))
+				++retry;
+			elog(LOG, "delete %u: eventually deleted it after "
+				UINT64_FORMAT " retries", i, retry);
+		}
+	}
+
+	PG_RETURN_VOID();
+}
+
+Datum
+chash_collision_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+	hentry	e;
+
+	/* Don't stack-allocate this. */
+	static bool mine[10000];
+
+	memset(mine, 0, 10000 * sizeof(bool));
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		e.key = i;
+		e.val = MyProcPid;
+		ok = CHashInsert(chash, &e);
+		if (ok)
+			mine[i] = true;
+	}
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		if (!mine[i])
+			continue;
+		e.key = i;
+		ok = CHashSearch(chash, &e);
+		if (!ok)
+			elog(LOG, "search %u: not found", i);
+		else if (e.val != MyProcPid)
+			elog(LOG, "search %u: expected %u found %u",
+				 i, (unsigned) MyProcPid, e.val);
+		ok = CHashDelete(chash, &e);
+		if (!ok)
+			elog(LOG, "delete %u: not found", i);
+	}
+
+	PG_RETURN_VOID();
+}
+
+static bool
+dynahash_insert(uint32 key, uint32 val)
+{
+	bool	found;
+	uint32	hashcode;
+	hentry *e;
+	LWLockId	lockid;
+
+	hashcode = get_hash_value(dynahash, (void *) &key);
+ 	lockid = dynahash_get_lock(hashcode);
+	LWLockAcquire(lockid, LW_EXCLUSIVE);
+	e = hash_search_with_hash_value(dynahash, (void *) &key,
+									hashcode, HASH_ENTER, &found);
+	if (!found)
+		e->val = val;
+	LWLockRelease(lockid);
+
+	return !found;
+}
+
+static bool
+dynahash_search(uint32 key, uint32 *val)
+{
+	uint32	hashcode;
+	hentry *e;
+	LWLockId	lockid;
+
+	hashcode = get_hash_value(dynahash, (void *) &key);
+ 	lockid = dynahash_get_lock(hashcode);
+	LWLockAcquire(lockid, LW_SHARED);
+	e = hash_search_with_hash_value(dynahash, (void *) &key,
+									hashcode, HASH_FIND, NULL);
+	if (e)
+		*val = e->val;
+	LWLockRelease(lockid);
+
+	return e != NULL;
+}
+
+static bool
+dynahash_delete(uint32 key)
+{
+	uint32	hashcode;
+	hentry *e;
+	LWLockId	lockid;
+
+	hashcode = get_hash_value(dynahash, (void *) &key);
+ 	lockid = dynahash_get_lock(hashcode);
+	LWLockAcquire(lockid, LW_EXCLUSIVE);
+	e = hash_search_with_hash_value(dynahash, (void *) &key,
+									hashcode, HASH_REMOVE, NULL);
+	LWLockRelease(lockid);
+
+	return e != NULL;
+}
+
+Datum
+dynahash_insert_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+
+	for (i = 0; i < 1000000; ++i)
+	{
+		bool	ok;
+
+		ok = dynahash_insert(i, i * 31);
+		if (!ok)
+			elog(LOG, "insert %u: failed", i);
+		ok = dynahash_insert(i, i * 31);
+		if (ok)
+			elog(LOG, "insert %u: worked twice", i);
+	}
+
+	PG_RETURN_VOID();
+}
+
+Datum
+dynahash_search_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+
+	for (i = 0; i < 1000000; ++i)
+	{
+		bool	ok;
+		uint32	val;
+
+		ok = dynahash_search(i, &val);
+		if (!ok)
+			elog(LOG, "search %u: not found", i);
+		else if (val != i * 31)
+			elog(LOG, "search %u: found %u", i, val);
+	}
+
+	PG_RETURN_VOID();
+}
+
+Datum
+dynahash_delete_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+
+	for (i = 0; i < 1000000; ++i)
+	{
+		bool	ok;
+
+		ok = dynahash_delete(i);
+		if (!ok)
+			elog(LOG, "delete %u: not found", i);
+		ok = dynahash_delete(i);
+		if (ok)
+			elog(LOG, "delete %u: found twice", i);
+	}
+
+	PG_RETURN_VOID();
+}
+
+Datum
+dynahash_concurrent_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+	uint32	val;
+	uint32	seed = MyProcPid << 16;
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		ok = dynahash_insert(seed | i, MyProcPid);
+		if (!ok)
+			elog(LOG, "insert %u: found", i);
+	}
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		ok = dynahash_search(seed | i, &val);
+		if (!ok)
+		{
+			uint64	retry = 1;
+			elog(LOG, "search %u: not found", i);
+			while (!dynahash_search(seed | i, &val))
+				++retry;
+			elog(LOG, "search %u: eventually found it after "
+				UINT64_FORMAT " retries", i, retry);
+		}
+		if (val != MyProcPid)
+			elog(LOG, "search %u: expected %u found %u",
+				 i, (unsigned) MyProcPid, val);
+	}
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		ok = dynahash_delete(seed | i);
+		if (!ok)
+		{
+			uint64	retry = 1;
+			elog(LOG, "delete %u: not found", i);
+			while (!dynahash_delete(seed | i))
+				++retry;
+			elog(LOG, "delete %u: eventually deleted it after "
+				UINT64_FORMAT " retries", i, retry);
+		}
+	}
+
+	PG_RETURN_VOID();
+}
+
+Datum
+dynahash_collision_test(PG_FUNCTION_ARGS)
+{
+	uint32	i;
+	uint32	val;
+
+	/* Don't stack-allocate this. */
+	static bool mine[10000];
+
+	memset(mine, 0, 10000 * sizeof(bool));
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		ok = dynahash_insert(i, MyProcPid);
+		if (ok)
+			mine[i] = true;
+	}
+
+	for (i = 0; i < 10000; ++i)
+	{
+		bool ok;
+
+		if (!mine[i])
+			continue;
+		ok = dynahash_search(i, &val);
+		if (!ok)
+			elog(LOG, "search %u: not found", i);
+		else if (val != MyProcPid)
+			elog(LOG, "search %u: expected %u found %u",
+				 i, (unsigned) MyProcPid, val);
+		ok = dynahash_delete(i);
+		if (!ok)
+			elog(LOG, "delete %u: not found", i);
+	}
+
+	PG_RETURN_VOID();
+}
diff --git a/contrib/hashtest/hashtest.control b/contrib/hashtest/hashtest.control
new file mode 100644
index 0000000..b8e0f01
--- /dev/null
+++ b/contrib/hashtest/hashtest.control
@@ -0,0 +1,4 @@
+comment = 'hash testing code'
+default_version = '1.0'
+module_pathname = '$libdir/hashtest'
+relocatable = true
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 7a38f2f..092cf8f 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -21,8 +21,10 @@
  */
 #include "postgres.h"
 
+#include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
+#include "utils/chash.h"
 
 
 /* entry for buffer lookup hashtable */
@@ -32,8 +34,13 @@ typedef struct
 	int			id;				/* Associated buffer ID */
 } BufferLookupEnt;
 
-static HTAB *SharedBufHash;
-
+static CHashDescriptor SharedBufDescriptor = {
+	"buffer lookup table",
+	0,
+	sizeof(BufferLookupEnt),
+	sizeof(BufferTag)
+};
+static CHashTable SharedBufHash;
 
 /*
  * Estimate space needed for mapping hashtable
@@ -42,7 +49,13 @@ static HTAB *SharedBufHash;
 Size
 BufTableShmemSize(int size)
 {
-	return hash_estimate_size(size, sizeof(BufferLookupEnt));
+	if (SharedBufHash == NULL)
+	{
+		SharedBufDescriptor.capacity = size;
+		SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
+	}
+
+	return CHashEstimateSize(SharedBufHash);
 }
 
 /*
@@ -52,59 +65,29 @@ BufTableShmemSize(int size)
 void
 InitBufTable(int size)
 {
-	HASHCTL		info;
-
-	/* assume no locking is needed yet */
-
-	/* BufferTag maps to Buffer */
-	info.keysize = sizeof(BufferTag);
-	info.entrysize = sizeof(BufferLookupEnt);
-	info.hash = tag_hash;
-	info.num_partitions = NUM_BUFFER_PARTITIONS;
-
-	SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
-								  size, size,
-								  &info,
-								  HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
-}
-
-/*
- * BufTableHashCode
- *		Compute the hash code associated with a BufferTag
- *
- * This must be passed to the lookup/insert/delete routines along with the
- * tag.  We do it like this because the callers need to know the hash code
- * in order to determine which buffer partition to lock, and we don't want
- * to do the hash computation twice (hash_any is a bit slow).
- */
-uint32
-BufTableHashCode(BufferTag *tagPtr)
-{
-	return get_hash_value(SharedBufHash, (void *) tagPtr);
+	if (SharedBufHash == NULL || !IsUnderPostmaster)
+	{
+		Assert(SharedBufDescriptor.capacity == 0 ||
+			SharedBufDescriptor.capacity == size);
+		SharedBufDescriptor.capacity = size;
+		SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
+	}
 }
 
 /*
  * BufTableLookup
  *		Lookup the given BufferTag; return buffer ID, or -1 if not found
- *
- * Caller must hold at least share lock on BufMappingLock for tag's partition
  */
 int
-BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
+BufTableLookup(BufferTag *tagPtr)
 {
-	BufferLookupEnt *result;
-
-	result = (BufferLookupEnt *)
-		hash_search_with_hash_value(SharedBufHash,
-									(void *) tagPtr,
-									hashcode,
-									HASH_FIND,
-									NULL);
+	BufferLookupEnt ent;
 
-	if (!result)
+	ent.key = *tagPtr;
+	if (!CHashSearch(SharedBufHash, &ent))
 		return -1;
 
-	return result->id;
+	return ent.id;
 }
 
 /*
@@ -118,27 +101,20 @@ BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
  * Caller must hold exclusive lock on BufMappingLock for tag's partition
  */
 int
-BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
+BufTableInsert(BufferTag *tagPtr, int buf_id)
 {
-	BufferLookupEnt *result;
-	bool		found;
+	BufferLookupEnt ent;
+
+	ent.key = *tagPtr;
+	ent.id = buf_id;
 
 	Assert(buf_id >= 0);		/* -1 is reserved for not-in-table */
 	Assert(tagPtr->blockNum != P_NEW);	/* invalid tag */
 
-	result = (BufferLookupEnt *)
-		hash_search_with_hash_value(SharedBufHash,
-									(void *) tagPtr,
-									hashcode,
-									HASH_ENTER,
-									&found);
-
-	if (found)					/* found something already in the table */
-		return result->id;
-
-	result->id = buf_id;
+	if (CHashInsert(SharedBufHash, &ent))
+		return -1;
 
-	return -1;
+	return ent.id;
 }
 
 /*
@@ -148,17 +124,8 @@ BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
  * Caller must hold exclusive lock on BufMappingLock for tag's partition
  */
 void
-BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
+BufTableDelete(BufferTag *tagPtr)
 {
-	BufferLookupEnt *result;
-
-	result = (BufferLookupEnt *)
-		hash_search_with_hash_value(SharedBufHash,
-									(void *) tagPtr,
-									hashcode,
-									HASH_REMOVE,
-									NULL);
-
-	if (!result)				/* shouldn't happen */
+	if (!CHashDelete(SharedBufHash, tagPtr))
 		elog(ERROR, "shared buffer hash table corrupted");
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 45d1d61..437deb9 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -429,22 +429,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 	else
 	{
 		BufferTag	newTag;		/* identity of requested block */
-		uint32		newHash;	/* hash value for newTag */
-		LWLock	   *newPartitionLock;	/* buffer partition lock for it */
 		int			buf_id;
 
 		/* create a tag so we can lookup the buffer */
 		INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
 					   forkNum, blockNum);
 
-		/* determine its hash code and partition lock ID */
-		newHash = BufTableHashCode(&newTag);
-		newPartitionLock = BufMappingPartitionLock(newHash);
-
 		/* see if the block is in the buffer pool already */
-		LWLockAcquire(newPartitionLock, LW_SHARED);
-		buf_id = BufTableLookup(&newTag, newHash);
-		LWLockRelease(newPartitionLock);
+		buf_id = BufTableLookup(&newTag);
 
 		/* If not in buffers, initiate prefetch */
 		if (buf_id < 0)
@@ -822,11 +814,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			bool *foundPtr)
 {
 	BufferTag	newTag;			/* identity of requested block */
-	uint32		newHash;		/* hash value for newTag */
-	LWLock	   *newPartitionLock;		/* buffer partition lock for it */
 	BufferTag	oldTag;			/* previous identity of selected buffer */
-	uint32		oldHash;		/* hash value for oldTag */
-	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
 	BufFlags	oldFlags;
 	int			buf_id;
 	volatile BufferDesc *buf;
@@ -835,29 +823,31 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	/* create a tag so we can lookup the buffer */
 	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 
-	/* determine its hash code and partition lock ID */
-	newHash = BufTableHashCode(&newTag);
-	newPartitionLock = BufMappingPartitionLock(newHash);
-
 	/* see if the block is in the buffer pool already */
-	LWLockAcquire(newPartitionLock, LW_SHARED);
-	buf_id = BufTableLookup(&newTag, newHash);
+start:
+	buf_id = BufTableLookup(&newTag);
 	if (buf_id >= 0)
 	{
+		BufferDesc *foundbuf;
+
 		/*
 		 * Found it.  Now, pin the buffer so no one can steal it from the
-		 * buffer pool, and check to see if the correct data has been loaded
-		 * into the buffer.
+		 * buffer pool.
 		 */
-		buf = &BufferDescriptors[buf_id];
+		foundbuf = &BufferDescriptors[buf_id];
 
-		valid = PinBuffer(buf, strategy);
+		valid = PinBuffer(foundbuf, strategy);
 
-		/* Can release the mapping lock as soon as we've pinned it */
-		LWLockRelease(newPartitionLock);
+		/* Check whether someone recycled the buffer before we pinned it. */
+		if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+		{
+			UnpinBuffer(foundbuf, true);
+			goto start;
+		}
 
 		*foundPtr = TRUE;
 
+		/* Check to see if the correct data has been loaded into the buffer. */
 		if (!valid)
 		{
 			/*
@@ -867,7 +857,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			 * own read attempt if the page is still not BM_VALID.
 			 * StartBufferIO does it all.
 			 */
-			if (StartBufferIO(buf, true))
+			if (StartBufferIO(foundbuf, true))
 			{
 				/*
 				 * If we get here, previous attempts to read the buffer must
@@ -877,15 +867,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			}
 		}
 
-		return buf;
+		return foundbuf;
 	}
 
-	/*
-	 * Didn't find it in the buffer pool.  We'll have to initialize a new
-	 * buffer.  Remember to unlock the mapping lock while doing the work.
-	 */
-	LWLockRelease(newPartitionLock);
-
 	/* Loop here in case we have to try another victim buffer */
 	for (;;)
 	{
@@ -986,42 +970,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 */
 		if (oldFlags & BM_TAG_VALID)
 		{
-			/*
-			 * Need to compute the old tag's hashcode and partition lock ID.
-			 * XXX is it worth storing the hashcode in BufferDesc so we need
-			 * not recompute it here?  Probably not.
-			 */
+			/* Save old tag. */
 			oldTag = buf->tag;
-			oldHash = BufTableHashCode(&oldTag);
-			oldPartitionLock = BufMappingPartitionLock(oldHash);
-
-			/*
-			 * Must lock the lower-numbered partition first to avoid
-			 * deadlocks.
-			 */
-			if (oldPartitionLock < newPartitionLock)
-			{
-				LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			}
-			else if (oldPartitionLock > newPartitionLock)
-			{
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-				LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-			}
-			else
-			{
-				/* only one partition, only one lock */
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			}
-		}
-		else
-		{
-			/* if it wasn't valid, we need only the new partition */
-			LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			/* these just keep the compiler quiet about uninit variables */
-			oldHash = 0;
-			oldPartitionLock = 0;
 		}
 
 		/*
@@ -1031,32 +981,34 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 * Note that we have not yet removed the hashtable entry for the old
 		 * tag.
 		 */
-		buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
+enter:
+		buf_id = BufTableInsert(&newTag, buf->buf_id);
 
 		if (buf_id >= 0)
 		{
+			BufferDesc *foundbuf;
+
 			/*
-			 * Got a collision. Someone has already done what we were about to
-			 * do. We'll just handle this as if it were found in the buffer
-			 * pool in the first place.  First, give up the buffer we were
-			 * planning to use.
+			 * We've got a collision, apparently: it looks like someone else
+			 * did what we were about to do.  We can handle this as if we had
+			 * found the buffer in the pool in the first place, but we must
+			 * recheck the buffer tag after pinning it, because it could still
+			 * get renamed under us.
+		 	 */
+			foundbuf = &BufferDescriptors[buf_id];
+			valid = PinBuffer(foundbuf, strategy);
+			if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+			{
+				UnpinBuffer(foundbuf, true);
+				goto enter;
+			}
+
+			/*
+			 * Collision confirmed.  Give up the buffer we were planning to
+			 * use.
 			 */
 			UnpinBuffer(buf, true);
 
-			/* Can give up that buffer's mapping partition lock now */
-			if ((oldFlags & BM_TAG_VALID) &&
-				oldPartitionLock != newPartitionLock)
-				LWLockRelease(oldPartitionLock);
-
-			/* remaining code should match code at top of routine */
-
-			buf = &BufferDescriptors[buf_id];
-
-			valid = PinBuffer(buf, strategy);
-
-			/* Can release the mapping lock as soon as we've pinned it */
-			LWLockRelease(newPartitionLock);
-
 			*foundPtr = TRUE;
 
 			if (!valid)
@@ -1068,7 +1020,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 				 * then set up our own read attempt if the page is still not
 				 * BM_VALID.  StartBufferIO does it all.
 				 */
-				if (StartBufferIO(buf, true))
+				if (StartBufferIO(foundbuf, true))
 				{
 					/*
 					 * If we get here, previous attempts to read the buffer
@@ -1078,7 +1030,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 				}
 			}
 
-			return buf;
+			return foundbuf;
 		}
 
 		/*
@@ -1097,11 +1049,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			break;
 
 		UnlockBufHdr(buf);
-		BufTableDelete(&newTag, newHash);
-		if ((oldFlags & BM_TAG_VALID) &&
-			oldPartitionLock != newPartitionLock)
-			LWLockRelease(oldPartitionLock);
-		LWLockRelease(newPartitionLock);
+		BufTableDelete(&newTag);
 		UnpinBuffer(buf, true);
 	}
 
@@ -1124,13 +1072,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	UnlockBufHdr(buf);
 
 	if (oldFlags & BM_TAG_VALID)
-	{
-		BufTableDelete(&oldTag, oldHash);
-		if (oldPartitionLock != newPartitionLock)
-			LWLockRelease(oldPartitionLock);
-	}
-
-	LWLockRelease(newPartitionLock);
+		BufTableDelete(&oldTag);
 
 	/*
 	 * Buffer contents are currently invalid.  Try to get the io_in_progress
@@ -1166,42 +1108,11 @@ static void
 InvalidateBuffer(volatile BufferDesc *buf)
 {
 	BufferTag	oldTag;
-	uint32		oldHash;		/* hash value for oldTag */
-	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
 	BufFlags	oldFlags;
 
 	/* Save the original buffer tag before dropping the spinlock */
 	oldTag = buf->tag;
 
-	UnlockBufHdr(buf);
-
-	/*
-	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
-	 * worth storing the hashcode in BufferDesc so we need not recompute it
-	 * here?  Probably not.
-	 */
-	oldHash = BufTableHashCode(&oldTag);
-	oldPartitionLock = BufMappingPartitionLock(oldHash);
-
-retry:
-
-	/*
-	 * Acquire exclusive mapping lock in preparation for changing the buffer's
-	 * association.
-	 */
-	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-
-	/* Re-lock the buffer header */
-	LockBufHdr(buf);
-
-	/* If it's changed while we were waiting for lock, do nothing */
-	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
-	{
-		UnlockBufHdr(buf);
-		LWLockRelease(oldPartitionLock);
-		return;
-	}
-
 	/*
 	 * We assume the only reason for it to be pinned is that someone else is
 	 * flushing the page out.  Wait for them to finish.  (This could be an
@@ -1211,15 +1122,21 @@ retry:
 	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
 	 * be busy-looping here.)
 	 */
-	if (buf->refcount != 0)
+	while (buf->refcount != 0)
 	{
 		UnlockBufHdr(buf);
-		LWLockRelease(oldPartitionLock);
 		/* safety check: should definitely not be our *own* pin */
 		if (GetPrivateRefCount(buf->buf_id) > 0)
 			elog(ERROR, "buffer is pinned in InvalidateBuffer");
 		WaitIO(buf);
-		goto retry;
+		LockBufHdr(buf);
+
+		/* If it's changed while we were waiting for lock, do nothing */
+		if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
+		{
+			UnlockBufHdr(buf);
+			return;
+		}
 	}
 
 	/*
@@ -1237,12 +1154,7 @@ retry:
 	 * Remove the buffer from the lookup hashtable, if it was in there.
 	 */
 	if (oldFlags & BM_TAG_VALID)
-		BufTableDelete(&oldTag, oldHash);
-
-	/*
-	 * Done with mapping lock.
-	 */
-	LWLockRelease(oldPartitionLock);
+		BufTableDelete(&oldTag);
 
 	/*
 	 * Insert the buffer at the head of the list of free buffers.
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 2ea2216..38614a4 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -423,6 +423,29 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
 	return structPtr;
 }
 
+/*
+ * ShmemAttachStruct -- Attach to an existing structure in shared memory.
+ */
+void *
+ShmemAttachStruct(const char *name)
+{
+	ShmemIndexEnt *result;
+	void	   *ptr;
+	bool		found;
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	result = (ShmemIndexEnt *)
+		hash_search(ShmemIndex, name, HASH_FIND, &found);
+	if (!found || result == NULL)
+		elog(ERROR, "shared memory structure %s not found", name);
+	ptr = result->location;
+	Assert(ptr != NULL);
+
+	LWLockRelease(ShmemIndexLock);
+
+	return ptr;
+}
 
 /*
  * Add two Size values, checking for overflow
diff --git a/src/backend/utils/hash/Makefile b/src/backend/utils/hash/Makefile
index 05d347c..5d53382 100644
--- a/src/backend/utils/hash/Makefile
+++ b/src/backend/utils/hash/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/hash
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = dynahash.o hashfn.o
+OBJS = chash.o dynahash.o hashfn.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c
new file mode 100644
index 0000000..0d4dc78
--- /dev/null
+++ b/src/backend/utils/hash/chash.c
@@ -0,0 +1,1075 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.c
+ *	  concurrent hash tables
+ *
+ * A concurrent hash table stores a collection of fixed-size objects.
+ * From the point of view of this module, such objects are merely an
+ * opaque array of bytes, but the caller will typically implement them as
+ * a C "struct".  Some fixed-size, leading portion of each object is
+ * designated as the key, which must be distinct for all objects in the
+ * collection.  Since PostgreSQL's shared memory model does not permit
+ * dynamic shared-memory allocation, we preallocate shared-memory space
+ * for the maximum number of entities which can be stored (plus a few
+ * extra, for reasons that will be further explained below).  This space
+ * is allocated as a single large array called the arena, and we often
+ * refer to entities by their position in the arena rather than via an
+ * ordinary pointer.  This saves a considerable amount of memory, since
+ * most modern architectures are 64-bit and therefore use 8-byte pointers,
+ * while arena offsets can be stored in a 32-bit word.  In fact, we
+ * reserve one bit in each such word as a mark bit, so the maximum size
+ * of the arena is 2^31 elements, a restriction that does not currently
+ * appear to be problematic.  An additional advantage of this representation
+ * is that aligned 32-bit loads and stores are atomic on all architectures
+ * we support, but 64-bit loads and stores are not.
+ *
+ * When an element is inserted, we copy the data from the backend-private
+ * object supplied by the caller into one of these shared-memory entities.
+ * When the hash table is searched, the caller passes a backend-private
+ * entity with just the key filled in; if a matching element is found,
+ * data is copied from the shared memory entity into the non-key portion
+ * of the user-supplied entity.  In this way, clients of this module
+ * never use pointers into shared memory directly.
+ *
+ * As normal, we structure the hash table as an array of buckets, whose
+ * size is always a power of two, so that the low-order bits of the
+ * hash code can be used to select a bucket.  If multiple entities hash
+ * to the same bucket, we use separate chaining: each entity in the
+ * arena has an 8-byte header that stores the 4-byte arena offset of the
+ * next item in the bucket and the hash value of the entity's key.
+ * Bucket chains are maintained in order by ascending hash value and
+ * then by ascending entity key (as per memcmp) so that there is
+ * precisely one legal location at which a given new item can be inserted
+ * into a bucket.
+ *
+ * Items are inserted into buckets using compare-and-swap (CAS).  Thus, this
+ * module cannot be used on architectures where we do not have 4-byte
+ * compare-and-swap.  When an item is deleted, we first set its mark bit,
+ * which is stored within the next-pointer, again using CAS.  Once this
+ * step is completed, the node is logically deleted.  As a cleanup operation, we then
+ * use CAS to modify the next-pointer of the previous node to point to the
+ * node following the deleted node.  Note that, even once this cleanup
+ * operation has been performed, some other backend concurrently walking the
+ * chain might still be visiting the deleted node.  Thus, we must be certain
+ * not to overwrite the deleted node's next-pointer until all concurrent
+ * bucket scans have completed.  This means, in particular, that we cannot
+ * immediately view the deleted node as available for reuse.
+ *
+ * Instead, when a delete-marked node is removed from the bucket chain, it
+ * is added to one of many garbage lists.  There is a many-to-one mapping from
+ * buckets to garbage lists, so that the choice of bucket determines the
+ * garbage list but not vice versa.  Any process which wishes to scan a bucket
+ * must first advertise in shared memory the corresponding garbage list number.
+ * When a backend wishes to move entries from a garbage list to a free list,
+ * it must first wait (by spinning) for any backends scanning that garbage
+ * list to complete their scans.
+ *
+ * Ideally, it would be nice to make this completely lock-free, but because
+ * of the above-described choice of garbage collection algorithm, it currently
+ * isn't.  For an algorithm to be lock-free, it must be possible to suspend
+ * the execution of any number of processes for an arbitrary period of time
+ * without impeding the overall progress of the system.  For this code, that
+ * is true except when garbage collection occurs.  In that case, an insert,
+ * search, or delete operation can obstruct an inserting thread attempting to
+ * perform garbage collection for an unbounded period of time.  The algorithm
+ * could be adapted so as to be completely lock-free, essentially by guaranteeing
+ * that even in the worst case no combination of processes can obstruct garbage
+ * collection to a sufficient degree as to prevent an inserter from finding
+ * an available entry in a hash table containing fewer live elements than its
+ * declared maximum capacity.  However, it's not clear that this is a good
+ * idea, because it would likely slow down operation in the non-contended
+ * case to solve a problem that we hope won't happen anyway.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/hash/chash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "access/hash.h"
+#include "storage/barrier.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/chash.h"
+#include "utils/memutils.h"
+
+/*
+ * CHashPtr represents an offset into the arena, plus a mark bit that is
+ * used to implement concurrent deletion.
+ */
+typedef uint32 CHashPtr;
+#define InvalidCHashPtr				((uint32) -2)
+#define CHashPtrIsInvalid(x)		((x) >= InvalidCHashPtr)
+#define CHashPtrIsMarked(x)			((x) & 1)
+#define CHashPtrGetOffset(x)		((x) >> 1)
+#define CHashPtrMark(x)				((x) | 1)
+#define CHashPtrUnmark(x)			((x) & ~1)
+#define MakeCHashPtr(x)				((x) << 1)
+#define CHashMaxCapacity			CHashPtrGetOffset(InvalidCHashPtr)
+
+/*
+ * Each object stored in the hash table is represented by a CHashNode, which
+ * stores a pointer to the next item in the same bucket, and the exact hash
+ * value of the current item.  Each CHashNode is followed by space for the
+ * item itself.
+ */
+typedef struct
+{
+	CHashPtr		next;			/* arena offset of next element */
+	union
+	{
+		uint32		hashcode;		/* hash(key) */
+		CHashPtr	gcnext;			/* arena offset of next garbage item */
+	} un;
+} CHashNode;
+
+#define SizeOfCHashNode		MAXALIGN(sizeof(CHashNode))
+#define CHashNodeGetItem(x)	(((char *) x) + SizeOfCHashNode)
+
+/*
+ * CHashTableData stores all the information that we need in order to access
+ * a concurrent hash table.  We store one copy of this data in shared memory,
+ * and an additional copy in the private memory of each backend accessing the
+ * table.
+ */
+typedef struct CHashTableData
+{
+	/*
+	 * These fields do not change after initialization.
+	 */
+	CHashDescriptor	desc;			/* descriptor for this hash table */
+	uint32			bucket_mask;	/* # of buckets, minus one */
+	uint8			garbage_shift;	/* log2(# of buckets/# of garbage lists) */
+	uint8			freelist_shift;	/* log2(# of garbage lists/# freelists) */
+	uint16			nfreelists;		/* # of freelists */
+	uint32			arena_limit;	/* # of arena elements */
+	uint32			arena_stride;	/* bytes allocated per arena element */
+	CHashPtr	   *bucket;			/* array with 1 entry per bucket */
+	CHashPtr	   *extra;			/* entries for garbage and free lists */
+	char		   *arena;			/* arena */
+
+	/*
+	 * These fields will be different in each backend; the shared copy is
+	 * irrelevant.
+	 */
+	int				gc_pid;			/* PID that set gc_next */
+	uint32			gc_next;		/* next garbage list to reclaim */
+	uint64			stats[CHS_NumberOfStatistics];	/* statistics */
+} CHashTableData;
+
+/* Compute # of buckets, garbage lists, or free lists. */
+#define CHashTableNBuckets(table) \
+	((table)->bucket_mask + 1)
+#define CHashTableNGarbage(table) \
+	(CHashTableNBuckets((table)) >> (table)->garbage_shift)
+#define CHashTableNFreeLists(table) \
+	((table)->nfreelists)
+
+/*
+ * Garbage lists and free lists are interleaved to reduce cache line
+ * contention on the free lists, so the calculation of where an individual
+ * list is located is a bit complex.
+ */
+#define CHashTableGetGarbageList(table, n) \
+	(&(table)->extra[(n) + ((n) >> (table)->freelist_shift)])
+#define CHashTableGetGarbageByBucket(table, n) \
+	(CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift))
+#define CHashTableGetFreeList(table, n) \
+	(&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)])
+
+/* Access macros for arena nodes. */
+#define CHashTableGetRaw(table, offset) \
+	(AssertMacro((offset) < (table)->arena_limit), \
+	 (CHashNode *) ((table)->arena + (table)->arena_stride * (offset)))
+#define CHashTableGetNode(table, ptr) \
+	(AssertMacro(!CHashPtrIsInvalid(ptr)), \
+	 CHashTableGetRaw((table), CHashPtrGetOffset((ptr))))
+
+/* Statistics macros. */
+#define CHashTableIncrementStatistic(table, stat) \
+	((table)->stats[(stat)]++)
+
+/*
+ * Bucket scan.
+ */
+typedef struct
+{
+	CHashPtr	target;
+	CHashPtr	next;
+	CHashPtr   *pointer_to_target;
+	CHashNode  *target_node;
+	bool		found;
+} CHashScanResult;
+
+/* Human-readable statistics names. */
+char *CHashStatisticsNames[] = {
+	"searches",
+	"searches failed",
+	"inserts",
+	"inserts failed",
+	"inserts retried",
+	"deletions",
+	"deletions failed",
+	"deletions retried",
+	"scan expunges",
+	"scan expunges failed",
+	"scans restarted",
+	"cleanup scans",
+	"allocations failed",
+	"garbage enqueues retried",
+	"garbage dequeues failed",
+	"garbage collections",
+	"garbage collection spins",
+	"garbage collection reclaims skipped",
+	"garbage collection fast reclaims",
+	"garbage collection reclaims retried",
+	"<end>"
+};
+
+/* Function prototypes. */
+static CHashPtr CHashAllocate(CHashTable table);
+static CHashPtr CHashAllocateViaGC(CHashTable table);
+static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
+static void CHashBucketScan(CHashTable table,
+				CHashPtr *start,
+				uint32 hashcode,
+				const void *key,
+				CHashScanResult *res);
+
+/*
+ * First stage of CHashTable initialization.  We fill in all the constants
+ * here, but not the pointers.
+ */
+CHashTable
+CHashBootstrap(CHashDescriptor *desc)
+{
+	CHashTable		table;
+	uint32			bucket_shift;
+
+	/* Sanity check. */
+	Assert(!strcmp(CHashStatisticsNames[CHS_NumberOfStatistics], "<end>"));
+
+	/* Allocate table and copy descriptor. */
+	table = MemoryContextAllocZero(TopMemoryContext, sizeof(CHashTableData));
+	memcpy(&table->desc, desc, sizeof(CHashDescriptor)); 
+
+	/* Sanity checks. */
+	if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity)
+		elog(ERROR, "invalid capacity for concurrent hash");
+	if (desc->key_size < 1 || desc->key_size > desc->element_size)
+		elog(ERROR, "invalid key size for concurrent hash");
+
+ 	/*
+	 * The number of buckets must be a power of two.  To avoid (as much as
+	 * possible) having to traverse long bucket chains, we aim for a load
+	 * factor <= 1.0, so this is a pretty simple calculation: we just find the
+	 * smallest power of two greater than or equal to the target capacity.
+	 */
+	bucket_shift = fls(desc->capacity) - 1;
+	table->bucket_mask = (1 << bucket_shift) - 1;
+
+	/*
+	 * We choose to have one garbage list for every 64 hash table buckets
+	 * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in
+	 * total, in which case we still have a minimum of one garbage list.
+	 *
+	 * Increasing the garbage_shift would reduce the likelihood of a backend
+	 * performing garbage collection needing to wait for a backend walking a
+	 * bucket to finish its scan.  On the other hand, decreasing the garbage
+	 * shift would allow more items to be recovered in a single garbage
+	 * collection cycle.  It's not clear what the optimal value is.
+	 */
+	table->garbage_shift = Min(bucket_shift, 6);
+	table->gc_next = 0;
+	table->gc_pid = 0;
+
+	/*
+ 	 * Experimentation reveals that the free list manipulation is both one of
+ 	 * the slowest parts of this algorithm and the most vulnerable to
+ 	 * contention.  Therefore, we want to have as many free lists as possible,
+ 	 * but there's no need to have more than the number of CPU cores, so we
+ 	 * limit the number of freelists to 64.  There might be a benefit in some
+ 	 * larger limit on a really big system, but we'll cap it here pending some
+ 	 * actual test results.  We're also limited to having no more freelists
+ 	 * than we do garbage lists.
+ 	 */
+#define LOG2_MAX_FREELIST 6
+	if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST)
+		table->freelist_shift = 0;
+	else
+		table->freelist_shift =
+			(bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST;
+	table->nfreelists =
+		1 << (bucket_shift - (table->garbage_shift + table->freelist_shift));
+
+	/*
+	 * To make garbage collection efficient, we overallocate.  Normally, we
+	 * overallocate by one-eighth, but if that would be less than 15 elements,
+	 * then we allocate 15 elements instead.  This extra capacity can actually
+	 * be used, but for best performance, it shouldn't be.  It's the caller's
+	 * responsibility to avoid this.
+	 */
+	table->arena_limit = desc->capacity;
+	if (desc->capacity < 120)
+		table->arena_limit += 15;
+	else
+		table->arena_limit += table->arena_limit / 8;
+
+	/* Each arena element must be MAXALIGN'd and include per-node space. */
+	table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size);
+
+	/* Zero out statistics. */
+	memset(table->stats, 0, sizeof(uint64) * CHS_NumberOfStatistics);
+
+	return table;
+}
+
+/*
+ * Estimate shared memory requirements.
+ */
+Size
+CHashEstimateSize(CHashTable table)
+{
+	Size		total_buckets;
+	Size		size;
+	Size		nbuckets = CHashTableNBuckets(table);
+	Size		ngarbage = CHashTableNGarbage(table);
+	Size		nfreelists = CHashTableNFreeLists(table);
+
+	Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0);
+	total_buckets = add_size(nbuckets, ngarbage);
+	total_buckets = add_size(total_buckets, nfreelists);
+
+	size = MAXALIGN(sizeof(CHashTableData));
+	size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets));
+	size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
+
+	return size;
+}
+
+/*
+ * Create a concurrent hash table in shared memory, or attach to an existing
+ * table.
+ */
+CHashTable
+CHashInitialize(CHashTable table, CHashDescriptor *desc)
+{
+	Size	size;
+	bool	found;
+	void   *shmem;
+	uint32	i;
+	uint32	nbuckets;
+	uint32	nfreelists;
+	uint32	ngarbage;
+	uint32	nextra;
+
+	/*
+	 * If we're under the postmaster, this must be the EXEC_BACKEND case where
+	 * we need to attach to an existing shared-memory segment.
+	 */
+	if (IsUnderPostmaster)
+	{
+		void   *shmem;
+
+		Assert(table == NULL);
+		table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+		shmem = ShmemAttachStruct(desc->shmem_name);
+		memcpy(table, shmem, sizeof(CHashTableData));
+		return table;
+	}
+
+	/*
+	 * Otherwise, the hash table should not already exist, and we must
+	 * create it.  But the table should already be bootstrapped, since we
+	 * must previously have computed its size when figuring out our shared
+	 * memory allocation.
+	 */
+	Assert(table != NULL);
+	size = CHashEstimateSize(table);
+	shmem = ShmemInitStruct(table->desc.shmem_name, size, &found);
+	Assert(!found);
+
+	/* Bucket, garbage, and freelist arrays follow table info. */
+	table->bucket = (CHashPtr *)
+		(((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
+	nbuckets = CHashTableNBuckets(table);
+	table->extra = &table->bucket[nbuckets];
+
+	/* Arena follows the various lists. */
+	ngarbage = CHashTableNGarbage(table);
+	nfreelists = CHashTableNFreeLists(table);
+	nextra = ngarbage + nfreelists;
+	table->arena = (void *) (&table->extra[nextra]);
+
+	/* Initialize all three sets of lists to empty. */
+	for (i = 0; i < nbuckets; ++i)
+		table->bucket[i] = InvalidCHashPtr;
+	for (i = 0; i < nextra; ++i)
+		table->extra[i] = InvalidCHashPtr;
+
+	/* Put all arena elements on the free lists. */
+	for (i = 0; i < table->arena_limit; ++i)
+	{
+		CHashPtr   *f = CHashTableGetFreeList(table, i % nfreelists);
+		CHashNode  *n = CHashTableGetRaw(table, i);
+
+		n->un.gcnext = *f;
+		*f = MakeCHashPtr(i);
+	}
+
+	/*
+	 * Copy table (with pointers now filled in) to shared memory.  This is
+	 * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway.
+	 */
+	memcpy(shmem, table, sizeof(CHashTableData));
+
+	return table;
+}
+
+/*
+ * Search a concurrent hash table.  entry should be a block of memory large
+ * enough to hold a complete entry, with just the key portion filled in.  If
+ * a matching entry is found, this function will fill in the rest of the entry
+ * from the data in the hash table and return true.  If not, it will return
+ * false.
+ */
+bool
+CHashSearch(CHashTable table, void *entry)
+{
+	uint32	hashcode = hash_any(entry, table->desc.key_size);
+	uint32	bucket = hashcode & table->bucket_mask;
+	CHashPtr	   *b = &table->bucket[bucket];
+	CHashScanResult	scan;
+
+	/* Prevent garbage collection for this bucket. */
+	Assert(MyProc->hazard[0] == NULL);
+	MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+	pg_memory_barrier();
+
+	/* Scan bucket and return data from any matching entry. */
+	CHashBucketScan(table, b, hashcode, entry, &scan);
+	if (scan.found)
+		memcpy(((char *) entry) + table->desc.key_size,
+			   CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+			   table->desc.element_size - table->desc.key_size);
+
+	/* Allow garbage collection for this bucket. */
+	Assert(MyProc->hazard[0] != NULL);
+	pg_memory_barrier();
+	MyProc->hazard[0] = NULL;
+
+	CHashTableIncrementStatistic(table, CHS_Search);
+	if (!scan.found)
+		CHashTableIncrementStatistic(table, CHS_Search_Failed);
+	return scan.found;
+}
+
+/*
+ * Insert into a concurrent hash table.  entry should be the complete entry
+ * to be inserted.  If no duplicate entry is found in the table, this function
+ * will insert the entry and return true.  Otherwise, the duplicate entry's
+ * value will be copied into the supplied entry and the function will return
+ * false.
+ *
+ * The caller is responsible for ensuring that no inserts are performed into
+ * a hash table which is at capacity.  The behavior of such an insert is
+ * undefined (the actual behavior is that the insert may either succeed,
+ * degrading performance; or CHashAllocate may enter a tight loop until such
+ * time as an element is deleted).
+ */
+bool
+CHashInsert(CHashTable table, void *entry)
+{
+	uint32	hashcode = hash_any(entry, table->desc.key_size);
+	uint32	bucket = hashcode & table->bucket_mask;
+	CHashPtr	   *b = &table->bucket[bucket];
+	CHashPtr	new;
+	CHashNode  *nnew;
+	CHashScanResult	scan;
+
+	/*
+	 * Allocate and initialize a new entry, on the assumption that the insert
+	 * will succeed.  If it ends up failing, we must be sure to put this back
+	 * on some free list, lest it be permanently leaked.
+	 */
+	new = CHashAllocate(table);
+	nnew = CHashTableGetNode(table, new);
+	nnew->un.hashcode = hashcode;
+	memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
+
+	/* Prevent garbage collection for this bucket. */
+	MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+	pg_memory_barrier();
+
+	/*
+	 * Scan the bucket.  If we don't find a match, use compare-and-swap to
+	 * insert the new node at the insert position.  If we do find a match,
+	 * return the data to the caller.
+	 */
+retry:
+	CHashBucketScan(table, b, hashcode, entry, &scan);
+	if (scan.found)
+		memcpy(((char *) entry) + table->desc.key_size,
+			   CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+			   table->desc.element_size - table->desc.key_size);
+	else
+	{
+		/*
+		 * We didn't find a matching element, so use compare-and-swap to
+		 * attempt to insert the new element we've prepared.  If this fails,
+		 * someone concurrently inserted or deleted an element.  The correct
+		 * insertion point might have changed, or the key we're trying to
+		 * insert might now be present when it wasn't before, so we'll have
+		 * to search the bucket chain anew.
+		 *
+		 * There is a risk of starvation here, but we hope it will not happen
+		 * in practice.  Contention for inserting new elements should be
+		 * spread out pretty much evenly across N+M possible insertion points,
+		 * where N is the number of buckets and M is the number of elements
+		 * in the table.  Even for a quite modestly sized table this is likely
+		 * to exceed the number of CPU cores.
+		 */
+		Assert(!CHashPtrIsMarked(scan.target));
+		nnew->next = scan.target;
+		if (!__sync_bool_compare_and_swap(scan.pointer_to_target,
+										  scan.target, new))
+		{
+			CHashTableIncrementStatistic(table, CHS_Insert_Retry);
+			goto retry;
+		}
+	}
+
+	/* Allow garbage collection for this bucket. */
+	Assert(MyProc->hazard[0] != NULL);
+	pg_memory_barrier();
+	MyProc->hazard[0] = NULL;
+
+	/*
+	 * If the insert failed, add the entry we found to the appropriate
+	 * garbage list.  We can't simply put this back on the freelist,
+	 * because that leads to an ABA problem: some other backend could
+	 * read the head of the freelist, go into the tank, and then use
+	 * CAS to pop an element.  If at that point, it finds the same
+	 * element at the head of the freelist but with a different successor,
+	 * we're hosed.  To prevent that, we ensure that elements are added
+	 * to a given freelist only after verifying that any allocations in
+	 * progress at the time we popped the freelist have completed.  This
+	 * guarantees that any allocation still in progress at the time this
+	 * element makes it back to the freelist is trying to allocate some
+	 * other node.
+	 */
+	CHashTableIncrementStatistic(table, CHS_Insert);
+	if (scan.found)
+	{
+		CHashTableIncrementStatistic(table, CHS_Insert_Failed);
+		CHashAddToGarbage(table, bucket, new);
+	}
+
+	/* The insert succeeded if and only if no duplicate was found. */
+	return !scan.found;
+}
+
+/*
+ * Delete from a concurrent hash table.  entry need only contain the key field.
+ * Returns true if we find and delete a matching key and false otherwise.
+ */
+bool
+CHashDelete(CHashTable table, void *entry)
+{
+	uint32	hashcode = hash_any(entry, table->desc.key_size);
+	uint32	bucket = hashcode & table->bucket_mask;
+	CHashPtr	   *b = &table->bucket[bucket];
+	CHashScanResult	scan;
+
+	/* Prevent garbage collection for this bucket. */
+	Assert(MyProc->hazard[0] == NULL);
+	MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+	pg_memory_barrier();
+
+	/* Scan bucket. */
+retry:
+	CHashBucketScan(table, b, hashcode, entry, &scan);
+
+	/* If we found it, try to delete it. */
+	if (scan.found)
+	{
+		Assert(!CHashPtrIsMarked(scan.next));
+
+		/* Attempt to apply delete-mark. */
+		if (!__sync_bool_compare_and_swap(&scan.target_node->next,
+										  scan.next,
+										  CHashPtrMark(scan.next)))
+		{
+			CHashTableIncrementStatistic(table, CHS_Delete_Retry);
+			goto retry;
+		}
+
+		/* Deletion is done; attempt to remove node from list. */
+		if (__sync_bool_compare_and_swap(scan.pointer_to_target,
+										 scan.target,
+										 scan.next))
+			CHashAddToGarbage(table, bucket, scan.target);
+		else
+		{
+			CHashScanResult	cleanup_scan;
+
+			/*
+			 * If we weren't able to remove the deleted item, rescan
+			 * the bucket to make sure it's really gone.  This is just
+			 * like a regular bucket scan, except that we don't care
+			 * about the results.  We're just doing it to achieve the
+			 * side-effect of removing delete-marked nodes from the
+			 * bucket chain.
+			 */
+			CHashTableIncrementStatistic(table, CHS_Cleanup_Scan);
+			CHashBucketScan(table, b, hashcode, entry, &cleanup_scan);
+		}
+	}
+
+	/* Allow garbage collection for this bucket. */
+	Assert(MyProc->hazard[0] != NULL);
+	pg_memory_barrier();
+	MyProc->hazard[0] = NULL;
+
+	/* We're done. */
+	CHashTableIncrementStatistic(table, CHS_Delete);
+	if (!scan.found)
+		CHashTableIncrementStatistic(table, CHS_Delete_Failed);
+	return scan.found;
+}
+
+/*
+ * Provide backend-local statistics to caller.
+ */
+void
+CHashStatistics(CHashTable table, uint64 *stats)
+{
+	memcpy(stats, &table->stats, sizeof(uint64) * CHS_NumberOfStatistics);
+}
+
+/*
+ * Scan one bucket of a concurrent hash table, storing the results in a
+ * CHashScanResult object provided by the caller.
+ *
+ * Caller must suppress garbage collection for the target bucket before
+ * calling this function, and resume it afterwards.
+ *
+ * On return, res->found will be true if a matching item was found and false
+ * otherwise.  res->target will be a CHashPtr referencing the matching item,
+ * or the first one following the position where the matching item should have
+ * been; res->pointer_to_target will be a pointer to the memory address from
+ * which res->target was read.
+ *
+ * If res->target is not invalid, then res->target_node is a pointer to its
+ * location in memory.  If res->found, then res->next will be the next pointer
+ * of res->target_node; otherwise, it's undefined.
+ */
+static void
+CHashBucketScan(CHashTable table,
+				CHashPtr *start,
+				uint32 hashcode,
+				const void *key,
+				CHashScanResult *res)
+{
+	CHashPtr	target;
+	CHashPtr   *pointer_to_target;
+	CHashNode  *target_node = NULL;
+
+retry:
+	pointer_to_target = start;
+	target = *pointer_to_target;
+	for (;;)
+	{
+		CHashPtr	next;
+		uint32		h;
+		int			cmp;
+
+		/*
+		 * If we've reached the end of the bucket chain, stop; otherwise,
+		 * figure out the actual address of the next item.
+		 */
+		if (CHashPtrIsInvalid(target))
+		{
+			res->found = false;
+			break;
+		}
+		target_node = CHashTableGetNode(table, target);
+
+		/*
+		 * target may have been fetched from an arena entry that could be
+		 * concurrently modified, so a dependency barrier is required before
+		 * dereferencing the derived pointer.
+		 */
+		pg_read_barrier_depends();
+		next = target_node->next;
+
+		/*
+		 * For simplicity, any bucket scan, even if it's servicing a search,
+		 * will attempt to remove delete-marked items from the bucket.  This
+		 * ensures that delete-marked elements are removed from bucket chains
+		 * as quickly as possible and reduces code duplication.  See
+		 * CHashDelete for further comments about why delete-marking is
+		 * necessary and how it allows safe deletion.
+		 */
+		if (CHashPtrIsMarked(next))
+		{
+zap:
+			if (__sync_bool_compare_and_swap(pointer_to_target,
+											 target,
+											 CHashPtrUnmark(next)))
+			{
+				/*
+				 * No one else can possibly have modified target_node->next,
+				 * because such modifications are not allowed after a
+				 * delete-mark has been applied.  Thus, if we just keep
+				 * following the next pointers, we're guaranteed to visit
+				 * all non-deleted items (and possibly some deleted items)
+				 * that were present at the time we began the scan.
+				 */
+				CHashTableIncrementStatistic(table, CHS_Scan_Expunge);
+				CHashAddToGarbage(table, hashcode & table->bucket_mask,
+								  target);
+				target = CHashPtrUnmark(next);
+			}
+			else
+			{
+				/*
+				 * If the previous node has been delete-marked, we can't
+				 * further update the next-pointer, so restart the scan.
+				 * Otherwise, we know that some other backend removed one
+				 * or more deleted nodes from the bucket chain just as we
+				 * were trying to do, and we can simply continue the scan
+				 * from wherever the previous node is pointing now.  It's
+				 * possible that some concurrent inserts have happened, too,
+				 * but that's OK; we can view those as having happened "before"
+				 * whatever operation this scan is supporting.
+				 *
+				 * Although starvation is a theoretical possibility here if
+				 * we are forced to retry repeatedly, even a single retry is
+				 * vanishingly unlikely in practice.  It requires some other
+				 * backend to delete both the node that we're looking at and
+				 * the node which precedes it before we advance to the next
+				 * node.  That could certainly happen occasionally, but we'd
+				 * have to be pretty unlucky to have it happen even twice in
+				 * a row.
+				 */
+				CHashTableIncrementStatistic(table, CHS_Scan_Expunge_Fail);
+				target = *pointer_to_target;
+				if (CHashPtrIsMarked(target))
+				{
+					CHashTableIncrementStatistic(table, CHS_Scan_Restart);
+					goto retry;
+				}
+			}
+			continue;
+		}
+
+		/*
+		 * Bucket chains are kept in order, so that there is exactly one legal
+		 * point at which any given key can be inserted.  The ordering is by
+		 * hashcode first, and then by memcmp ordering of the keys involved.
+		 */
+		h = target_node->un.hashcode;
+		if (h == hashcode)
+			cmp = memcmp(CHashNodeGetItem(target_node), key,
+						 table->desc.key_size);
+		else if (h > hashcode)
+			cmp = 1;
+		else
+			cmp = -1;
+
+		/*
+		 * If cmp < 0, then we haven't yet reached the point at which we'd
+		 * expect to find the key, so we must continue the scan.  If cmp == 0,
+		 * we've found the key and can stop.  If cmp > 0, we've either passed
+		 * the point where we expect to find the key OR someone delete-marked
+		 * the item and overwrote the hashcode with a gcnext pointer.  In the
+		 * latter case we must take care not to be fooled into stopping the
+		 * scan early.
+		 */
+		if (cmp >= 0)
+		{
+			if (cmp == 0)
+			{
+				res->found = true;
+				res->next = next;
+				break;
+			}
+			else
+			{
+				/*
+				 * pg_read_barrier() prevents the reread of the next pointer
+				 * from being speculated ahead of the read of the hash value.
+				 */
+				pg_read_barrier();
+				next = target_node->next;
+				if (CHashPtrIsMarked(next))
+					goto zap;
+				res->found = false;
+				break;
+			}
+		}
+
+		/* Continue scan from next node. */
+		pointer_to_target = &target_node->next;
+		target = next;
+	}
+
+	/* Send results back to caller. */
+	res->target = target;
+	res->pointer_to_target = pointer_to_target;
+	res->target_node = target_node;
+}
+
+/*
+ * Allocate an arena slot for a new item to be inserted into a hash table.
+ *
+ * We don't want to wait until every single free-list is completely empty
+ * before beginning to garbage collect, because that could result in very
+ * fast allocation followed by a storm of garbage collection activity.
+ * It could also lead to every inserting backend ganging up on the only
+ * non-empty freelist.
+ *
+ * To avoid that, we check free lists and garbage lists in alternation.
+ * We always start with the same free list - which one is based on our
+ * backend ID - but we try to round-robin over all the available garbage
+ * lists.  Whenever we successfully garbage collect, we put the recovered
+ * items on our own free list.  In this way, if there's only one backend
+ * active, it will typically find a free element in the first place it looks:
+ * its own free list.  It will also settle into a pattern of garbage
+ * collecting the garbage list which it has visited least recently, which
+ * is what we want.
+ */
+static CHashPtr
+CHashAllocate(CHashTable table)
+{
+	uint32		f_current;
+	CHashPtr	new;
+
+	/* Pick a starting freelist based on our backend ID. */
+ 	f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+
+	/* If this process hasn't initialized gc_next yet, do that now. */
+	if (table->gc_pid != MyProcPid)
+	{
+		table->gc_pid = MyProcPid;
+		table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table);
+	}
+
+	/* Loop until we allocate an element. */
+	for (;;)
+	{
+		CHashPtr  *b;
+
+		/*
+ 		 * Try to pop an element from a freelist using compare-and-swap.
+ 		 *
+ 		 * Note that this is only safe if it's impossible for the next pointer
+ 		 * of the first element on the list to change between the time when
+ 		 * we read it and the time we use CAS to pop it off the list.  This
+ 		 * means that we can't allow any element that is currently on this
+ 		 * freelist to be allocated, marked as garbage, garbage collected,
+ 		 * and returned back to this freelist before we finish the CAS.
+ 		 *
+ 		 * If we attempt to pop the free-list and fail, we retry immediately
+ 		 * with the same free-list.  This reduces the frequency with which
+ 		 * we're obliged to update our hazard pointers, which is a material
+ 		 * savings due to the associated memory barrier.
+ 		 */
+		b = CHashTableGetFreeList(table, f_current);
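+
+		/*
+		 * Advertise the freelist we're about to pop as our hazard pointer,
+		 * so that CHashAllocateViaGC won't push reclaimed elements back onto
+		 * it until we're done; the barrier makes the advertisement visible
+		 * before we read the list head.
+		 */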
+		MyProc->hazard[0] = b;
+		pg_memory_barrier();
+		new = *b;
+		while (!CHashPtrIsInvalid(new))
+		{
+			CHashNode  *n = CHashTableGetNode(table, new);
+
+			/*
+			 * n is computed from table->freelist[f_current], which could
+			 * be modified by concurrent activity, so we need a dependency
+			 * barrier here.
+			 */
+			pg_read_barrier_depends();
+			if (__sync_bool_compare_and_swap(b, new, n->un.gcnext))
+				return new;
+			CHashTableIncrementStatistic(table, CHS_Allocate_Fail);
+			new = *b;
+		}
+
+		/* Attempt garbage collection. */
+		new = CHashAllocateViaGC(table);
+		if (!CHashPtrIsInvalid(new))
+			return new;
+
+		/* Advance to next freelist. */
+		f_current = (f_current + 1) % CHashTableNFreeLists(table);
+	}
+}
+
+/*
+ * Attempt to satisfy an allocation request via garbage collection.
+ */
+static CHashPtr
+CHashAllocateViaGC(CHashTable table)
+{
+	uint32		f_home;
+	CHashPtr   *b;
+	CHashPtr   *fh;
+	CHashPtr	fhead;
+	CHashPtr	garbage;
+	CHashPtr	new;
+	CHashNode  *n;
+	uint32		i;
+
+	/* Pick a target freelist based on our backend ID. */
+ 	f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+	fh = CHashTableGetFreeList(table, f_home);
+
+	/* Select target garbage list. */
+	table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table);
+	b = CHashTableGetGarbageList(table, table->gc_next);
+	garbage = *b;
+
+	/* If list is empty, fail. */
+	if (CHashPtrIsInvalid(garbage))
+		return InvalidCHashPtr;
+
+	/* If we're unable to empty the list via compare-and-swap, fail. */
+	if (!__sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr))
+	{
+		CHashTableIncrementStatistic(table, CHS_Garbage_Dequeue_Fail);
+		return InvalidCHashPtr;
+	}
+
+	/*
+	 * Before moving the elements removed from the garbage list onto the
+	 * freelist, we must wait until (1) all bucket scans that might
+	 * still see those elements as part of the bucket chain have
+	 * completed and (2) all allocations that might see an old version
+	 * of the freelist containing one of the elements to be garbage
+	 * collected have completed.
+	 *
+	 * Note: We can't begin this operation until the clearing of the
+	 * garbage list has been committed to memory, but since that was
+	 * done using an atomic operation, no explicit barrier is needed
+	 * here.
+	 *
+	 * Note: We could have a "soft" version of this that merely
+	 * requeues the garbage if it's not immediately recyclable, but
+	 * it's not clear that we need such a thing.  On the flip side we
+	 * might want to eventually enter a longer sleep here, or PANIC,
+	 * but it's not clear exactly how to calibrate that.
+	 */
+	CHashTableIncrementStatistic(table, CHS_GC);
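+
+	/*
+	 * Clear our own hazard pointer, which may still reference the freelist
+	 * we were trying to pop in CHashAllocate, so that the wait loop below
+	 * can't stall on our own advertisement.
+	 */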
+	MyProc->hazard[0] = NULL;
+	for (i = 0; i < ProcGlobal->allProcCount; i++)
+	{
+		volatile PGPROC	*proc = &ProcGlobal->allProcs[i];
+		void	   *hazard;
+
+		hazard = proc->hazard[0];
+		if (hazard == b || hazard == fh)
+		{
+			CHashTableIncrementStatistic(table, CHS_GC_Spin);
+			do
+			{
+				hazard = proc->hazard[0];
+			} while (hazard == b || hazard == fh);
+		}
+	}
+
+	/* Remove one item from list to satisfy current allocation. */
+	new = garbage;
+	n = CHashTableGetNode(table, new);
+	pg_read_barrier_depends();
+	fhead = n->un.gcnext;
+
+	if (CHashPtrIsInvalid(fhead))
+	{
+		/*
+		 * We have reclaimed exactly one node, so there's nothing to put
+		 * back on the free list.  In this case (only) we need a
+		 * memory barrier, because the reads above must complete
+		 * before we overwrite n->un.gcnext with a new hashcode.
+		 * (This is only needed when we reclaim exactly one node,
+		 * because in any other case we'll do a compare-and-swap
+		 * before returning, which implies a full barrier.)
+ 		 */
+		pg_memory_barrier();
+		CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Skipped);
+	}
+	else if (__sync_bool_compare_and_swap(fh, InvalidCHashPtr, fhead))
+	{
+		/*
+ 		 * Our free list is empty, and we've successfully pushed the
+ 		 * reclaimed nodes onto it.  So we're done.
+ 		 */
+		CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Fast);
+	}
+	else
+	{
+		CHashPtr	fcurrent;
+		CHashPtr	fnext;
+		CHashPtr	oldhead;
+
+		/* Walk list of reclaimed elements to end. */
+		fcurrent = fhead;
+		for (;;)
+		{
+			n = CHashTableGetNode(table, fcurrent);
+			fnext = n->un.gcnext;
+			if (CHashPtrIsInvalid(fnext))
+				break;
+			fcurrent = fnext;
+		}
+
+		/* Push reclaimed elements onto home free list. */
+		for (;;)
+		{
+			oldhead = *fh;
+			n->un.gcnext = oldhead;
+			if (__sync_bool_compare_and_swap(fh, oldhead, fhead))
+				break;
+			CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Retry);
+		}
+	}
+
+	/* Return the element we saved for ourselves. */
+	return new;
+}
+
+/*
+ * Add an arena slot to the appropriate garbage list.
+ *
+ * The next garbage collection cycle for the affected bucket will move it
+ * to the free list.  We can't do that immediately because there might be
+ * someone traversing the list who is counting on being able to follow the
+ * next pointer.  It's OK to clobber the hash value, though, since a spurious
+ * failure to match an already-deleted item shouldn't cause any problems;
+ * this is why gcnext can share space with the hash value.
+ */
+static void
+CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
+{
+	CHashPtr	g;
+	CHashNode *n;
+	CHashPtr *garbage;
+
+	n = CHashTableGetNode(table, c);
+	garbage = CHashTableGetGarbageByBucket(table, bucket);
+
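+	/* Push the node onto the head of the garbage list with a CAS loop. */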
+	while (1)
+	{
+		g = *garbage;
+		n->un.gcnext = g;
+		if (__sync_bool_compare_and_swap(garbage, g, c))
+			break;
+		CHashTableIncrementStatistic(table, CHS_Garbage_Enqueue_Retry);
+	}
+}
diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h
index b36705b..6ef779b 100644
--- a/src/include/storage/barrier.h
+++ b/src/include/storage/barrier.h
@@ -20,4 +20,12 @@
  */
 #include "port/atomics.h"
 
+/*
+ * If dependency barriers are undefined, we define them as a no-op.  The only
+ * known platform where anything more is required is DEC Alpha.
+ */
+#if !defined(pg_read_barrier_depends)
+#define pg_read_barrier_depends()
+#endif
+
 #endif   /* BARRIER_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 0e69b63..4c6fac8 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -96,20 +96,6 @@ typedef struct buftag
 )
 
 /*
- * The shared buffer mapping table is partitioned to reduce contention.
- * To determine which partition lock a given tag requires, compute the tag's
- * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
- * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
- */
-#define BufTableHashPartition(hashcode) \
-	((hashcode) % NUM_BUFFER_PARTITIONS)
-#define BufMappingPartitionLock(hashcode) \
-	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
-		BufTableHashPartition(hashcode)].lock)
-#define BufMappingPartitionLockByIndex(i) \
-	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
-
-/*
  *	BufferDesc -- shared descriptor/state data for a single shared buffer.
  *
  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
@@ -200,9 +186,9 @@ extern void StrategyInitialize(bool init);
 extern Size BufTableShmemSize(int size);
 extern void InitBufTable(int size);
 extern uint32 BufTableHashCode(BufferTag *tagPtr);
-extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
-extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+extern int	BufTableLookup(BufferTag *tagPtr);
+extern int	BufTableInsert(BufferTag *tagPtr, int buf_id);
+extern void BufTableDelete(BufferTag *tagPtr);
 
 /* localbuf.c */
 extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 02c8f1a..f98be4d 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -136,7 +136,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
  */
 
 /* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS  128
+#define NUM_BUFFER_PARTITIONS  0
 
 /* Number of partitions the shared lock tables are divided into */
 #define LOG2_NUM_LOCK_PARTITIONS  4
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index c23f4da..07f9761 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -58,6 +58,14 @@ struct XidCache
 #define		FP_LOCK_SLOTS_PER_BACKEND 16
 
 /*
+ * Some lock-free algorithms require each backend process to be able to
+ * advertise a certain number of "hazard pointers" in shared memory.
+ * Right now one per backend is enough for our purpose, but some
+ * algorithms require more.
+ */
+#define		NUM_HAZARD_POINTERS			1
+
+/*
  * Each backend has a PGPROC struct in shared memory.  There is also a list of
  * currently-unused PGPROC structs that will be reallocated to new backends.
  *
@@ -142,6 +150,12 @@ struct PGPROC
 	bool		fpVXIDLock;		/* are we holding a fast-path VXID lock? */
 	LocalTransactionId fpLocalTransactionId;	/* lxid for fast-path VXID
 												 * lock */
+
+	/*
+	 * Hazard pointers. Currently one is enough, though some algorithms
+	 * require a few more.
+	 */
+	void	   *hazard[NUM_HAZARD_POINTERS];
 };
 
 /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index 745eb7e..4ff8415 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -40,6 +40,7 @@ extern void InitShmemIndex(void);
 extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
 			  HASHCTL *infoP, int hash_flags);
 extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern void *ShmemAttachStruct(const char *name);
 extern Size add_size(Size s1, Size s2);
 extern Size mul_size(Size s1, Size s2);
 
diff --git a/src/include/utils/chash.h b/src/include/utils/chash.h
new file mode 100644
index 0000000..ee0573c
--- /dev/null
+++ b/src/include/utils/chash.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.h
+ *	  Concurrent shared-memory hash table.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/chash.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHASH_H 
+#define CHASH_H
+
+/* Everything caller must supply to set up a concurrent hash table. */
+typedef struct
+{
+	const char *shmem_name;		/* shared memory name for this hash table */
+	uint32		capacity;		/* maximum size of hash table */
+	uint16		element_size;	/* size of each element */
+	uint16		key_size;		/* size of each key */
+} CHashDescriptor;
+
+/* Concurrent hash table statistics. */
+typedef enum
+{
+	CHS_Search,					/* search */
+	CHS_Search_Failed,			/* search failed (no such key) */
+	CHS_Insert,					/* insert */
+	CHS_Insert_Failed,			/* insert failed (duplicate key) */
+	CHS_Insert_Retry,			/* insert retried (concurrent update) */
+	CHS_Delete,					/* delete */
+	CHS_Delete_Failed,			/* delete failed (no such key) */
+	CHS_Delete_Retry,			/* delete retried (concurrent update) */
+	CHS_Scan_Expunge,			/* scan expunged deleted item */
+	CHS_Scan_Expunge_Fail,		/* scan failed to expunge */
+	CHS_Scan_Restart,			/* concurrent deletes forced a scan restart */
+	CHS_Cleanup_Scan,			/* concurrent update forced a cleanup scan */
+	CHS_Allocate_Fail,			/* allocation failed to pop freelist */
+	CHS_Garbage_Enqueue_Retry,	/* enqueue on garbage list retried */
+	CHS_Garbage_Dequeue_Fail,	/* dequeue of garbage failed */
+	CHS_GC,						/* garbage collection cycle */
+	CHS_GC_Spin,				/* GC spun waiting for concurrent process */
+	CHS_GC_Reclaim_Skipped,		/* GC recovered only one item */
+	CHS_GC_Reclaim_Fast,		/* GC put garbage on freelist via fast path */
+	CHS_GC_Reclaim_Retry,		/* enqueue of garbage on freelist retried */
+	CHS_NumberOfStatistics		/* number of statistics */
+} CHashStatisticsType;
+
+/* Human-readable names for statistics. */
+extern char *CHashStatisticsNames[];
+
+/* Opaque handle for a concurrent hash table. */
+struct CHashTableData;
+typedef struct CHashTableData *CHashTable;
+
+/* Initialization functions. */
+extern CHashTable CHashBootstrap(CHashDescriptor *desc);
+extern Size CHashEstimateSize(CHashTable table);
+extern CHashTable CHashInitialize(CHashTable table, CHashDescriptor *desc);
+
+/* Accessor functions. */
+extern bool CHashInsert(CHashTable table, void *entry);
+extern bool CHashDelete(CHashTable table, void *key);
+extern bool CHashSearch(CHashTable table, void *entry);
+extern void CHashStatistics(CHashTable table, uint64 *stats);
+
+#endif   /* CHASH_H */
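
To make the API above a bit more concrete, here's a minimal usage sketch
(not part of the patch).  MyEntry, the capacity, and the initialization
sequence are invented for illustration; I'm assuming CHashBootstrap can be
called directly in a simple single-backend test and that CHashSearch copies
the non-key fields back into *entry on success, so treat this as a sketch
rather than a reference:

    #include "postgres.h"

    #include "utils/chash.h"

    /* Hypothetical entry type; the key must occupy the first key_size bytes. */
    typedef struct
    {
        uint32      key;
        uint32      value;
    } MyEntry;

    static CHashTable my_table;

    static void
    my_table_init(void)
    {
        CHashDescriptor desc;

        desc.shmem_name = "my hash table";
        desc.capacity = 1024;               /* maximum number of entries */
        desc.element_size = sizeof(MyEntry);
        desc.key_size = sizeof(uint32);

        /* Assuming bootstrap-from-scratch is acceptable for a simple test. */
        my_table = CHashBootstrap(&desc);
    }

    static void
    my_table_demo(void)
    {
        MyEntry     e;

        e.key = 42;
        e.value = 99;
        if (!CHashInsert(my_table, &e))
            elog(LOG, "key 42 already present");

        memset(&e, 0, sizeof(e));
        e.key = 42;
        if (CHashSearch(my_table, &e))      /* presumably fills in e.value */
            elog(LOG, "found key 42, value %u", e.value);

        if (!CHashDelete(my_table, &e))
            elog(LOG, "key 42 not found");
    }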
-- 