Hi,

attached you will find a new version of the patch, ported to HEAD,
with the mentioned bug fixed and - hopefully - the remaining issues
dealt with.

Best regards,

-- 
 Christian Kruse               http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 14ed6c7..e7c2559 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1107,6 +1107,43 @@ include 'filename'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
+      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+      <indexterm>
+       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Enables/disables the use of huge TLB pages. Valid values are
+        <literal>try</literal> (the default), <literal>on</literal>,
+        and <literal>off</literal>.
+       </para>
+
+       <para>
+        At present, this feature is supported only on Linux. The setting
+        is ignored on other systems.
+       </para>
+
+       <para>
+        The use of huge TLB pages results in smaller page tables and
+        less CPU time spent on memory management, increasing performance. For
+        more details, see
+        <ulink url="https://wiki.debian.org/Hugepages">the Debian wiki</ulink>.
+        Remember that you will need at least shared_buffers / huge page size +
+        1 huge TLB pages. So, for example, for a system with 6GB shared buffers
+        and a huge page size of 2MB, you will need at least 3073 huge pages.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
+        the server will try to use huge pages, but fall back to using
+        normal allocation if that fails. With <literal>on</literal>, failure
+        to use huge pages will prevent the server from starting up. With
+        <literal>off</literal>, huge pages will not be used.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
       <indexterm>
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 0d01617..b3b87d7 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -32,6 +32,7 @@
 #include "portability/mem.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
 
 typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
@@ -41,7 +42,7 @@ typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */
 unsigned long UsedShmemSegID = 0;
 void	   *UsedShmemSegAddr = NULL;
 static Size AnonymousShmemSize;
-static void *AnonymousShmem;
+static void *AnonymousShmem = NULL;
 
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
@@ -317,6 +318,80 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 	return true;
 }
 
+/*
+ * Creates an anonymous mmap()ed shared memory segment, using huge TLB pages
+ * as directed by huge_tlb_pages (on: required, try: with fallback, off: no).
+ *
+ * Pass the desired size in *size. This function will modify *size to the
+ * actual size of the allocation, if it ends up allocating a larger than
+ * desired segment. Failure to map is reported with ereport(FATAL).
+ */
+#ifndef EXEC_BACKEND
+static void *
+CreateAnonymousSegment(Size *size)
+{
+	Size		allocsize = *size;
+	void	   *ptr = MAP_FAILED;
+
+#ifndef MAP_HUGETLB
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge TLB pages not supported on this platform")));
+#else
+	if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+	{
+		/*
+		 * Round up the request size to a suitable large value.
+		 *
+		 * Some Linux kernel versions are known to have a bug, which causes
+		 * mmap() with MAP_HUGETLB to fail if the request size is not a
+		 * multiple of any supported huge page size. To work around that, we
+		 * round up the request size to nearest 2MB. 2MB is the most common
+		 * huge page size on affected systems.
+		 *
+		 * Aside from that bug, even with a kernel that does the allocation
+		 * correctly, rounding it up ourselves avoids wasting memory. Without
+		 * it, if we for example make an allocation of 2MB + 1 bytes, the
+		 * kernel might decide to use two 2MB huge pages for that, and waste 2
+		 * MB - 1 of memory. When we do the rounding ourselves, we can use
+		 * that space for allocations.
+		 */
+		int			hugepagesize = 2 * 1024 * 1024;
+
+		if (allocsize % hugepagesize != 0)
+			allocsize += hugepagesize - (allocsize % hugepagesize);
+
+		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
+				   PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
+		if (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED)
+			elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
+	}
+#endif
+
+	if (huge_tlb_pages == HUGE_TLB_OFF ||
+		(huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))
+	{
+		allocsize = *size;	/* drop the huge page rounding for normal pages */
+		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0);
+	}
+
+	if (ptr == MAP_FAILED)
+		ereport(FATAL,
+				(errmsg("could not map anonymous shared memory: %m"),
+				 (errno == ENOMEM) ?
+				 errhint("This error usually means that PostgreSQL's request "
+					"for a shared memory segment exceeded available memory, "
+					  "swap space or huge pages. To reduce the request size "
+						 "(currently  %zu bytes), reduce PostgreSQL's shared "
+					   "memory usage, perhaps by reducing shared_buffers or "
+						 "max_connections.",
+						 *size) : 0));
+
+	*size = allocsize;
+	return ptr;
+}
+#endif
 
 /*
  * PGSharedMemoryCreate
@@ -344,7 +419,14 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	PGShmemHeader *hdr;
 	IpcMemoryId shmid;
 	struct stat statbuf;
-	Size		sysvsize = size;
+	Size		sysvsize;
+
+#if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge TLB pages not supported on this platform")));
+#endif
 
 	/* Room for a header? */
 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
@@ -359,6 +441,12 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 * to run many copies of PostgreSQL without needing to adjust system
 	 * settings.
 	 *
+	 * We assume that no one will attempt to run PostgreSQL 9.3 or later on
+	 * systems that are ancient enough that anonymous shared memory is not
+	 * supported, such as pre-2.4 versions of Linux.  If that turns out to be
+	 * false, we might need to add a run-time test here and do this only if
+	 * the running kernel supports it.
+	 *
 	 * However, we disable this logic in the EXEC_BACKEND case, and fall back
 	 * to the old method of allocating the entire segment using System V
 	 * shared memory, because there's no way to attach an mmap'd segment to a
@@ -366,44 +454,13 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 * developer use, this shouldn't be a big problem.
 	 */
 #ifndef EXEC_BACKEND
-	{
-		long		pagesize = sysconf(_SC_PAGE_SIZE);
-
-		/*
-		 * Ensure request size is a multiple of pagesize.
-		 *
-		 * pagesize will, for practical purposes, always be a power of two.
-		 * But just in case it isn't, we do it this way instead of using
-		 * TYPEALIGN().
-		 */
-		if (pagesize > 0 && size % pagesize != 0)
-			size += pagesize - (size % pagesize);
+	AnonymousShmem = CreateAnonymousSegment(&size);
+	AnonymousShmemSize = size;
 
-		/*
-		 * We assume that no one will attempt to run PostgreSQL 9.3 or later
-		 * on systems that are ancient enough that anonymous shared memory is
-		 * not supported, such as pre-2.4 versions of Linux.  If that turns
-		 * out to be false, we might need to add a run-time test here and do
-		 * this only if the running kernel supports it.
-		 */
-		AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-							  -1, 0);
-		if (AnonymousShmem == MAP_FAILED)
-			ereport(FATAL,
-					(errmsg("could not map anonymous shared memory: %m"),
-					 (errno == ENOMEM) ?
-				errhint("This error usually means that PostgreSQL's request "
-					 "for a shared memory segment exceeded available memory "
-					  "or swap space. To reduce the request size (currently "
-					  "%zu bytes), reduce PostgreSQL's shared memory usage, "
-						"perhaps by reducing shared_buffers or "
-						"max_connections.",
-						size) : 0));
-		AnonymousShmemSize = size;
-
-		/* Now we need only allocate a minimal-sized SysV shmem block. */
-		sysvsize = sizeof(PGShmemHeader);
-	}
+	/* Now we need only allocate a minimal-sized SysV shmem block. */
+	sysvsize = sizeof(PGShmemHeader);
+#else
+	sysvsize = size;
 #endif
 
 	/* Make sure PGSharedMemoryAttach doesn't fail without need */
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 80f1982..9b0cceb 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -128,6 +128,11 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	DWORD		size_high;
 	DWORD		size_low;
 
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge TLB pages not supported on this platform")));
+
 	/* Room for a header? */
 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 2cc8f90..a9b9794 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -64,6 +64,7 @@
 #include "storage/dsm_impl.h"
 #include "storage/standby.h"
 #include "storage/fd.h"
+#include "storage/pg_shmem.h"
 #include "storage/proc.h"
 #include "storage/predicate.h"
 #include "tcop/tcopprot.h"
@@ -388,6 +389,23 @@ static const struct config_enum_entry synchronous_commit_options[] = {
 };
 
 /*
+ * Although only "on", "off", "try" are documented, we accept all the likely
+ * variants of "on" and "off".
+ */
+static const struct config_enum_entry huge_tlb_options[] = {
+	{"off", HUGE_TLB_OFF, false},	/* documented spelling */
+	{"on", HUGE_TLB_ON, false},		/* documented spelling */
+	{"try", HUGE_TLB_TRY, false},	/* documented spelling; the GUC's boot value */
+	{"true", HUGE_TLB_ON, true},	/* accepted variant of "on"; third field presumably marks it hidden - confirm in guc.h */
+	{"false", HUGE_TLB_OFF, true},	/* accepted variant of "off" */
+	{"yes", HUGE_TLB_ON, true},		/* accepted variant of "on" */
+	{"no", HUGE_TLB_OFF, true},		/* accepted variant of "off" */
+	{"1", HUGE_TLB_ON, true},		/* accepted variant of "on" */
+	{"0", HUGE_TLB_OFF, true},		/* accepted variant of "off" */
+	{NULL, 0, false}				/* presumably the list terminator - confirm against other option tables */
+};
+
+/*
  * Options for enum values stored in other modules
  */
 extern const struct config_enum_entry wal_level_options[];
@@ -448,6 +466,12 @@ int			tcp_keepalives_interval;
 int			tcp_keepalives_count;
 
 /*
+ * This really belongs in pg_shmem.c, but is defined here so that it doesn't
+ * need to be duplicated in all the different implementations of pg_shmem.c.
+ */
+int			huge_tlb_pages;
+
+/*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
  * and is kept in sync by assign_hooks.
@@ -3430,6 +3454,15 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"huge_tlb_pages", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Use of huge TLB pages on Linux"),
+			NULL
+		},
+		&huge_tlb_pages,
+		HUGE_TLB_TRY, huge_tlb_options,
+		NULL, NULL, NULL
+	},
 
 	/* End-of-list marker */
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 7ad6b7c..c8673b3 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -115,6 +115,8 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
+#huge_tlb_pages = try			# on, off, or try
+					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 22ef901..df094e8 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -38,6 +38,16 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 #endif
 } PGShmemHeader;
 
+/* GUC variable */
+extern int huge_tlb_pages;
+
+/* Possible values for huge_tlb_pages */
+typedef enum
+{
+	HUGE_TLB_OFF,				/* map shared memory without MAP_HUGETLB */
+	HUGE_TLB_ON,				/* huge pages required; no fallback if mapping fails */
+	HUGE_TLB_TRY				/* try huge pages, fall back to a normal mapping */
+} HugeTlbType;
 
 #ifdef EXEC_BACKEND
 #ifndef WIN32

Attachment: pgpYscNjCU3Gj.pgp
Description: PGP signature

Reply via email to