The attached patch adds the MAP_HUGETLB flag to mmap() for shared memory
on systems that support it. It's based on Christian Kruse's patch from
last year, incorporating suggestions from Andres Freund.

On a system with 4GB shared_buffers, doing pgbench runs long enough for
each backend to touch most of the buffers, this patch saves nearly 8MB of
memory per backend and improves performance by just over 2% on average.

It is still WIP as there are a couple of points that Andres has pointed
out to me that haven't been addressed yet; also, the documentation is
incomplete.

Richard

-- 
Richard Poole                 http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 23ebc11..703b28f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1052,6 +1052,42 @@ include 'filename'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
+      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+      <indexterm>
+       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Enables/disables the use of huge TLB pages. Valid values are
+        <literal>on</literal>, <literal>off</literal> and <literal>try</literal>.
+        The default value is <literal>try</literal>.
+       </para>
+
+       <para>
+        Use of huge TLB pages reduces the CPU time spent on memory management and
+        the amount of memory used for page tables, and therefore improves performance.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>on</literal>
+        <symbol>mmap()</symbol> will be called with <symbol>MAP_HUGETLB</symbol>.
+        If the call fails, the server will fail to start with a fatal error.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>off</literal> we
+        will not use <symbol>MAP_HUGETLB</symbol> at all.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>
+        we will try to use <symbol>MAP_HUGETLB</symbol> and, if that fails, fall
+        back to <symbol>mmap()</symbol> without <symbol>MAP_HUGETLB</symbol>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
       <indexterm>
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 20e3c32..57fff35 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -27,10 +27,14 @@
 #ifdef HAVE_SYS_SHM_H
 #include <sys/shm.h>
 #endif
+#ifdef MAP_HUGETLB
+#include <dirent.h>
+#endif
 
 #include "miscadmin.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
 
 typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
@@ -61,6 +65,13 @@ typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */
 #define MAP_FAILED ((void *) -1)
 #endif
 
+#ifdef MAP_HUGETLB				/* hint address and extra mmap() flag for huge pages */
+#define PG_HUGETLB_BASE_ADDR ((void *) 0x0UL)
+#define PG_MAP_HUGETLB MAP_HUGETLB
+#else
+#define PG_HUGETLB_BASE_ADDR NULL	/* the mmap() call names this macro unconditionally */
+#define PG_MAP_HUGETLB 0
+#endif
 
 unsigned long UsedShmemSegID = 0;
 void	   *UsedShmemSegAddr = NULL;
@@ -342,6 +353,161 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 }
 
 
+#ifdef MAP_HUGETLB
+#define HUGE_PAGE_INFO_DIR  "/sys/kernel/mm/hugepages"
+
+/*
+ *	static long InternalGetFreeHugepagesCount(const char *name)
+ *
+ * Read the number of free huge pages from
+ * /sys/kernel/mm/hugepages/<name>/free_hugepages, where name is a directory
+ * entry such as "hugepages-2048kB".  Returns the count (0 if none are free),
+ * or -1 if the file could not be opened, read or parsed.  Failures are
+ * reported at DEBUG1 when huge_tlb_pages is "try", otherwise at WARNING.
+ */
+static long
+InternalGetFreeHugepagesCount(const char *name)
+{
+	int fd;
+	char buff[1024];
+	int len;			/* signed, so negative results stay visible */
+	long result;
+	char *ptr;
+
+	/* A result >= sizeof(buff) means the path did not fit and was truncated. */
+	len = snprintf(buff, sizeof(buff), "%s/%s/free_hugepages", HUGE_PAGE_INFO_DIR, name);
+	if (len < 0 || len >= (int) sizeof(buff))
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Filename %s/%s/free_hugepages is too long", HUGE_PAGE_INFO_DIR, name),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	/* 0 is a valid descriptor, so only a negative result means failure. */
+	fd = open(buff, O_RDONLY);
+	if (fd < 0)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not open file %s: %s", buff, strerror(errno)),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	/* Leave one byte free so the contents can always be NUL-terminated. */
+	len = (int) read(fd, buff, sizeof(buff) - 1);
+	if (len <= 0)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Error reading from file %s/%s/free_hugepages: %s", HUGE_PAGE_INFO_DIR, name, strerror(errno)),
+				 errcontext("while checking hugepage size")));
+		close(fd);
+		return -1;
+	}
+
+	close(fd);
+
+	/*
+	 * If the content of free_hugepages is longer than the buffer the rest is
+	 * irrelevant; we simply want to know if there are any hugepages left.
+	 * read() filled at most sizeof(buff) - 1 bytes, so buff[len] is in bounds.
+	 */
+	buff[len] = 0;
+
+	/*
+	 * strtol() never sets the end pointer to NULL; when it finds no digits the
+	 * end pointer is left at the start of the input, so test for that instead.
+	 */
+	result = strtol(buff, &ptr, 10);
+
+	if (ptr == buff)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not convert contents of file %s/%s/free_hugepages to number", HUGE_PAGE_INFO_DIR, name),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	return result;
+}
+
+/*
+ *	static long InternalGetHugepageSize(void)
+ *
+ * Scan /sys/kernel/mm/hugepages/ for "hugepages-<size>" entries and return
+ * the smallest page size that currently has free pages according to
+ * InternalGetFreeHugepagesCount(); sizes with a kB suffix are converted to
+ * bytes.  Returns -1 if the directory could not be opened, no usable size
+ * entry exists, or no size has free pages; the reason is reported at DEBUG1
+ * when huge_tlb_pages is "try", otherwise at WARNING.
+ */
+static long
+InternalGetHugepageSize(void)
+{
+	struct dirent *ent;
+	DIR *dir = opendir(HUGE_PAGE_INFO_DIR);
+	long smallest_size = -1, size;
+	bool valid_size_found = false;
+	char *ptr;
+
+	if (dir == NULL)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not open directory %s: %s", HUGE_PAGE_INFO_DIR, strerror(errno)),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	/*
+	 * Linux supports multiple hugepage sizes if the hardware
+	 * supports it; for each possible size there will be a
+	 * directory in /sys/kernel/mm/hugepages consisting of the
+	 * string hugepages- and the size of the page, e.g. on x86_64:
+	 * hugepages-2048kB
+	 */
+	while ((ent = readdir(dir)) != NULL)
+	{
+		if (strncmp(ent->d_name, "hugepages-", 10) == 0)
+		{
+			size = strtol(ent->d_name + 10, &ptr, 10);
+
+			/* strtol() leaves ptr at the start when it parsed no digits */
+			if (ptr == ent->d_name + 10 || size <= 0)
+				continue;
+
+			if (strcmp(ptr, "kB") == 0)
+				size *= 1024;
+
+			if (smallest_size == -1 || size < smallest_size)
+			{
+				/* a usable size exists even if none of its pages are free */
+				valid_size_found = true;
+				if (InternalGetFreeHugepagesCount(ent->d_name) > 0)
+					smallest_size = size;
+			}
+		}
+	}
+
+	closedir(dir);
+
+	if (smallest_size == -1)
+	{
+		if (valid_size_found)
+			ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+					(errmsg("No free hugepages"),
+					 errhint("There were no free huge pages of any size")));
+		else
+			ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not find a valid hugepage size"),
+				 errhint("This error usually means that either CONFIG_HUGETLB_PAGE "
+						 "is not in kernel or that your architecture does not "
+						 "support hugepages or you did not configure hugepages")));
+	}
+
+	return smallest_size;
+}
+#endif
+
 /*
  * PGSharedMemoryCreate
  *
@@ -391,7 +557,17 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 */
 #ifndef EXEC_BACKEND
 	{
+#ifdef MAP_HUGETLB
+		long	pagesize = 0;
+
+		if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+			pagesize = InternalGetHugepageSize();
+
+		if (pagesize <= 0)
+			pagesize = sysconf(_SC_PAGE_SIZE);
+#else
 		long		pagesize = sysconf(_SC_PAGE_SIZE);
+#endif
 
 		/*
 		 * Ensure request size is a multiple of pagesize.
@@ -410,8 +586,22 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 		 * out to be false, we might need to add a run-time test here and do
 		 * this only if the running kernel supports it.
 		 */
-		AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-							  -1, 0);
+
+		if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+		{
+			AnonymousShmem = mmap(PG_HUGETLB_BASE_ADDR, size, PROT_READ|PROT_WRITE,
+								  PG_MMAP_FLAGS|PG_MAP_HUGETLB, -1, 0);
+
+			elog(DEBUG3, "mmap() tried with MAP_HUGEPAGE: %p", AnonymousShmem);
+		}
+
+		if ((AnonymousShmem == MAP_FAILED && huge_tlb_pages == HUGE_TLB_TRY)
+			|| huge_tlb_pages == HUGE_TLB_OFF)
+		{
+			AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, PG_MMAP_FLAGS,
+								  -1, 0);
+		}
+
 		if (AnonymousShmem == MAP_FAILED)
 			ereport(FATAL,
 					(errmsg("could not map anonymous shared memory: %m"),
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 7d297bc..3b26caa 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -22,6 +22,7 @@
 #include <limits.h>
 #include <unistd.h>
 #include <sys/stat.h>
+#include <sys/mman.h>
 #ifdef HAVE_SYSLOG
 #include <syslog.h>
 #endif
@@ -381,6 +382,22 @@ static const struct config_enum_entry synchronous_commit_options[] = {
 };
 
 /*
+ * huge_tlb_pages may be on|off|try, where try is the default
+ * on: try to mmap() with MAP_HUGETLB and fail when mmap() fails
+ * off: do not try to mmap() with MAP_HUGETLB
+ * try: try to mmap() with MAP_HUGETLB and fallback to mmap()
+ *      w/o MAP_HUGETLB
+ */
+static const struct config_enum_entry huge_tlb_options[] = {
+#ifdef MAP_HUGETLB				/* "on"/"try" are offered only where MAP_HUGETLB is defined */
+	{"on", HUGE_TLB_ON, false},
+	{"try", HUGE_TLB_TRY, false},
+#endif
+	{"off", HUGE_TLB_OFF, false},
+	{NULL, 0, false}			/* end-of-list marker */
+};
+
+/*
  * Options for enum values stored in other modules
  */
 extern const struct config_enum_entry wal_level_options[];
@@ -439,6 +456,12 @@ int			tcp_keepalives_idle;
 int			tcp_keepalives_interval;
 int			tcp_keepalives_count;
 
+#ifdef MAP_HUGETLB
+int huge_tlb_pages = HUGE_TLB_TRY;	/* initial value where MAP_HUGETLB is defined */
+#else
+int huge_tlb_pages = HUGE_TLB_OFF;	/* "off" is the only value offered here */
+#endif
+
 /*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
@@ -3354,6 +3377,26 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"huge_tlb_pages",
+#ifdef MAP_HUGETLB
+			PGC_SUSET,		/* NOTE(review): in the visible code the value is only read while mapping the segment; confirm a run-time settable context is intended */
+#else
+			PGC_INTERNAL,
+#endif
+			RESOURCES_MEM,
+			gettext_noop("Enable/disable the use of the hugepages feature"),
+			NULL
+		},
+		&huge_tlb_pages,
+#ifdef MAP_HUGETLB
+		HUGE_TLB_TRY,		/* boot value matches the variable's initializer */
+#else
+		HUGE_TLB_OFF,
+#endif
+		huge_tlb_options,
+		NULL, NULL, NULL
+	},
 
 	/* End-of-list marker */
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d69a02b..7c826d5 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -113,6 +113,7 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
+#huge_tlb_pages = try			# try to map memory with MAP_HUGETLB (on, off, try)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 99211c1..c2fdba4 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -235,6 +235,24 @@ extern int	tcp_keepalives_idle;
 extern int	tcp_keepalives_interval;
 extern int	tcp_keepalives_count;
 
+
+/*
+ * Possible values for huge_tlb_pages; default is HUGE_TLB_TRY
+ */
+typedef enum
+{
+	HUGE_TLB_OFF,				/* never pass MAP_HUGETLB to mmap() */
+	HUGE_TLB_ON,				/* mmap() with MAP_HUGETLB; failure is fatal */
+	HUGE_TLB_TRY				/* try MAP_HUGETLB, fall back to plain mmap() */
+} HugeTlbType;
+
+
+/*
+ * configure the use of huge TLB pages
+ */
+extern int huge_tlb_pages;
+
+
 /*
  * Functions exported by guc.c
  */
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to