At 2013-10-30 11:04:36 -0400, t...@sss.pgh.pa.us wrote:
>
> > As a compromise, perhaps we can unconditionally round the size up to be
> > a multiple of 2MB? […]
> 
> That sounds reasonably painless to me. 

Here's a patch that does that and adds a DEBUG1 log message when we try
with MAP_HUGETLB and fail and fallback to ordinary mmap.

-- Abhijit
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 707edf1..7d9d0a8 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -113,6 +113,7 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
+#huge_tlb_pages = try			# try to map memory with MAP_HUGETLB (on, off, try)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 77a9303..e4ded7a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1052,6 +1052,49 @@ include 'filename'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
+      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+      <indexterm>
+       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Enables/disables the use of huge TLB pages. Valid values are
+        <literal>try</literal> (the default), <literal>on</literal>,
+        and <literal>off</literal>.
+       </para>
+
+       <para>
+        At present, this feature is supported only on Linux. The setting
+        is ignored on other systems.
+       </para>
+
+       <para>
+        The use of huge TLB pages results in smaller page tables and
+        less CPU time spent on memory management. For more details, see
+        <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt";>hugepages.txt
+        </ulink> in the Linux kernel documentation.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
+        the server will try to use huge pages, but fall back to using
+        normal allocation if the first attempt fails.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>on</literal>,
+        the server will try to use huge pages, and treat failure as a
+        FATAL error.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>off</literal>,
+        the server will not try to use huge pages.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
       <indexterm>

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index b604407..fc0d74b 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -32,6 +32,7 @@
 #include "portability/mem.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
 
 typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
@@ -367,14 +368,31 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 */
 #ifndef EXEC_BACKEND
 	{
-		long		pagesize = sysconf(_SC_PAGE_SIZE);
+		int			flags = PG_MMAP_FLAGS;
+		long		pagesize = 2*1024*1024;
+
+#ifdef MAP_HUGETLB
+		if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+		{
+			flags |= MAP_HUGETLB;
+		}
+#endif
 
 		/*
 		 * Ensure request size is a multiple of pagesize.
 		 *
-		 * pagesize will, for practical purposes, always be a power of two.
-		 * But just in case it isn't, we do it this way instead of using
-		 * TYPEALIGN().
+		 * By doing this ourselves, we maximise the chances of being
+		 * able to use huge TLB pages even on kernels that do not round
+		 * up the request size correctly, for example due to this bug:
+		 * https://bugzilla.kernel.org/show_bug.cgi?id=56881
+		 *
+		 * The default value of 2MB for pagesize is chosen based on the
+		 * most common supported huge page size. Rounding up to a larger
+		 * value (e.g. 16MB) would use even larger pages if the hardware
+		 * supported them, but would potentially waste more space.
+		 *
+		 * We round up by hand instead of using TYPEALIGN(), but for all
+		 * practical purposes, pagesize will always be a power of two.
 		 */
 		if (pagesize > 0 && size % pagesize != 0)
 			size += pagesize - (size % pagesize);
@@ -386,8 +404,21 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 		 * out to be false, we might need to add a run-time test here and do
 		 * this only if the running kernel supports it.
 		 */
-		AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-							  -1, 0);
+
+		AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, -1, 0);
+
+#ifdef MAP_HUGETLB
+		if (huge_tlb_pages == HUGE_TLB_TRY && AnonymousShmem == MAP_FAILED)
+		{
+			elog(DEBUG1, "mmap(%lu) with MAP_HUGETLB failed with errno=%d; "
+				 "trying without", (uint64)size, errno);
+
+			flags &= ~MAP_HUGETLB;
+			AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, flags,
+								  -1, 0);
+		}
+#endif
+
 		if (AnonymousShmem == MAP_FAILED)
 			ereport(FATAL,
 					(errmsg("could not map anonymous shared memory: %m"),

diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index dfc6704..8faafb4 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -381,6 +381,26 @@ static const struct config_enum_entry synchronous_commit_options[] = {
 };
 
 /*
+ * huge_tlb_pages may be on|off|try, where try is the default
+ * on: try to mmap() with MAP_HUGETLB and fail when mmap() fails
+ * off: do not try tp mmap() with MAP_HUGETLB
+ * try: try to mmap() with MAP_HUGETLB and fallback to mmap()
+ *      w/o MAP_HUGETLB
+ */
+static const struct config_enum_entry huge_tlb_options[] = {
+	{"off", HUGE_TLB_OFF, false},
+	{"on", HUGE_TLB_ON, false},
+	{"try", HUGE_TLB_TRY, false},
+	{"true", HUGE_TLB_ON, true},
+	{"false", HUGE_TLB_OFF, true},
+	{"yes", HUGE_TLB_ON, true},
+	{"no", HUGE_TLB_OFF, true},
+	{"1", HUGE_TLB_ON, true},
+	{"0", HUGE_TLB_OFF, true},
+	{NULL, 0, false}
+};
+
+/*
  * Options for enum values stored in other modules
  */
 extern const struct config_enum_entry wal_level_options[];
@@ -440,6 +460,8 @@ int			tcp_keepalives_idle;
 int			tcp_keepalives_interval;
 int			tcp_keepalives_count;
 
+int huge_tlb_pages = HUGE_TLB_TRY;
+
 /*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
@@ -3377,6 +3399,18 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"huge_tlb_pages",
+			PGC_POSTMASTER,
+			RESOURCES_MEM,
+			gettext_noop("Enable/disable the use of huge TLB pages on Linux"),
+			NULL
+		},
+		&huge_tlb_pages,
+		HUGE_TLB_TRY,
+		huge_tlb_options,
+		NULL, NULL, NULL
+	},
 
 	/* End-of-list marker */
 	{

diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 3e981b3..8011e88 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -235,6 +235,24 @@ extern int	tcp_keepalives_idle;
 extern int	tcp_keepalives_interval;
 extern int	tcp_keepalives_count;
 
+
+/*
+ * Possible values for huge_tlb_pages; default is HUGE_TLB_TRY
+ */
+typedef enum
+{
+	HUGE_TLB_OFF,
+	HUGE_TLB_ON,
+	HUGE_TLB_TRY
+} HugeTlbType;
+
+
+/*
+ * configure the use of huge TLB pages
+ */
+extern int huge_tlb_pages;
+
+
 /*
  * Functions exported by guc.c
  */

diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 0250e39..051b6cf 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -532,6 +532,9 @@
 /* Define to 1 if you have the <sys/ipc.h> header file. */
 #undef HAVE_SYS_IPC_H
 
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#undef HAVE_SYS_MMAN_H
+
 /* Define to 1 if you have the <sys/poll.h> header file. */
 #undef HAVE_SYS_POLL_H
 
diff --git a/configure b/configure
index c20afde..67bc57f 100755
--- a/configure
+++ b/configure
@@ -10524,7 +10524,7 @@ done
 
 
 
-for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do
 as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then

diff --git a/configure.in b/configure.in
index d2bab32..b755202 100644
--- a/configure.in
+++ b/configure.in
@@ -982,7 +982,7 @@ AC_SUBST(OSSP_UUID_LIBS)
 ##
 
 dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
 
 # On BSD, test for net/if.h will fail unless sys/socket.h
 # is included first.
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to