At 2013-10-30 11:04:36 -0400, t...@sss.pgh.pa.us wrote: > > > As a compromise, perhaps we can unconditionally round the size up to be > > a multiple of 2MB? […] > > That sounds reasonably painless to me.
Here's a patch that does that, and also adds a DEBUG1 log message when the mmap() attempt with MAP_HUGETLB fails and we fall back to an ordinary mmap(). -- Abhijit
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 707edf1..7d9d0a8 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -113,6 +113,7 @@ #shared_buffers = 32MB # min 128kB # (change requires restart) +#huge_tlb_pages = try # try to map memory with MAP_HUGETLB (on, off, try) #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 77a9303..e4ded7a 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1052,6 +1052,49 @@ include 'filename' </listitem> </varlistentry> + <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages"> + <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term> + <indexterm> + <primary><varname>huge_tlb_pages</> configuration parameter</primary> + </indexterm> + <listitem> + <para> + Enables/disables the use of huge TLB pages. Valid values are + <literal>try</literal> (the default), <literal>on</literal>, + and <literal>off</literal>. + </para> + + <para> + At present, this feature is supported only on Linux. The setting + is ignored on other systems. + </para> + + <para> + The use of huge TLB pages results in smaller page tables and + less CPU time spent on memory management. For more details, see + <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt">hugetlbpage.txt + </ulink> in the Linux kernel documentation. + </para> + + <para> + With <varname>huge_tlb_pages</varname> set to <literal>try</literal>, + the server will try to use huge pages, but fall back to using + normal allocation if the first attempt fails. + </para> + + <para> + With <varname>huge_tlb_pages</varname> set to <literal>on</literal>, + the server will try to use huge pages, and treat failure as a + FATAL error. 
+ </para> + + <para> + With <varname>huge_tlb_pages</varname> set to <literal>off</literal>, + the server will not try to use huge pages. + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers"> <term><varname>temp_buffers</varname> (<type>integer</type>)</term> <indexterm> diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index b604407..fc0d74b 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -32,6 +32,7 @@ #include "portability/mem.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" +#include "utils/guc.h" typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */ @@ -367,14 +368,31 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) */ #ifndef EXEC_BACKEND { - long pagesize = sysconf(_SC_PAGE_SIZE); + int flags = PG_MMAP_FLAGS; + long pagesize = 2*1024*1024; + +#ifdef MAP_HUGETLB + if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY) + { + flags |= MAP_HUGETLB; + } +#endif /* * Ensure request size is a multiple of pagesize. * - * pagesize will, for practical purposes, always be a power of two. - * But just in case it isn't, we do it this way instead of using - * TYPEALIGN(). + * By doing this ourselves, we maximise the chances of being + * able to use huge TLB pages even on kernels that do not round + * up the request size correctly, for example due to this bug: + * https://bugzilla.kernel.org/show_bug.cgi?id=56881 + * + * The default value of 2MB for pagesize is chosen based on the + * most common supported huge page size. Rounding up to a larger + * value (e.g. 16MB) would use even larger pages if the hardware + * supported them, but would potentially waste more space. + * + * We round up by hand instead of using TYPEALIGN(), but for all + * practical purposes, pagesize will always be a power of two. 
*/ if (pagesize > 0 && size % pagesize != 0) size += pagesize - (size % pagesize); @@ -386,8 +404,21 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) * out to be false, we might need to add a run-time test here and do * this only if the running kernel supports it. */ - AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, - -1, 0); + + AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, -1, 0); + +#ifdef MAP_HUGETLB + if (huge_tlb_pages == HUGE_TLB_TRY && AnonymousShmem == MAP_FAILED) + { + elog(DEBUG1, "mmap(%lu) with MAP_HUGETLB failed with errno=%d; " + "trying without", (uint64)size, errno); + + flags &= ~MAP_HUGETLB; + AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, + -1, 0); + } +#endif + if (AnonymousShmem == MAP_FAILED) ereport(FATAL, (errmsg("could not map anonymous shared memory: %m"), diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dfc6704..8faafb4 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -381,6 +381,26 @@ static const struct config_enum_entry synchronous_commit_options[] = { }; /* + * huge_tlb_pages may be on|off|try, where try is the default + * on: try to mmap() with MAP_HUGETLB and fail when mmap() fails + * off: do not try to mmap() with MAP_HUGETLB + * try: try to mmap() with MAP_HUGETLB and fall back to mmap() + * w/o MAP_HUGETLB + */ +static const struct config_enum_entry huge_tlb_options[] = { + {"off", HUGE_TLB_OFF, false}, + {"on", HUGE_TLB_ON, false}, + {"try", HUGE_TLB_TRY, false}, + {"true", HUGE_TLB_ON, true}, + {"false", HUGE_TLB_OFF, true}, + {"yes", HUGE_TLB_ON, true}, + {"no", HUGE_TLB_OFF, true}, + {"1", HUGE_TLB_ON, true}, + {"0", HUGE_TLB_OFF, true}, + {NULL, 0, false} +}; + +/* * Options for enum values stored in other modules */ extern const struct config_enum_entry wal_level_options[]; @@ -440,6 +460,8 @@ int tcp_keepalives_idle; int tcp_keepalives_interval; int tcp_keepalives_count; +int huge_tlb_pages 
= HUGE_TLB_TRY; + /* * These variables are all dummies that don't do anything, except in some * cases provide the value for SHOW to display. The real state is elsewhere @@ -3377,6 +3399,18 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"huge_tlb_pages", + PGC_POSTMASTER, + RESOURCES_MEM, + gettext_noop("Enable/disable the use of huge TLB pages on Linux"), + NULL + }, + &huge_tlb_pages, + HUGE_TLB_TRY, + huge_tlb_options, + NULL, NULL, NULL + }, /* End-of-list marker */ { diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 3e981b3..8011e88 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -235,6 +235,24 @@ extern int tcp_keepalives_idle; extern int tcp_keepalives_interval; extern int tcp_keepalives_count; + +/* + * Possible values for huge_tlb_pages; default is HUGE_TLB_TRY + */ +typedef enum +{ + HUGE_TLB_OFF, + HUGE_TLB_ON, + HUGE_TLB_TRY +} HugeTlbType; + + +/* + * configure the use of huge TLB pages + */ +extern int huge_tlb_pages; + + /* * Functions exported by guc.c */ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 0250e39..051b6cf 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -532,6 +532,9 @@ /* Define to 1 if you have the <sys/ipc.h> header file. */ #undef HAVE_SYS_IPC_H +/* Define to 1 if you have the <sys/mman.h> header file. */ +#undef HAVE_SYS_MMAN_H + /* Define to 1 if you have the <sys/poll.h> header file. 
*/ #undef HAVE_SYS_POLL_H diff --git a/configure b/configure index c20afde..67bc57f 100755 --- a/configure +++ b/configure @@ -10524,7 +10524,7 @@ done -for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h +for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h do as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then diff --git a/configure.in b/configure.in index d2bab32..b755202 100644 --- a/configure.in +++ b/configure.in @@ -982,7 +982,7 @@ AC_SUBST(OSSP_UUID_LIBS) ## dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES -AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h]) +AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h]) # On BSD, test for net/if.h will fail unless sys/socket.h # is included first.
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers