Hi. This is a slightly reworked version of the patch submitted by Richard Poole last month, which was based on Christian Kruse's earlier patch.
Apart from doing various minor cleanups and documentation fixes, I also tested this patch against HEAD on a machine with 256GB of RAM. Here's an overview of the results. I set nr_hugepages to 32768 (== 64GB), which (took a very long time and) allowed me to set shared_buffers to 60GB. I then ran pgbench -s 1000 -i, and did some runs of "pgbench -c 100 -j 10 -t 1000" with huge_tlb_pages set to off and on respectively. With huge_tlb_pages=off, this is the best result I got: tps = 8680.771068 (including connections establishing) tps = 8721.504838 (excluding connections establishing) With huge_tlb_pages=on, this is the best result I got: tps = 9932.245203 (including connections establishing) tps = 9983.190304 (excluding connections establishing) (Even the worst result I got in the latter case was a smidgen faster than the best with huge_tlb_pages=off: 8796.344078 vs. 8721.504838.) From /proc/$pid/status, VmPTE was 2880 kB with huge_tlb_pages=off, and 56 kB with it turned on. One open question is what to do about rounding up the allocation size to a multiple of the huge page size. The rounding should not be necessary, were it not for the fairly recent kernel bug described at the link in the comment (https://bugzilla.kernel.org/show_bug.cgi?id=56881). I tried it without the rounding-up, and mmap fails with EINVAL on Ubuntu's 3.5.0-28 kernel. Any thoughts? -- Abhijit
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 77a9303..e4ded7a 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1052,6 +1052,49 @@ include 'filename' </listitem> </varlistentry> + <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages"> + <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term> + <indexterm> + <primary><varname>huge_tlb_pages</> configuration parameter</primary> + </indexterm> + <listitem> + <para> + Enables/disables the use of huge TLB pages. Valid values are + <literal>try</literal> (the default), <literal>on</literal>, + and <literal>off</literal>. + </para> + + <para> + At present, this feature is supported only on Linux. The setting + is ignored on other systems. + </para> + + <para> + The use of huge TLB pages results in smaller page tables and + less CPU time spent on memory management. For more details, see + <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt">hugetlbpage.txt + </ulink> in the Linux kernel documentation. + </para> + + <para> + With <varname>huge_tlb_pages</varname> set to <literal>try</literal>, + the server will try to use huge pages, but fall back to using + normal allocation if the first attempt fails. + </para> + + <para> + With <varname>huge_tlb_pages</varname> set to <literal>on</literal>, + the server will try to use huge pages, and treat failure as a + FATAL error. + </para> + + <para> + With <varname>huge_tlb_pages</varname> set to <literal>off</literal>, + the server will not try to use huge pages. 
+ </para> + </listitem> + </varlistentry> + <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers"> <term><varname>temp_buffers</varname> (<type>integer</type>)</term> <indexterm> diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 707edf1..7d9d0a8 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -113,6 +113,7 @@ #shared_buffers = 32MB # min 128kB # (change requires restart) +#huge_tlb_pages = try # try to map memory with MAP_HUGETLB (on, off, try) #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index b604407..34937b2 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -27,11 +27,15 @@ #ifdef HAVE_SYS_SHM_H #include <sys/shm.h> #endif +#ifdef MAP_HUGETLB +#include <dirent.h> +#endif #include "miscadmin.h" #include "portability/mem.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" +#include "utils/guc.h" typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */ @@ -318,6 +322,151 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) } +#ifdef MAP_HUGETLB +#define HUGE_PAGE_INFO_DIR "/sys/kernel/mm/hugepages" + +/* + * long InternalGetFreeHugepagesCount(const char *name) + * + * Returns the number of free hugepages of a given size, as reported by + * /sys/kernel/mm/hugepages/<name>/free_hugepages. Will fail (return -1) + * if the file could not be opened or 0 if no free pages are available. + */ +static long +InternalGetFreeHugepagesCount(const char *name) +{ + int fd; + char buff[1024]; + size_t len; + long result; + char *ptr; + + len = snprintf(buff, 1024, "%s/%s/free_hugepages", HUGE_PAGE_INFO_DIR, name); + if (len == 1024) + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? 
DEBUG1 : WARNING, + (errmsg("Filename %s/%s/free_hugepages is too long", HUGE_PAGE_INFO_DIR, name), + errcontext("while checking hugepage size"))); + return -1; + } + + fd = open(buff, O_RDONLY); + if (fd <= 0) + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("Could not open file %s: %s", buff, strerror(errno)), + errcontext("while checking hugepage size"))); + return -1; + } + + len = read(fd, buff, 1024); + if (len <= 0) + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("Error reading from file %s: %s", buff, strerror(errno)), + errcontext("while checking hugepage size"))); + close(fd); + return -1; + } + close(fd); + + /* + * free_hugepages should contain the number of free hugepages of a + * given size. If we somehow read 1024 bytes from it above (which + * should never happen), we check 1023 bytes and ignore the rest. + */ + if (len == 1024) + len = 1023; + + buff[len] = 0; + + result = strtol(buff, &ptr, 10); + + if (ptr == NULL) + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("Could not convert contents of file %s/%s/free_hugepages to number", HUGE_PAGE_INFO_DIR, name), + errcontext("while checking hugepage size"))); + return -1; + } + + return result; +} + +/* + * long InternalGetHugepageSize() + * + * Returns the smallest valid hugepage size by reading the contents of + * the /sys/kernel/mm/hugepages directory. Will fail (return -1) if the + * directory could not be opened or no valid page sizes are available. + */ +static long +InternalGetHugepageSize() +{ + struct dirent *ent; + DIR *dir = opendir(HUGE_PAGE_INFO_DIR); + long smallest_size = -1, size; + bool valid_size_found = false; + char *ptr; + + if (dir == NULL) + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? 
DEBUG1 : WARNING, + (errmsg("Could not open directory %s: %s", HUGE_PAGE_INFO_DIR, strerror(errno)), + errcontext("while checking hugepage size"))); + return -1; + } + + /* + * Linux supports multiple hugepage sizes if the hardware + * supports it; for each possible size there will be a + * directory in /sys/kernel/mm/hugepages consisting of the + * string hugepages- and the size of the page, e.g. on x86_64: + * hugepages-2048kB + */ + while((ent = readdir(dir)) != NULL) + { + if (strncmp(ent->d_name, "hugepages-", 10) == 0) + { + size = strtol(ent->d_name + 10, &ptr, 10); + if (ptr == NULL) + { + continue; + } + + if (strcmp(ptr, "kB") == 0) + { + size *= 1024; + } + + if ((smallest_size == -1 || size < smallest_size)) { + valid_size_found = true; + if(InternalGetFreeHugepagesCount(ent->d_name) > 0) + smallest_size = size; + } + } + } + + closedir(dir); + + if (smallest_size == -1) + { + if(valid_size_found) + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("No free hugepages"), + errhint("There were no free huge pages of any size"))); + else + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("Could not find a valid hugepage size"), + errhint("This error usually means that either CONFIG_HUGETLB_PAGE " + "is not in kernel or that your architecture does not " + "support hugepages or you did not configure hugepages"))); + } + + return smallest_size; +} +#endif + /* * PGSharedMemoryCreate * @@ -367,7 +516,19 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) */ #ifndef EXEC_BACKEND { - long pagesize = sysconf(_SC_PAGE_SIZE); + long pagesize = 0; + int flags = PG_MMAP_FLAGS; + +#ifdef MAP_HUGETLB + if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY) + { + flags |= MAP_HUGETLB; + pagesize = InternalGetHugepageSize(); + } +#endif + + if (pagesize <= 0) + pagesize = sysconf(_SC_PAGE_SIZE); /* * Ensure request size is a multiple of pagesize. 
@@ -375,6 +536,10 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) * pagesize will, for practical purposes, always be a power of two. * But just in case it isn't, we do it this way instead of using * TYPEALIGN(). + * + * The kernel should really remove the need to worry about this, + * but see https://bugzilla.kernel.org/show_bug.cgi?id=56881 for + * recent situations in which this did not work. */ if (pagesize > 0 && size % pagesize != 0) size += pagesize - (size % pagesize); @@ -386,8 +551,22 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) * out to be false, we might need to add a run-time test here and do * this only if the running kernel supports it. */ - AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, - -1, 0); + + AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, -1, 0); + +#ifdef MAP_HUGETLB + /* + * If huge_tlb_pages="try" and the allocation fails, we retry + * without the MAP_HUGETLB flag. + */ + + if (AnonymousShmem == MAP_FAILED && huge_tlb_pages == HUGE_TLB_TRY) + { + AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, + PG_MMAP_FLAGS, -1, 0); + } +#endif + if (AnonymousShmem == MAP_FAILED) ereport(FATAL, (errmsg("could not map anonymous shared memory: %m"), diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 3e981b3..8011e88 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -235,6 +235,24 @@ extern int tcp_keepalives_idle; extern int tcp_keepalives_interval; extern int tcp_keepalives_count; + +/* + * Possible values for huge_tlb_pages; default is HUGE_TLB_TRY + */ +typedef enum +{ + HUGE_TLB_OFF, + HUGE_TLB_ON, + HUGE_TLB_TRY +} HugeTlbType; + + +/* + * configure the use of huge TLB pages + */ +extern int huge_tlb_pages; + + /* * Functions exported by guc.c */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dfc6704..fe09396 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -22,6 +22,9 
@@ #include <limits.h> #include <unistd.h> #include <sys/stat.h> +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif #ifdef HAVE_SYSLOG #include <syslog.h> #endif @@ -381,6 +384,26 @@ static const struct config_enum_entry synchronous_commit_options[] = { }; /* + * huge_tlb_pages may be on|off|try, where try is the default + * on: try to mmap() with MAP_HUGETLB and fail when mmap() fails + * off: do not try tp mmap() with MAP_HUGETLB + * try: try to mmap() with MAP_HUGETLB and fallback to mmap() + * w/o MAP_HUGETLB + */ +static const struct config_enum_entry huge_tlb_options[] = { + {"off", HUGE_TLB_OFF, false}, + {"on", HUGE_TLB_ON, false}, + {"try", HUGE_TLB_TRY, false}, + {"true", HUGE_TLB_ON, true}, + {"false", HUGE_TLB_OFF, true}, + {"yes", HUGE_TLB_ON, true}, + {"no", HUGE_TLB_OFF, true}, + {"1", HUGE_TLB_ON, true}, + {"0", HUGE_TLB_OFF, true}, + {NULL, 0, false} +}; + +/* * Options for enum values stored in other modules */ extern const struct config_enum_entry wal_level_options[]; @@ -440,6 +463,8 @@ int tcp_keepalives_idle; int tcp_keepalives_interval; int tcp_keepalives_count; +int huge_tlb_pages = HUGE_TLB_TRY; + /* * These variables are all dummies that don't do anything, except in some * cases provide the value for SHOW to display. The real state is elsewhere @@ -3377,6 +3402,18 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"huge_tlb_pages", + PGC_POSTMASTER, + RESOURCES_MEM, + gettext_noop("Enable/disable the use of huge TLB pages on Linux"), + NULL + }, + &huge_tlb_pages, + HUGE_TLB_TRY, + huge_tlb_options, + NULL, NULL, NULL + }, /* End-of-list marker */ { diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 0250e39..051b6cf 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -532,6 +532,9 @@ /* Define to 1 if you have the <sys/ipc.h> header file. */ #undef HAVE_SYS_IPC_H +/* Define to 1 if you have the <sys/mman.h> header file. 
*/ +#undef HAVE_SYS_MMAN_H + /* Define to 1 if you have the <sys/poll.h> header file. */ #undef HAVE_SYS_POLL_H diff --git a/configure.in b/configure.in index d2bab32..b755202 100644 --- a/configure.in +++ b/configure.in @@ -982,7 +982,7 @@ AC_SUBST(OSSP_UUID_LIBS) ## dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES -AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h]) +AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h]) # On BSD, test for net/if.h will fail unless sys/socket.h # is included first. diff --git a/configure b/configure index c20afde..67bc57f 100755 --- a/configure +++ b/configure @@ -10524,7 +10524,7 @@ done -for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h +for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h do as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers