The attached patch adds the MAP_HUGETLB flag to mmap() for shared memory on systems that support it. It's based on Christian Kruse's patch from last year, incorporating suggestions from Andres Freund.
On a system with 4GB shared_buffers, doing pgbench runs long enough for each backend to touch most of the buffers, this patch saves nearly 8MB of memory per backend and improves performance by just over 2% on average. It is still WIP as there are a couple of points that Andres has pointed out to me that haven't been addressed yet; also, the documentation is incomplete. Richard -- Richard Poole http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 23ebc11..703b28f 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1052,6 +1052,42 @@ include 'filename' </listitem> </varlistentry> + <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages"> + <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term> + <indexterm> + <primary><varname>huge_tlb_pages</> configuration parameter</primary> + </indexterm> + <listitem> + <para> + Enables/disables the use of huge tlb pages. Valid values are + <literal>on</literal>, <literal>off</literal> and <literal>try</literal>. + The default value is <literal>try</literal>. + </para> + + <para> + Use of huge tlb pages reduces the cpu time spent on memory management and + the amount of memory used for page tables and therefore improves performance. + </para> + + <para> + With <varname>huge_tlb_pages</varname> set to <literal>on</literal> + <symbol>mmap()</symbol> will be called with <symbol>MAP_HUGETLB</symbol>. + If the call fails the server will fail fatally. + </para> + + <para> + With <varname>huge_tlb_pages</varname> set to <literal>off</literal> we + will not use <symbol>MAP_HUGETLB</symbol> at all. + </para> + + <para> + With <varname>huge_tlb_pages</varname> set to <literal>try</literal> + we will try to use <symbol>MAP_HUGETLB</symbol> and fall back to + <symbol>mmap()</symbol> without <symbol>MAP_HUGETLB</symbol>. 
+ </para> + </listitem> + </varlistentry> + <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers"> <term><varname>temp_buffers</varname> (<type>integer</type>)</term> <indexterm> diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 20e3c32..57fff35 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -27,10 +27,14 @@ #ifdef HAVE_SYS_SHM_H #include <sys/shm.h> #endif +#ifdef MAP_HUGETLB +#include <dirent.h> +#endif #include "miscadmin.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" +#include "utils/guc.h" typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */ @@ -61,6 +65,13 @@ typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */ #define MAP_FAILED ((void *) -1) #endif +#ifdef MAP_HUGETLB +#define PG_HUGETLB_BASE_ADDR (void *)(0x0UL) +#define PG_MAP_HUGETLB MAP_HUGETLB +#else +#define PG_MAP_HUGETLB 0 +#endif + unsigned long UsedShmemSegID = 0; void *UsedShmemSegAddr = NULL; @@ -342,6 +353,161 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) } +#ifdef MAP_HUGETLB +#define HUGE_PAGE_INFO_DIR "/sys/kernel/mm/hugepages" + +/* + * static long InternalGetFreeHugepagesCount(const char *name) + * + * Attempt to read the number of available hugepages from + * /sys/kernel/mm/hugepages/hugepages-<size>/free_hugepages + * Will fail (return -1) if file could not be opened, 0 if no pages are available + * and > 0 if there are free pages + * + */ +static long +InternalGetFreeHugepagesCount(const char *name) +{ + int fd; + char buff[1024]; + size_t len; + long result; + char *ptr; + + len = snprintf(buff, 1024, "%s/%s/free_hugepages", HUGE_PAGE_INFO_DIR, name); + if (len == 1024) /* I don't think that this will happen ever */ + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? 
DEBUG1 : WARNING, + (errmsg("Filename %s/%s/free_hugepages is too long", HUGE_PAGE_INFO_DIR, name), + errcontext("while checking hugepage size"))); + return -1; + } + + fd = open(buff, O_RDONLY); + if (fd <= 0) + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("Could not open file %s: %s", buff, strerror(errno)), + errcontext("while checking hugepage size"))); + return -1; + } + + len = read(fd, buff, 1024); + if (len <= 0) + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("Error reading from file %s: %s", buff, strerror(errno)), + errcontext("while checking hugepage size"))); + close(fd); + return -1; + } + + /* + * If the content of free_hugepages is longer than or equal to 1024 bytes + * the rest is irrelevant; we simply want to know if there are any + * hugepages left + */ + if (len == 1024) + { + buff[1023] = 0; + } + else + { + buff[len] = 0; + } + + close(fd); + + result = strtol(buff, &ptr, 10); + + if (ptr == NULL) + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("Could not convert contents of file %s/%s/free_hugepages to number", HUGE_PAGE_INFO_DIR, name), + errcontext("while checking hugepage size"))); + return -1; + } + + return result; +} + +/* + * static long InternalGetHugepageSize() + * + * Attempt to get a valid hugepage size from /sys/kernel/mm/hugepages/ by + * reading directory contents + * Will fail (return -1) if the directory could not be opened or no valid + * page sizes are available. Will return the smallest hugepage size on + * success. + * + */ +static long +InternalGetHugepageSize() +{ + struct dirent *ent; + DIR *dir = opendir(HUGE_PAGE_INFO_DIR); + long smallest_size = -1, size; + bool valid_size_found = false; + char *ptr; + + if (dir == NULL) + { + ereport(huge_tlb_pages == HUGE_TLB_TRY ? 
DEBUG1 : WARNING, + (errmsg("Could not open directory %s: %s", HUGE_PAGE_INFO_DIR, strerror(errno)), + errcontext("while checking hugepage size"))); + return -1; + } + + /* + * Linux supports multiple hugepage sizes if the hardware + * supports it; for each possible size there will be a + * directory in /sys/kernel/mm/hugepages consisting of the + * string hugepages- and the size of the page, e.g. on x86_64: + * hugepages-2048kB + */ + while((ent = readdir(dir)) != NULL) + { + if (strncmp(ent->d_name, "hugepages-", 10) == 0) + { + size = strtol(ent->d_name + 10, &ptr, 10); + if (ptr == NULL) + { + continue; + } + + if (strcmp(ptr, "kB") == 0) + { + size *= 1024; + } + + if ((smallest_size == -1 || size < smallest_size)) { + valid_size_found = true; + if(InternalGetFreeHugepagesCount(ent->d_name) > 0) + smallest_size = size; + } + } + } + + closedir(dir); + + if (smallest_size == -1) + { + if(valid_size_found) + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("No free hugepages"), + errhint("There were no free huge pages of any size"))); + else + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, + (errmsg("Could not find a valid hugepage size"), + errhint("This error usually means that either CONFIG_HUGETLB_PAGE " + "is not in kernel or that your architecture does not " + "support hugepages or you did not configure hugepages"))); + } + + return smallest_size; +} +#endif + /* * PGSharedMemoryCreate * @@ -391,7 +557,17 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) */ #ifndef EXEC_BACKEND { +#ifdef MAP_HUGETLB + long pagesize = 0; + + if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY) + pagesize = InternalGetHugepageSize(); + + if (pagesize <= 0) + pagesize = sysconf(_SC_PAGE_SIZE); +#else long pagesize = sysconf(_SC_PAGE_SIZE); +#endif /* * Ensure request size is a multiple of pagesize. 
@@ -410,8 +586,22 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) * out to be false, we might need to add a run-time test here and do * this only if the running kernel supports it. */ - AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, - -1, 0); + + if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY) + { + AnonymousShmem = mmap(PG_HUGETLB_BASE_ADDR, size, PROT_READ|PROT_WRITE, + PG_MMAP_FLAGS|PG_MAP_HUGETLB, -1, 0); + + elog(DEBUG3, "mmap() tried with MAP_HUGEPAGE: %p", AnonymousShmem); + } + + if ((AnonymousShmem == MAP_FAILED && huge_tlb_pages == HUGE_TLB_TRY) + || huge_tlb_pages == HUGE_TLB_OFF) + { + AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, PG_MMAP_FLAGS, + -1, 0); + } + if (AnonymousShmem == MAP_FAILED) ereport(FATAL, (errmsg("could not map anonymous shared memory: %m"), diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 7d297bc..3b26caa 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -22,6 +22,7 @@ #include <limits.h> #include <unistd.h> #include <sys/stat.h> +#include <sys/mman.h> #ifdef HAVE_SYSLOG #include <syslog.h> #endif @@ -381,6 +382,22 @@ static const struct config_enum_entry synchronous_commit_options[] = { }; /* + * huge_tlb_pages may be on|off|try, where try is the default + * on: try to mmap() with MAP_HUGETLB and fail when mmap() fails + * off: do not try tp mmap() with MAP_HUGETLB + * try: try to mmap() with MAP_HUGETLB and fallback to mmap() + * w/o MAP_HUGETLB + */ +static const struct config_enum_entry huge_tlb_options[] = { +#ifdef MAP_HUGETLB + {"on", HUGE_TLB_ON, false}, + {"try", HUGE_TLB_TRY, false}, +#endif + {"off", HUGE_TLB_OFF, false}, + {NULL, 0, false} +}; + +/* * Options for enum values stored in other modules */ extern const struct config_enum_entry wal_level_options[]; @@ -439,6 +456,12 @@ int tcp_keepalives_idle; int tcp_keepalives_interval; int tcp_keepalives_count; +#ifdef MAP_HUGETLB +int 
huge_tlb_pages = HUGE_TLB_TRY; +#else +int huge_tlb_pages = HUGE_TLB_OFF; +#endif + /* * These variables are all dummies that don't do anything, except in some * cases provide the value for SHOW to display. The real state is elsewhere @@ -3354,6 +3377,26 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"huge_tlb_pages", +#ifdef MAP_HUGETLB + PGC_SUSET, +#else + PGC_INTERNAL, +#endif + RESOURCES_MEM, + gettext_noop("Enable/disable the use of the hugepages feature"), + NULL + }, + &huge_tlb_pages, +#ifdef MAP_HUGETLB + HUGE_TLB_TRY, +#else + HUGE_TLB_OFF, +#endif + huge_tlb_options, + NULL, NULL, NULL + }, /* End-of-list marker */ { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index d69a02b..7c826d5 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -113,6 +113,7 @@ #shared_buffers = 32MB # min 128kB # (change requires restart) +#huge_tlb_pages = try # try to map memory with MAP_HUGETLB (on, off, try) #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 99211c1..c2fdba4 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -235,6 +235,24 @@ extern int tcp_keepalives_idle; extern int tcp_keepalives_interval; extern int tcp_keepalives_count; + +/* + * Possible values for huge_tlb_pages; default is HUGE_TLB_TRY + */ +typedef enum +{ + HUGE_TLB_OFF, + HUGE_TLB_ON, + HUGE_TLB_TRY +} HugeTlbType; + + +/* + * configure the use of huge TLB pages + */ +extern int huge_tlb_pages; + + /* * Functions exported by guc.c */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers