On 22.08.2006 [19:08:40 -0700], Nishanth Aravamudan wrote:
> Hi,
>
> Here is my attempt at reinstating the mlocking guarantee for morecore.
> The issue previously was that we would fault in hugepages on the current
> node only, leading to terrible NUMA performance. Instead, we now check
> the current mempolicy and if it's DEFAULT (which, according to the mbind
> man page, means "Unless the process policy has been changed this means to
> allocate memory on the node of the CPU that triggered the allocation.")
> we change it to INTERLEAVE. I think we want to respect the policy if
> it's BIND or PREFERRED, although maybe only the latter is really
> important.
>
> The NUMA API man-pages are really bad, so I'll probably spend some time
> now creating patches for them, based upon my reading of the
> corresponding kernel code.
>
> Unfortunately, this would introduce a dependency on libnuma, as
> otherwise the get_mempolicy() and mbind() calls have no definition :( So
> I'm emulating them with indirect syscalls.
>
> I'm going to go and test this now on a non-NUMA machine until I can find
> access to a larger NUMA machine where this might make a difference, but
> wanted to get the patch out there, because I'm not entirely sure I know
> what I'm doing :)
>
> This is purely an RFC right now; I'm not requesting inclusion, so it is
> not Signed-off.
Second try, compile-tested and run-tested on a non-NUMA machine (passes
make func). Will hopefully have time to test on a NUMA box tomorrow.
Still not Signed-off.
I was really trying to avoid using libnuma, but I ended up just stealing
code from it. I've trimmed it down to the minimal amount, but am open
for further (or better) suggestions.
Thanks,
Nish
diff --git a/morecore.c b/morecore.c
index 9f13316..01ac8ae 100644
--- a/morecore.c
+++ b/morecore.c
@@ -26,6 +26,11 @@
#include
#include
#include
+#include
+#include
+#include
+#include
+#include
#include "hugetlbfs.h"
@@ -49,10 +54,94 @@ static long mapsize;
* go back to small pages and use mmap to get them. Hurrah.
*/
+#if defined(__x86_64__) || defined(__i386__)
+#define NUMA_NUM_NODES 128	/* number of bits held by nodemask below */
+#else
+#define NUMA_NUM_NODES 2048
+#endif
+
+#ifndef BITS_PER_LONG
+#define BITS_PER_LONG (8*sizeof(unsigned long))
+#endif
+
+static int respect_policy;	/* when 0, guarantee_memory() mbind()s with MPOL_INTERLEAVE */
+/* adapted from libnuma source */
+static int numa_is_available;	/* set by setup_numa_if_available() */
+static unsigned long nodemask[NUMA_NUM_NODES/BITS_PER_LONG];	/* bit i set: node i is in the mask */
+
+/*
+ * Return the highest N among the "nodeN" entries found in
+ * /sys/devices/system/node, or 0 when the directory cannot be opened
+ * or contains no such entry.
+ */
+static int numa_max_node(void)
+{
+	struct dirent *entry;
+	int highest = 0;
+	DIR *dir = opendir("/sys/devices/system/node");
+
+	if (dir == NULL)
+		return 0;
+	for (entry = readdir(dir); entry != NULL; entry = readdir(dir)) {
+		int id;
+		if (strncmp(entry->d_name, "node", 4) != 0)
+			continue;
+		/* base 0: the numeric suffix may also parse as octal or hex */
+		id = strtoul(entry->d_name + 4, NULL, 0);
+		highest = (highest < id) ? id : highest;
+	}
+	closedir(dir);
+	return highest;
+}
+
+/* Set numa_is_available (get_mempolicy() not failing with ENOSYS) and, if
+ * so, set bits 0..numa_max_node() in nodemask, clamped to its capacity. */
+static void setup_numa_if_available(void)
+{
+	int i, maxnode;
+
+	numa_is_available = !(syscall(__NR_get_mempolicy, NULL, NULL, 0, 0, 0) < 0
+				&& errno == ENOSYS);
+	if (!numa_is_available)
+		return;
+	maxnode = numa_max_node();
+	if (maxnode >= NUMA_NUM_NODES)	/* never index past the end of nodemask */
+		maxnode = NUMA_NUM_NODES - 1;
+	for (i = 0; i <= maxnode; i++)
+		nodemask[i / BITS_PER_LONG] |= (1UL<<(i%BITS_PER_LONG));
+}
+
+/*
+ * Guarantee [p, p+size) to the process by mlock()ing and then
+ * munlock()ing it.  Unless respect_policy is set, first request
+ * MPOL_INTERLEAVE over nodemask so the hugepages are not all allocated
+ * on the current node; an mbind() failure only produces a warning.
+ * Returns 0 on success, or mlock()'s negative return value.
+ */
+static int guarantee_memory(void *p, long size)
+{
+	int interleave = numa_is_available && (respect_policy == 0);
+	int rc;
+
+	if (interleave && syscall(__NR_mbind, p, size, MPOL_INTERLEAVE, nodemask,
+				NUMA_NUM_NODES+1, 0) < 0)
+		WARNING("mbind() failed: %s\n", strerror(errno));
+
+	rc = mlock(p, size);
+	if (rc < 0) {
+		WARNING("mlock() failed: %s\n", strerror(errno));
+		return rc;
+	}
+
+	munlock(p, size);
+	return 0;
+}
+
static void *hugetlbfs_morecore(ptrdiff_t increment)
{
void *p;
long newsize = 0;
+ int ret;
DEBUG("hugetlbfs_morecore(%ld) = ...\n", (long)increment);
@@ -86,20 +175,14 @@ static void *hugetlbfs_morecore(ptrdiff_
return NULL;
}
-#if 0
-/* Use of mlock is disabled because it results in bad numa behavior since
- * the malloc'd memory is allocated node-local to the cpu calling morecore()
- * and not to the cpu(s) that are actually using the memory.
- */
- /* Use mlock to guarantee these pages to the process */
- ret = mlock(p, n