[Libhugetlbfs-devel] [PATCH] elflink: Smarter copysize detection v2

2006-08-23 Thread Steve Fox
On Wed, 2006-08-02 at 09:45 -0500, Adam Litke wrote:
> elflink: Be more intelligent when selecting copysize
> 
> When remapping the bss into hugepages, we assume that the entire bss is
> zeroes.  While this is valid when the application first starts up, by the
> time we get control, the glibc library constructor has already run.  One of
> the things this constructor does is initialize a few pointers in the bss (to
> stdin, stdout, and stderr).
> 
> This patch introduces logic to ensure that the bss up to these symbols is
> always copied.  It also adds a HUGETLB_DEBUG environment variable which is
> meant to be used for enabling expensive debugging and diagnostic operations.
> The first of these is a bss scanner which checks if we missed any non-zero
> data in our copy.

Here's an update of Adam's patch.



elflink: Detect uncopied global object symbols in the .bss and copy them
into hugepages.

This should prevent users from needing to use the
HUGETLB_MINIMAL_COPY=no behavior when applications crash upon startup.
This patch is based on Adam Litke's copysize patch and was developed in
collaboration with him.

Signed-off-by: Steve Fox <[EMAIL PROTECTED]>

diff -ruN libhugetlbfs/elflink.c libhugetlbfs.working/elflink.c
--- libhugetlbfs/elflink.c  2006-08-23 16:45:42.0 -0500
+++ libhugetlbfs.working/elflink.c  2006-08-23 16:45:11.0 -0500
@@ -38,9 +38,17 @@
 #ifdef __LP64__
 #define Elf_Ehdr   Elf64_Ehdr
 #define Elf_Phdr   Elf64_Phdr
+#define Elf_DynElf64_Dyn
+#define Elf_SymElf64_Sym
+#define ELF_ST_BIND(x)  ELF64_ST_BIND(x)
+#define ELF_ST_TYPE(x)  ELF64_ST_TYPE(x)
 #else
 #define Elf_Ehdr   Elf32_Ehdr
 #define Elf_Phdr   Elf32_Phdr
+#define Elf_DynElf32_Dyn
+#define Elf_SymElf32_Sym
+#define ELF_ST_BIND(x)  ELF64_ST_BIND(x)
+#define ELF_ST_TYPE(x)  ELF64_ST_TYPE(x)
 #endif
 
 #ifdef __syscall_return
@@ -174,6 +182,8 @@
 static struct seg_info htlb_seg_table[MAX_HTLB_SEGS];
 static int htlb_num_segs;
 static int minimal_copy = 1;
+int __debug = 0;
+static Elf_Ehdr *ehdr;
 
 static void parse_phdrs(Elf_Ehdr *ehdr)
 {
@@ -220,11 +230,148 @@
}
 }
 
+static void check_bss(unsigned long *start, unsigned long *end)
+{
+   unsigned long *addr;
+
+   for (addr = start; addr < end; addr++) {
+   if (*addr != 0)
+   WARNING("Non-zero BSS data @ %p: %lx\n", addr, *addr);
+   }
+}
+
+/* Subtle:  Since libhugetlbfs depends on glibc, we allow
+ * it to be loaded before us.  As part of its init functions, it
+ * initializes stdin, stdout, and stderr in the bss.  We need to
+ * include these initialized variables in our copy.
+ */
+
+static void get_extracopy(struct seg_info *seg, void *p, 
+void **extra_start, void 
**extra_end)
+{
+   Elf_Dyn *dyntab;/* dynamic segment table */
+   Elf_Phdr *phdr; /* program header table */
+   Elf_Sym *symtab = NULL; /* dynamic symbol table */
+   Elf_Sym *sym;   /* a symbol */
+   char *strtab = NULL;/* string table for dynamic symbols */
+   int i, found_sym = 0;
+   int numsyms;/* number of symbols in dynamic symbol table */
+   void *start, *end, *start_orig, *end_orig;
+   void *sym_start, *sym_end;
+
+   end_orig = seg->vaddr + seg->memsz;
+   start_orig = seg->vaddr + seg->filesz;
+   if (seg->filesz == seg->memsz)
+   goto bail;
+   if (!minimal_copy)
+   goto bail;
+
+   /* Find dynamic section */
+   i = 1;
+   phdr = (Elf_Phdr *)((char *)ehdr + ehdr->e_phoff);
+   while ((phdr[i].p_type != PT_DYNAMIC) && (i < ehdr->e_phnum)) {
+   ++i;
+   }
+   if (phdr[i].p_type == PT_DYNAMIC) {
+   dyntab = (Elf_Dyn *)phdr[i].p_vaddr;
+   } else {
+   DEBUG("No dynamic segment found\n");
+   goto bail;
+   }
+
+   /* Find symbol and string tables */
+   i = 1;
+   while ((dyntab[i].d_tag != DT_NULL)) {
+   if (dyntab[i].d_tag == DT_SYMTAB)
+   symtab = (Elf_Sym *)dyntab[i].d_un.d_ptr;
+   else if (dyntab[i].d_tag == DT_STRTAB)
+   strtab = (char *)dyntab[i].d_un.d_ptr;
+   i++;
+   }
+   
+   if (!symtab) {
+   DEBUG("No symbol table found\n");
+   goto bail;
+   }
+   if (!strtab) {
+   DEBUG("No string table found\n");
+   goto bail;
+   }
+
+   /* WARNING - The symbol table size calculation does not follow the ELF
+*   standard, but rather exploits an assumption we enforce in
+*   our linker scripts that the string table follows
+*   immediately after the symbol table. The linker scripts

Re: [Libhugetlbfs-devel] [RFC][PATCH] morecore: fix-up NUMA allocations

2006-08-23 Thread Nishanth Aravamudan
On 22.08.2006 [19:08:40 -0700], Nishanth Aravamudan wrote:
> Hi,
> 
> Here is my attempt at reinstating the mlocking guarantee for morecore.
> The issue previously was that we would fault in hugepages on the current
> node only, leading to terrible NUMA performance. Instead, we now check
> the current mempolicy and if it's DEFAULT (which, according to the mbind man
> page, means "Unless the process policy has been changed this means to
> allocate memory on the node of the CPU that triggered the allocation.")
> we change it to INTERLEAVE. I think we want to respect the policy if
> it's BIND or PREFERRED, although maybe only the latter is really
> important.
> 
> The NUMA API man-pages are really bad, so I'll probably spend some time
> now creating patches for them, based upon my reading of the
> corresponding kernel code.
> 
> Unfortunately, this would introduce a dependency on libnuma, as
> otherwise the get_mempolicy() and mbind() calls have no definition :( So
> I'm emulating them with indirect syscalls.
> 
> I'm going to go and test this now on a non-NUMA machine until I can find
> access to a larger NUMA machine where this might make a difference, but
> wanted to get the patch out there, because I'm not entirely sure I know
> what I'm doing :)
> 
> This is only an RFC right now; I am not requesting inclusion, so it is not
> Signed-off.

Second try, compile-tested and run-tested on a non-NUMA machine (passes
make func). Will hopefully have time to test on a NUMA box tomorrow.

Still not Signed-off.

I was really trying to avoid using libnuma, but I ended up just stealing
code from it. I've trimmed it down to the minimal amount, but am open
to further (or better) suggestions.

Thanks,
Nish

diff --git a/morecore.c b/morecore.c
index 9f13316..01ac8ae 100644
--- a/morecore.c
+++ b/morecore.c
@@ -26,6 +26,11 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
 #include "hugetlbfs.h"
 
@@ -49,10 +54,94 @@ static long mapsize;
  * go back to small pages and use mmap to get them.  Hurrah.
  */
 
+#if defined(__x86_64__) || defined(__i386__)
+#define NUMA_NUM_NODES  128
+#else
+#define NUMA_NUM_NODES  2048
+#endif
+
+#ifndef BITS_PER_LONG
+#define BITS_PER_LONG  (8*sizeof(unsigned long))
+#endif
+
+static int respect_policy;
+/* adapted from libnuma source */
+static int numa_is_available;
+static unsigned long nodemask[NUMA_NUM_NODES/BITS_PER_LONG];
+
+static int numa_max_node(void)
+{
+   DIR *d;
+   struct dirent *de;
+   int found, maxnode = 0;
+
+   d = opendir("/sys/devices/system/node");
+   if (!d)
+   return 0;
+   found = 0;
+   while ((de = readdir(d)) != NULL) {
+   int nd;
+   if (strncmp(de->d_name, "node", 4))
+   continue;
+   found++;
+   nd = strtoul(de->d_name+4, NULL, 0);
+   if (maxnode < nd)
+   maxnode = nd;
+   }
+   closedir(d);
+   if (found == 0)
+   return 0;
+   return maxnode;
+}
+
+static void setup_numa_if_available(void)
+{
+   int i, maxnode;
+
+   if (syscall(__NR_get_mempolicy, NULL, NULL, 0, 0, 0) < 0
+   && errno == ENOSYS) {
+   numa_is_available = 0;
+   return;
+   }
+
+   numa_is_available = 1;
+
+   maxnode = numa_max_node();
+   for (i = 0; i <= maxnode; i++)
+   nodemask[i / BITS_PER_LONG] |= (1UL<<(i%BITS_PER_LONG));
+}
+
+static int guarantee_memory(void *p, long size)
+{
+   int ret;
+
+   /*
+* Override the NUMA policy unless told not to by the environment
+*
+* Default to interleaving at fault-time to avoid having all the
+* hugepages being allocated on the current node.
+*/
+   if (numa_is_available && (respect_policy == 0))
+   if (syscall(__NR_mbind, p, size, MPOL_INTERLEAVE, nodemask,
+   NUMA_NUM_NODES+1, 0) < 0)
+   WARNING("mbind() failed: %s\n", strerror(errno));
+
+   ret = mlock(p, size);
+   if (ret < 0) {
+   WARNING("mlock() failed: %s\n",
+   strerror(errno));
+   return ret;
+   }
+   munlock(p, size);
+
+   return 0;
+}
+
 static void *hugetlbfs_morecore(ptrdiff_t increment)
 {
void *p;
long newsize = 0;
+   int ret;
 
DEBUG("hugetlbfs_morecore(%ld) = ...\n", (long)increment);
 
@@ -86,20 +175,14 @@ static void *hugetlbfs_morecore(ptrdiff_
return NULL;
}
 
-#if 0
-/* Use of mlock is disabled because it results in bad numa behavior since
- * the malloc'd memory is allocated node-local to the cpu calling morecore()
- * and not to the cpu(s) that are actually using the memory.
- */
-   /* Use mlock to guarantee these pages to the process */
-   ret = mlock(p, n