Re: [RFC] libibverbs: ibv_fork_init() and libhugetlbfs

2010-05-07 Thread Alexander Schmidt
On Thu, 06 May 2010 13:55:31 -0700
Roland Dreier rdre...@cisco.com wrote:
 I think that we cannot assume huge pages only come from libhugetlbfs --
 we should support an application directly enabling huge pages (possibly
 via another library too, so we can't assume that an application knows
 the page size for a memory range it is about to register).
 
 And also the 16 MB page size constant is of course not feasible -- with
 all due respect, the x86 page size of 2 MB is much more likely in
 practice :)  (Although perhaps the much slower PowerPC TLB refill makes
 users more likely to try and use hugetlb pages ;)
 
 Alex suggested parsing files in the same way as libhugetlbfs does to get
 the page size, and that seems to be the best solution, since I don't
 think the libhugetlbfs license is compatible with the BSD license for
 libibverbs.
 
 But your trick of using /proc/*/maps looks nice.  Does that only work
 for libhugetlbfs or can we recognize direct mmap of hugetlb pages?

Hi Roland, thanks for your comments!

I've reworked my patch:
 * added get_huge_page_size() to read the huge page size from
   /proc/meminfo. This is done at ibv_fork_init() time.
 * I noticed that some applications like ibv_rc_pingpong already
   get memory from libhugetlbfs when running ibv_fork_init(). So
   I changed the code for testing madvise() to allocate a huge page
   if the huge page size is set in the system.

I have not tested this code with different libraries providing huge
pages / mmaped pages yet, but I hope this can be added later on when
we have agreed on an approach to handle huge pages.

Signed-off-by: Alexander Schmidt al...@linux.vnet.ibm.com
---
 src/memory.c |  103 ++-
 1 file changed, 95 insertions(+), 8 deletions(-)

--- libibverbs-1.1.2.orig/src/memory.c
+++ libibverbs-1.1.2/src/memory.c
@@ -40,6 +40,8 @@
 #include unistd.h
 #include stdlib.h
 #include stdint.h
+#include stdio.h
+#include string.h
 
 #include ibverbs.h
 
@@ -68,12 +70,45 @@ struct ibv_mem_node {
 static struct ibv_mem_node *mm_root;
 static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
 static int page_size;
+static int huge_page_size;
 static int too_late;
 
+static int get_huge_page_size(void)
+{
+   int ret = -1;
+   FILE *file;
+   char *path = /proc/meminfo;
+   char buf[1024], type[128];
+
+   file = fopen(path, r);
+   if (!file)
+   goto out;
+
+   while (fgets(buf, sizeof(buf), file) != NULL) {
+   int n;
+   unsigned long size;
+
+   n = sscanf(buf, %127s %lu %*s, type, size);
+
+   if (n  2)
+   continue;
+
+   if (!strcmp(type, Hugepagesize:)) {
+   /* huge page size is printed in Kb */
+   ret = size * 1024;
+   break;
+   }
+   }
+   fclose(file);
+
+out:
+   return ret;
+}
+
 int ibv_fork_init(void)
 {
void *tmp;
-   int ret;
+   int ret, size;
 
if (mm_root)
return 0;
@@ -85,11 +120,18 @@ int ibv_fork_init(void)
if (page_size  0)
return errno;
 
-   if (posix_memalign(tmp, page_size, page_size))
+   huge_page_size = get_huge_page_size();
+
+   if (huge_page_size  page_size)
+   size = huge_page_size;
+   else
+   size = page_size;
+
+   if (posix_memalign(tmp, size, size))
return ENOMEM;
 
-   ret = madvise(tmp, page_size, MADV_DONTFORK) ||
- madvise(tmp, page_size, MADV_DOFORK);
+   ret = madvise(tmp, size, MADV_DONTFORK) ||
+ madvise(tmp, size, MADV_DOFORK);
 
free(tmp);
 
@@ -446,11 +488,51 @@ static struct ibv_mem_node *__mm_find_st
return node;
 }
 
+static int is_huge_page(void *base)
+{
+   int ret = 0;
+   pid_t pid;
+   FILE *file;
+   char buf[1024], lib[128];
+
+   pid = getpid();
+   snprintf(buf, sizeof(buf), /proc/%d/maps, pid);
+
+   file = fopen(buf, r);
+   if (!file)
+   goto out;
+
+   while (fgets(buf, sizeof(buf), file) != NULL) {
+   int n;
+   char *substr;
+   uintptr_t range_start, range_end;
+
+   n = sscanf(buf, %lx-%lx %*s %*x %*s %*u %127s,
+   range_start, range_end, lib);
+
+   if (n  3)
+   continue;
+
+   substr = strstr(lib, libhugetlbfs);
+   if (substr) {
+   if ((uintptr_t) base = range_start 
+   (uintptr_t) base  range_end) {
+   ret = 1;
+   break;
+   }
+   }
+   }
+   fclose(file);
+
+out:
+   return ret;
+}
+
 static int ibv_madvise_range(void *base, size_t size, int advice)
 {
uintptr_t start, end;
struct 

Re: [PATCH] ummunotify: Userspace support for MMU notifications V2

2010-05-07 Thread Jeff Squyres
On Apr 22, 2010, at 9:38 AM, Eric B Munson wrote:

 From: Roland Dreier rola...@cisco.com
 
 As discussed in http://article.gmane.org/gmane.linux.drivers.openib/61925
 and follow-up messages, libraries using RDMA would like to track
 precisely when application code changes memory mapping via free(),
 munmap(), etc.  Current pure-userspace solutions using malloc hooks
 and other tricks are not robust, and the feeling among experts is that
 the issue is unfixable without kernel help.

Sorry for not replying earlier -- just to throw in my $0.02 here: the MPI 
community is *very interested* in having this stuff in upstream kernels.  It 
solves a fairly major problem for us. 

Open MPI (www.open-mpi.org) is ready to pretty much immediately take advantage 
of these capabilities.  The code to use ummunotify is in a Mercurial branch; 
we're only waiting for ummunotify to go upstream before committing our support 
for it to our main SVN development trunk.

 We solve this not by implementing the full API proposed in the email
 linked above but rather with a simpler and more generic interface,
 which may be useful in other contexts.  Specifically, we implement a
 new character device driver, ummunotify, that creates a /dev/ummunotify
 node.  A userspace process can open this node read-only and use the fd
 as follows:
 
  1. ioctl() to register/unregister an address range to watch in the
 kernel (cf struct ummunotify_register_ioctl in linux/ummunotify.h).
 
  2. read() to retrieve events generated when a mapping in a watched
 address range is invalidated (cf struct ummunotify_event in
 linux/ummunotify.h).  select()/poll()/epoll() and SIGIO are
 handled for this IO.
 
  3. mmap() one page at offset 0 to map a kernel page that contains a
 generation counter that is incremented each time an event is
 generated.  This allows userspace to have a fast path that checks
 that no events have occurred without a system call.
 
 Thanks to Jason Gunthorpe jgunthorpe at obsidianresearch.com for
 suggestions on the interface design.  Also thanks to Jeff Squyres
 jsquyres at cisco.com for prototyping support for this in Open MPI, which
 helped find several bugs during development.
 
 Signed-off-by: Roland Dreier rola...@cisco.com
 Signed-off-by: Eric B Munson ebmun...@us.ibm.com

Acked-by: Jeff Squyers jsquy...@cisco.com

 ---
 
 Changes from V1:
 - Update Kbuild to handle test program build properly
 - Update documentation to cover questions not addressed in previous
   thread
 ---
  Documentation/Makefile  |3 +-
  Documentation/ummunotify/Makefile   |7 +
  Documentation/ummunotify/ummunotify.txt |  162 +
  Documentation/ummunotify/umn-test.c |  200 +++
  drivers/char/Kconfig|   12 +
  drivers/char/Makefile   |1 +
  drivers/char/ummunotify.c   |  567 
 +++
  include/linux/Kbuild|1 +
  include/linux/ummunotify.h  |  121 +++
  9 files changed, 1073 insertions(+), 1 deletions(-)
  create mode 100644 Documentation/ummunotify/Makefile
  create mode 100644 Documentation/ummunotify/ummunotify.txt
  create mode 100644 Documentation/ummunotify/umn-test.c
  create mode 100644 drivers/char/ummunotify.c
  create mode 100644 include/linux/ummunotify.h
 
 diff --git a/Documentation/Makefile b/Documentation/Makefile
 index 6fc7ea1..27ba76a 100644
 --- a/Documentation/Makefile
 +++ b/Documentation/Makefile
 @@ -1,3 +1,4 @@
  obj-m := DocBook/ accounting/ auxdisplay/ connector/ \
 filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \
 -   pcmcia/ spi/ timers/ video4linux/ vm/ watchdog/src/
 +   pcmcia/ spi/ timers/ video4linux/ vm/ ummunotify/ \
 +   watchdog/src/
 diff --git a/Documentation/ummunotify/Makefile 
 b/Documentation/ummunotify/Makefile
 new file mode 100644
 index 000..89f31a0
 --- /dev/null
 +++ b/Documentation/ummunotify/Makefile
 @@ -0,0 +1,7 @@
 +# List of programs to build
 +hostprogs-y := umn-test
 +
 +# Tell kbuild to always build the programs
 +always := $(hostprogs-y)
 +
 +HOSTCFLAGS_umn-test.o += -I$(objtree)/usr/include
 diff --git a/Documentation/ummunotify/ummunotify.txt 
 b/Documentation/ummunotify/ummunotify.txt
 new file mode 100644
 index 000..d6c2ccc
 --- /dev/null
 +++ b/Documentation/ummunotify/ummunotify.txt
 @@ -0,0 +1,162 @@
 +UMMUNOTIFY
 +
 +  Ummunotify relays MMU notifier events to userspace.  This is useful
 +  for libraries that need to track the memory mapping of applications;
 +  for example, MPI implementations using RDMA want to cache memory
 +  registrations for performance, but tracking all possible crazy cases
 +  such as when, say, the FORTRAN runtime frees memory is impossible
 +  without kernel help.
 +
 +Basic Model
 +
 +  A userspace process uses it by opening /dev/ummunotify, which
 +  returns a file descriptor.  Interest in address ranges is registered
 +  using