Re: faster malloc in threads

2014-07-14 Thread Theo de Raadt
> Anyway, I still like the idea, but I wonder if now, right after the
> hackathon, is the right time. But please continue experimenting with
> this during my vacation ;-). Changing the error message would be a good
> thing, to distinguish the various calls to munmap.

Timing is bad.  We need to fix the tree, and move towards release.



Re: faster malloc in threads

2014-07-14 Thread Otto Moerbeek
On Sat, Jul 12, 2014 at 01:10:38PM -0400, Ted Unangst wrote:

> On Fri, Jul 11, 2014 at 08:11, Ted Unangst wrote:
> 
> >> I also think there's one simple case that can be added: the MMAP call
> >> at the bottom of map().
> 
> On further inspection, I think this needed a slight reordering to be
> safe.
> 
> I have also been seeing random munmap errors running jruby:
> java(3451) in free(): error: munmap 0x41cb4307000
> Abort trap 
> 
> I don't see where the race is, but I am now running with this slightly
> more conservative patch. The only enabled unlock sections are for
> mmap() on the way to return when internal state won't change. (On the
> bright side, I guess this means the malloc lock really does work
> normally. :))

I cannot spot it yet either; this shows how tricky this stuff is ;-)

Anyway, I still like the idea, but I wonder if now, right after the
hackathon, is the right time. But please continue experimenting with
this during my vacation ;-). Changing the error message would be a good
thing, to distinguish the various calls to munmap.
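
Something like this could do it (an untested sketch, not part of the
diff; the wrapper name and tags are made up): route each munmap() call
through a small helper that carries a per-call-site tag, so the abort
message says which caller failed.

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>

/* sketch: stand-in for wrterror(), naming the failing call site */
static void
checked_munmap(void *p, size_t sz, const char *who)
{
	if (munmap(p, sz) == -1) {
		fprintf(stderr, "in free(): error: munmap (%s) %p\n", who, p);
		abort();
	}
}

/*
 * e.g. one tag for the large-region path in unmap() and another for
 * the cache eviction loop, instead of the single "munmap" string.
 */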

-Otto

> 
> Index: malloc.c
> ===
> RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
> retrieving revision 1.170
> diff -u -p -r1.170 malloc.c
> --- malloc.c  9 Jul 2014 19:11:00 -   1.170
> +++ malloc.c  12 Jul 2014 16:52:17 -
> @@ -93,6 +93,15 @@
>  #define MQUERY(a, sz)  mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
>  MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
>  
> +#define KERNENTER() if (__isthreaded) do { \
> + malloc_active--; \
> + _MALLOC_UNLOCK(); \
> +} while (0)
> +#define KERNEXIT() if (__isthreaded) do { \
> + _MALLOC_LOCK(); \
> + malloc_active++; \
> +} while (0)
> +
>  struct region_info {
>   void *p;/* page; low bits used to mark chunks */
>   uintptr_t size; /* size for pages, or chunk_info pointer */
> @@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
>   }
>  
>   if (psz > mopts.malloc_cache) {
> - if (munmap(p, sz))
> + /* KERNENTER(); */
> + i = munmap(p, sz);
> + /* KERNEXIT(); */
> + if (i)
>   wrterror("munmap", p);
>   STATS_SUB(d->malloc_used, sz);
>   return;
> @@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
>   return MAP_FAILED;
>   }
>   if (psz > d->free_regions_size) {
> + KERNENTER();
>   p = MMAP(sz);
> + KERNEXIT();
>   if (p != MAP_FAILED)
>   STATS_ADD(d->malloc_used, sz);
>   /* zero fill not needed */
> @@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
>   if (r->p != NULL) {
>   if (r->size == psz) {
>   p = r->p;
> + r->p = NULL;
> + r->size = 0;
> + d->free_regions_size -= psz;
> + /* KERNENTER(); */
>   if (mopts.malloc_freeunmap)
>   mprotect(p, sz, PROT_READ | PROT_WRITE);
>   if (mopts.malloc_hint)
>   madvise(p, sz, MADV_NORMAL);
> - r->p = NULL;
> - r->size = 0;
> - d->free_regions_size -= psz;
>   if (zero_fill)
>   memset(p, 0, sz);
>   else if (mopts.malloc_junk == 2 &&
>   mopts.malloc_freeunmap)
>   memset(p, SOME_FREEJUNK, sz);
> + /* KERNEXIT(); */
>   return p;
>   } else if (r->size > psz)
>   big = r;
> @@ -440,11 +456,13 @@ map(struct dir_info *d, size_t sz, int z
>   memset(p, SOME_FREEJUNK, sz);
>   return p;
>   }
> + if (d->free_regions_size > mopts.malloc_cache)
> + wrterror("malloc cache", NULL);
> + KERNENTER();
>   p = MMAP(sz);
> + KERNEXIT();
>   if (p != MAP_FAILED)
>   STATS_ADD(d->malloc_used, sz);
> - if (d->free_regions_size > mopts.malloc_cache)
> - wrterror("malloc cache", NULL);
>   /* zero fill not needed */
>   return p;
>  }



Re: faster malloc in threads

2014-07-12 Thread Ted Unangst
On Fri, Jul 11, 2014 at 08:11, Ted Unangst wrote:

>> I also think there's one simple case that can be added: the MMAP call
>> at the bottom of map().

On further inspection, I think this needed a slight reordering to be
safe.

I have also been seeing random munmap errors running jruby:
java(3451) in free(): error: munmap 0x41cb4307000
Abort trap 

I don't see where the race is, but I am now running with this slightly
more conservative patch. The only enabled unlock sections are for
mmap() on the way to return when internal state won't change. (On the
bright side, I guess this means the malloc lock really does work
normally. :))

Index: malloc.c
===
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.170
diff -u -p -r1.170 malloc.c
--- malloc.c	9 Jul 2014 19:11:00 -   1.170
+++ malloc.c	12 Jul 2014 16:52:17 -
@@ -93,6 +93,15 @@
 #define MQUERY(a, sz)  mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
 MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
 
+#define KERNENTER() if (__isthreaded) do { \
+   malloc_active--; \
+   _MALLOC_UNLOCK(); \
+} while (0)
+#define KERNEXIT() if (__isthreaded) do { \
+   _MALLOC_LOCK(); \
+   malloc_active++; \
+} while (0)
+
 struct region_info {
void *p;/* page; low bits used to mark chunks */
uintptr_t size; /* size for pages, or chunk_info pointer */
@@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
}
 
if (psz > mopts.malloc_cache) {
-   if (munmap(p, sz))
+   /* KERNENTER(); */
+   i = munmap(p, sz);
+   /* KERNEXIT(); */
+   if (i)
wrterror("munmap", p);
STATS_SUB(d->malloc_used, sz);
return;
@@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
return MAP_FAILED;
}
if (psz > d->free_regions_size) {
+   KERNENTER();
p = MMAP(sz);
+   KERNEXIT();
if (p != MAP_FAILED)
STATS_ADD(d->malloc_used, sz);
/* zero fill not needed */
@@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
if (r->p != NULL) {
if (r->size == psz) {
p = r->p;
+   r->p = NULL;
+   r->size = 0;
+   d->free_regions_size -= psz;
+   /* KERNENTER(); */
if (mopts.malloc_freeunmap)
mprotect(p, sz, PROT_READ | PROT_WRITE);
if (mopts.malloc_hint)
madvise(p, sz, MADV_NORMAL);
-   r->p = NULL;
-   r->size = 0;
-   d->free_regions_size -= psz;
if (zero_fill)
memset(p, 0, sz);
else if (mopts.malloc_junk == 2 &&
mopts.malloc_freeunmap)
memset(p, SOME_FREEJUNK, sz);
+   /* KERNEXIT(); */
return p;
} else if (r->size > psz)
big = r;
@@ -440,11 +456,13 @@ map(struct dir_info *d, size_t sz, int z
memset(p, SOME_FREEJUNK, sz);
return p;
}
+   if (d->free_regions_size > mopts.malloc_cache)
+   wrterror("malloc cache", NULL);
+   KERNENTER();
p = MMAP(sz);
+   KERNEXIT();
if (p != MAP_FAILED)
STATS_ADD(d->malloc_used, sz);
-   if (d->free_regions_size > mopts.malloc_cache)
-   wrterror("malloc cache", NULL);
/* zero fill not needed */
return p;
 }



Re: faster malloc in threads

2014-07-11 Thread Ted Unangst
On Fri, Jul 11, 2014 at 13:56, Otto Moerbeek wrote:
> On Fri, Jul 11, 2014 at 06:28:04AM -0400, Ted Unangst wrote:
> 
>> We don't need to hold the malloc lock when making syscalls like mmap
>> and munmap if we're just a little careful about the order of
>> operations. This will allow other threads to concurrently allocate
>> perhaps smaller chunks while the first thread is in the kernel.
>>
>> This makes a huge difference in a simple benchmark that allocates
>> chunks in one thread and pages in a second thread. The chunk thread
>> finishes almost immediately, instead of contending for the lock and
>> running as slowly as the page thread. Admittedly contrived benchmark,
>> but the changes are very simple so I think it's worth it.
>>
>> There are some other possibly expensive operations to tweak, but this
>> covers the smallest, simplest sections.
> 
> I very much like the idea, although it is tricky.
> 
> The realloc case seems wrong: if the hash table is extended during
> MQUERY/MMAPA, r points to garbage and the r->size assignment is
> wrong.
> 
> I also think there's one simple case that can be added: the MMAP call
> at the bottom of map().

thank you. agreed.

Index: stdlib/malloc.c
===
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.170
diff -u -p -r1.170 malloc.c
--- stdlib/malloc.c 9 Jul 2014 19:11:00 -   1.170
+++ stdlib/malloc.c 11 Jul 2014 12:09:28 -
@@ -93,6 +93,15 @@
 #define MQUERY(a, sz)  mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
 MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
 
+#define KERNENTER() if (__isthreaded) do { \
+   malloc_active--; \
+   _MALLOC_UNLOCK(); \
+} while (0)
+#define KERNEXIT() if (__isthreaded) do { \
+   _MALLOC_LOCK(); \
+   malloc_active++; \
+} while (0)
+
 struct region_info {
void *p;/* page; low bits used to mark chunks */
uintptr_t size; /* size for pages, or chunk_info pointer */
@@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
}
 
if (psz > mopts.malloc_cache) {
-   if (munmap(p, sz))
+   KERNENTER();
+   i = munmap(p, sz);
+   KERNEXIT();
+   if (i)
wrterror("munmap", p);
STATS_SUB(d->malloc_used, sz);
return;
@@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
return MAP_FAILED;
}
if (psz > d->free_regions_size) {
+   KERNENTER();
p = MMAP(sz);
+   KERNEXIT();
if (p != MAP_FAILED)
STATS_ADD(d->malloc_used, sz);
/* zero fill not needed */
@@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
if (r->p != NULL) {
if (r->size == psz) {
p = r->p;
+   r->p = NULL;
+   r->size = 0;
+   d->free_regions_size -= psz;
+   KERNENTER();
if (mopts.malloc_freeunmap)
mprotect(p, sz, PROT_READ | PROT_WRITE);
if (mopts.malloc_hint)
madvise(p, sz, MADV_NORMAL);
-   r->p = NULL;
-   r->size = 0;
-   d->free_regions_size -= psz;
if (zero_fill)
memset(p, 0, sz);
else if (mopts.malloc_junk == 2 &&
mopts.malloc_freeunmap)
memset(p, SOME_FREEJUNK, sz);
+   KERNEXIT();
return p;
} else if (r->size > psz)
big = r;
@@ -440,7 +456,9 @@ map(struct dir_info *d, size_t sz, int z
memset(p, SOME_FREEJUNK, sz);
return p;
}
+   KERNENTER();
p = MMAP(sz);
+   KERNEXIT();
if (p != MAP_FAILED)
STATS_ADD(d->malloc_used, sz);
if (d->free_regions_size > mopts.malloc_cache)



Re: faster malloc in threads

2014-07-11 Thread Otto Moerbeek
On Fri, Jul 11, 2014 at 06:28:04AM -0400, Ted Unangst wrote:

> We don't need to hold the malloc lock when making syscalls like mmap
> and munmap if we're just a little careful about the order of
> operations. This will allow other threads to concurrently allocate
> perhaps smaller chunks while the first thread is in the kernel.
> 
> This makes a huge difference in a simple benchmark that allocates
> chunks in one thread and pages in a second thread. The chunk thread
> finishes almost immediately, instead of contending for the lock and
> running as slowly as the page thread. Admittedly contrived benchmark,
> but the changes are very simple so I think it's worth it.
> 
> There are some other possibly expensive operations to tweak, but this
> covers the smallest, simplest sections.

I very much like the idea, although it is tricky.

The realloc case seems wrong: if the hash table is extended during
MQUERY/MMAPA, r points to garbage and the r->size assignment is
wrong.
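
To make that concrete (sketch only; the table, names, and locking here
are made up stand-ins, not the real region table code): if a pointer
into a realloc()ed table is held across the unlocked section, it can
end up pointing into freed memory.

#include <pthread.h>
#include <stddef.h>

struct entry { void *key; size_t size; };

static struct entry *table;		/* may be moved by realloc() */
static size_t table_len;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* WRONG: e may dangle if another thread grew the table meanwhile */
void
grow_unsafe(struct entry *e, size_t newsz)
{
	pthread_mutex_unlock(&lock);
	/* ... mquery()/mmap() happen here, without the lock ... */
	pthread_mutex_lock(&lock);
	e->size = newsz;		/* possible use after free */
}

/* safer: remember the key and look the entry up again after relocking */
void
grow_safe(struct entry *e, size_t newsz)
{
	void *key = e->key;
	size_t i;

	pthread_mutex_unlock(&lock);
	/* ... mquery()/mmap() happen here, without the lock ... */
	pthread_mutex_lock(&lock);
	for (i = 0; i < table_len; i++)
		if (table[i].key == key) {
			table[i].size = newsz;
			break;
		}
}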

I also think there's one simple case that can be added: the MMAP call
at the bottom of map(). 

-Otto

> 
> Index: malloc.c
> ===
> RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
> retrieving revision 1.170
> diff -u -p -r1.170 malloc.c
> --- malloc.c  9 Jul 2014 19:11:00 -   1.170
> +++ malloc.c  11 Jul 2014 10:23:10 -
> @@ -93,6 +93,15 @@
>  #define MQUERY(a, sz)  mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
>  MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
>  
> +#define KERNENTER() if (__isthreaded) do { \
> + malloc_active--; \
> + _MALLOC_UNLOCK(); \
> +} while (0)
> +#define KERNEXIT() if (__isthreaded) do { \
> + _MALLOC_LOCK(); \
> + malloc_active++; \
> +} while (0)
> +
>  struct region_info {
>   void *p;/* page; low bits used to mark chunks */
>   uintptr_t size; /* size for pages, or chunk_info pointer */
> @@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
>   }
>  
>   if (psz > mopts.malloc_cache) {
> - if (munmap(p, sz))
> + KERNENTER();
> + i = munmap(p, sz);
> + KERNEXIT();
> + if (i)
>   wrterror("munmap", p);
>   STATS_SUB(d->malloc_used, sz);
>   return;
> @@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
>   return MAP_FAILED;
>   }
>   if (psz > d->free_regions_size) {
> + KERNENTER();
>   p = MMAP(sz);
> + KERNEXIT();
>   if (p != MAP_FAILED)
>   STATS_ADD(d->malloc_used, sz);
>   /* zero fill not needed */
> @@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
>   if (r->p != NULL) {
>   if (r->size == psz) {
>   p = r->p;
> + r->p = NULL;
> + r->size = 0;
> + d->free_regions_size -= psz;
> + KERNENTER();
>   if (mopts.malloc_freeunmap)
>   mprotect(p, sz, PROT_READ | PROT_WRITE);
>   if (mopts.malloc_hint)
>   madvise(p, sz, MADV_NORMAL);
> - r->p = NULL;
> - r->size = 0;
> - d->free_regions_size -= psz;
>   if (zero_fill)
>   memset(p, 0, sz);
>   else if (mopts.malloc_junk == 2 &&
>   mopts.malloc_freeunmap)
>   memset(p, SOME_FREEJUNK, sz);
> + KERNEXIT();
>   return p;
>   } else if (r->size > psz)
>   big = r;
> @@ -1317,11 +1333,13 @@ orealloc(void *p, size_t newsz, void *f)
>  
>   STATS_INC(pool->cheap_realloc_tries);
>   zapcacheregion(pool, hint, needed);
> + KERNENTER();
>   q = MQUERY(hint, needed);
>   if (q == hint)
>   q = MMAPA(hint, needed);
>   else
>   q = MAP_FAILED;
> + KERNEXIT();
>   if (q == hint) {
>   STATS_ADD(pool->malloc_used, needed);
>   if (mopts.malloc_junk == 2)
> 



faster malloc in threads

2014-07-11 Thread Ted Unangst
We don't need to hold the malloc lock when making syscalls like mmap
and munmap if we're just a little careful about the order of
operations. This will allow other threads to concurrently allocate
perhaps smaller chunks while the first thread is in the kernel.
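
As a standalone sketch of the pattern (not the malloc.c code itself;
the lock and counter here are stand-ins): shared state is only touched
while the lock is held, and the lock is dropped just for the syscall.

#include <pthread.h>
#include <sys/mman.h>
#include <stddef.h>

static pthread_mutex_t malloc_lock = PTHREAD_MUTEX_INITIALIZER;
static size_t used;			/* stand-in for shared bookkeeping */

void *
alloc_pages(size_t sz)
{
	void *p;

	pthread_mutex_lock(&malloc_lock);
	/* decide what to map while holding the lock */
	pthread_mutex_unlock(&malloc_lock);	/* KERNENTER() in the diff */

	p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);	/* no shared state touched */

	pthread_mutex_lock(&malloc_lock);	/* KERNEXIT() in the diff */
	if (p != MAP_FAILED)
		used += sz;			/* bookkeeping back under lock */
	pthread_mutex_unlock(&malloc_lock);
	return (p == MAP_FAILED) ? NULL : p;
}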

This makes a huge difference in a simple benchmark that allocates
chunks in one thread and pages in a second thread. The chunk thread
finishes almost immediately, instead of contending for the lock and
running as slowly as the page thread. Admittedly contrived benchmark,
but the changes are very simple so I think it's worth it.
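
Something along these lines reproduces the setup (a reconstruction, not
the actual benchmark; sizes and iteration counts are arbitrary): one
thread churns small chunks, the other churns page-sized regions that go
through mmap/munmap.

#include <pthread.h>
#include <stdlib.h>

static void *
chunk_thread(void *arg)
{
	int i;

	for (i = 0; i < 1000000; i++)
		free(malloc(32));		/* small chunk allocations */
	return NULL;
}

static void *
page_thread(void *arg)
{
	int i;

	for (i = 0; i < 100000; i++)
		free(malloc(64 * 1024));	/* large, mmap/munmap-backed */
	return NULL;
}

int
main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, chunk_thread, NULL);
	pthread_create(&t2, NULL, page_thread, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}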

There are some other possibly expensive operations to tweak, but this
covers the smallest, simplest sections.

Index: malloc.c
===
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.170
diff -u -p -r1.170 malloc.c
--- malloc.c	9 Jul 2014 19:11:00 -   1.170
+++ malloc.c	11 Jul 2014 10:23:10 -
@@ -93,6 +93,15 @@
 #define MQUERY(a, sz)  mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
 MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
 
+#define KERNENTER() if (__isthreaded) do { \
+   malloc_active--; \
+   _MALLOC_UNLOCK(); \
+} while (0)
+#define KERNEXIT() if (__isthreaded) do { \
+   _MALLOC_LOCK(); \
+   malloc_active++; \
+} while (0)
+
 struct region_info {
void *p;/* page; low bits used to mark chunks */
uintptr_t size; /* size for pages, or chunk_info pointer */
@@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
}
 
if (psz > mopts.malloc_cache) {
-   if (munmap(p, sz))
+   KERNENTER();
+   i = munmap(p, sz);
+   KERNEXIT();
+   if (i)
wrterror("munmap", p);
STATS_SUB(d->malloc_used, sz);
return;
@@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
return MAP_FAILED;
}
if (psz > d->free_regions_size) {
+   KERNENTER();
p = MMAP(sz);
+   KERNEXIT();
if (p != MAP_FAILED)
STATS_ADD(d->malloc_used, sz);
/* zero fill not needed */
@@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
if (r->p != NULL) {
if (r->size == psz) {
p = r->p;
+   r->p = NULL;
+   r->size = 0;
+   d->free_regions_size -= psz;
+   KERNENTER();
if (mopts.malloc_freeunmap)
mprotect(p, sz, PROT_READ | PROT_WRITE);
if (mopts.malloc_hint)
madvise(p, sz, MADV_NORMAL);
-   r->p = NULL;
-   r->size = 0;
-   d->free_regions_size -= psz;
if (zero_fill)
memset(p, 0, sz);
else if (mopts.malloc_junk == 2 &&
mopts.malloc_freeunmap)
memset(p, SOME_FREEJUNK, sz);
+   KERNEXIT();
return p;
} else if (r->size > psz)
big = r;
@@ -1317,11 +1333,13 @@ orealloc(void *p, size_t newsz, void *f)
 
STATS_INC(pool->cheap_realloc_tries);
zapcacheregion(pool, hint, needed);
+   KERNENTER();
q = MQUERY(hint, needed);
if (q == hint)
q = MMAPA(hint, needed);
else
q = MAP_FAILED;
+   KERNEXIT();
if (q == hint) {
STATS_ADD(pool->malloc_used, needed);
if (mopts.malloc_junk == 2)