Re: faster malloc in threads
> Anyway, I still like the idea, but I wonder if now right after the
> hackathon is the right time. But please continue experimenting with
> this during my vacation ;-) Changing the error message would be a good
> thing, to distinguish the various calls to munmap.

Timing is bad. We need to fix the tree, and move towards release.
Re: faster malloc in threads
On Sat, Jul 12, 2014 at 01:10:38PM -0400, Ted Unangst wrote:

> On Fri, Jul 11, 2014 at 08:11, Ted Unangst wrote:
> 
> >> I also think there's one simple case that can be added: the MMAP call
> >> at the bottom of map().
> 
> On further inspection, I think this needed a slight reordering to be
> safe.
> 
> I have also been seeing random munmap errors running jruby:
> java(3451) in free(): error: munmap 0x41cb4307000
> Abort trap
> 
> I don't see where the race is, but I am now running with this slightly
> more conservative patch. The only enabled unlock sections are for
> mmap() on the way to return when internal state won't change. (On the
> bright side, I guess this means the malloc lock really does work
> normally. :))

I cannot spot it yet either, this shows how tricky this stuff is ;-)

Anyway, I still like the idea, but I wonder if now right after the
hackathon is the right time. But please continue experimenting with
this during my vacation ;-) Changing the error message would be a good
thing, to distinguish the various calls to munmap.

	-Otto

> 
> Index: malloc.c
> ===
> RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
> retrieving revision 1.170
> diff -u -p -r1.170 malloc.c
> --- malloc.c	9 Jul 2014 19:11:00 -	1.170
> +++ malloc.c	12 Jul 2014 16:52:17 -
> @@ -93,6 +93,15 @@
> #define MQUERY(a, sz)	mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
>     MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
> 
> +#define KERNENTER() if (__isthreaded) do { \
> +	malloc_active--; \
> +	_MALLOC_UNLOCK(); \
> +} while (0)
> +#define KERNEXIT() if (__isthreaded) do { \
> +	_MALLOC_LOCK(); \
> +	malloc_active++; \
> +} while (0)
> +
> struct region_info {
> 	void *p;	/* page; low bits used to mark chunks */
> 	uintptr_t size;	/* size for pages, or chunk_info pointer */
> @@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
> 	}
> 
> 	if (psz > mopts.malloc_cache) {
> -		if (munmap(p, sz))
> +		/* KERNENTER(); */
> +		i = munmap(p, sz);
> +		/* KERNEXIT(); */
> +		if (i)
> 			wrterror("munmap", p);
> 		STATS_SUB(d->malloc_used, sz);
> 		return;
> @@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
> 		return MAP_FAILED;
> 	}
> 	if (psz > d->free_regions_size) {
> +		KERNENTER();
> 		p = MMAP(sz);
> +		KERNEXIT();
> 		if (p != MAP_FAILED)
> 			STATS_ADD(d->malloc_used, sz);
> 		/* zero fill not needed */
> @@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
> 		if (r->p != NULL) {
> 			if (r->size == psz) {
> 				p = r->p;
> +				r->p = NULL;
> +				r->size = 0;
> +				d->free_regions_size -= psz;
> +				/* KERNENTER(); */
> 				if (mopts.malloc_freeunmap)
> 					mprotect(p, sz, PROT_READ | PROT_WRITE);
> 				if (mopts.malloc_hint)
> 					madvise(p, sz, MADV_NORMAL);
> -				r->p = NULL;
> -				r->size = 0;
> -				d->free_regions_size -= psz;
> 				if (zero_fill)
> 					memset(p, 0, sz);
> 				else if (mopts.malloc_junk == 2 &&
> 				    mopts.malloc_freeunmap)
> 					memset(p, SOME_FREEJUNK, sz);
> +				/* KERNEXIT(); */
> 				return p;
> 			} else if (r->size > psz)
> 				big = r;
> @@ -440,11 +456,13 @@ map(struct dir_info *d, size_t sz, int z
> 			memset(p, SOME_FREEJUNK, sz);
> 		return p;
> 	}
> +	if (d->free_regions_size > mopts.malloc_cache)
> +		wrterror("malloc cache", NULL);
> +	KERNENTER();
> 	p = MMAP(sz);
> +	KERNEXIT();
> 	if (p != MAP_FAILED)
> 		STATS_ADD(d->malloc_used, sz);
> -	if (d->free_regions_size > mopts.malloc_cache)
> -		wrterror("malloc cache", NULL);
> 	/* zero fill not needed */
> 	return p;
> }
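Regarding the suggestion above to distinguish the various calls to munmap: a minimal sketch of the idea follows, assuming nothing about the eventual committed change. wrterror_demo() and the two wrapper names are hypothetical stand-ins; only the point of giving each munmap() call site its own message string is illustrated.

/*
 * Sketch only: per-call-site message strings so the abort output says
 * which munmap() caller failed.  wrterror_demo() is a stand-in for
 * malloc.c's internal wrterror(); the wrapper names are made up.
 */
#include <sys/mman.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

static void
wrterror_demo(const char *msg, void *p)
{
	fprintf(stderr, "in free(): error: %s %p\n", msg, p);
	abort();
}

static void
unmap_uncached(void *p, size_t sz)
{
	if (munmap(p, sz))
		wrterror_demo("munmap (uncached region)", p);
}

static void
unmap_cache_evict(void *p, size_t sz)
{
	if (munmap(p, sz))
		wrterror_demo("munmap (cache eviction)", p);
}

int
main(void)
{
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	if (p == MAP_FAILED)
		err(1, "mmap");
	unmap_uncached(p, 4096);		/* succeeds */
	unmap_cache_evict((char *)p + 1, 4096);	/* unaligned: fails with its own message */
	return 0;
}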
Re: faster malloc in threads
On Fri, Jul 11, 2014 at 08:11, Ted Unangst wrote:

>> I also think there's one simple case that can be added: the MMAP call
>> at the bottom of map().

On further inspection, I think this needed a slight reordering to be
safe.

I have also been seeing random munmap errors running jruby:
java(3451) in free(): error: munmap 0x41cb4307000
Abort trap

I don't see where the race is, but I am now running with this slightly
more conservative patch. The only enabled unlock sections are for
mmap() on the way to return when internal state won't change. (On the
bright side, I guess this means the malloc lock really does work
normally. :))

Index: malloc.c
===
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.170
diff -u -p -r1.170 malloc.c
--- malloc.c	9 Jul 2014 19:11:00 -	1.170
+++ malloc.c	12 Jul 2014 16:52:17 -
@@ -93,6 +93,15 @@
 #define MQUERY(a, sz)	mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
     MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
 
+#define KERNENTER() if (__isthreaded) do { \
+	malloc_active--; \
+	_MALLOC_UNLOCK(); \
+} while (0)
+#define KERNEXIT() if (__isthreaded) do { \
+	_MALLOC_LOCK(); \
+	malloc_active++; \
+} while (0)
+
 struct region_info {
 	void *p;	/* page; low bits used to mark chunks */
 	uintptr_t size;	/* size for pages, or chunk_info pointer */
@@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
 	}
 
 	if (psz > mopts.malloc_cache) {
-		if (munmap(p, sz))
+		/* KERNENTER(); */
+		i = munmap(p, sz);
+		/* KERNEXIT(); */
+		if (i)
 			wrterror("munmap", p);
 		STATS_SUB(d->malloc_used, sz);
 		return;
@@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
 		return MAP_FAILED;
 	}
 	if (psz > d->free_regions_size) {
+		KERNENTER();
 		p = MMAP(sz);
+		KERNEXIT();
 		if (p != MAP_FAILED)
 			STATS_ADD(d->malloc_used, sz);
 		/* zero fill not needed */
@@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
 		if (r->p != NULL) {
 			if (r->size == psz) {
 				p = r->p;
+				r->p = NULL;
+				r->size = 0;
+				d->free_regions_size -= psz;
+				/* KERNENTER(); */
 				if (mopts.malloc_freeunmap)
 					mprotect(p, sz, PROT_READ | PROT_WRITE);
 				if (mopts.malloc_hint)
 					madvise(p, sz, MADV_NORMAL);
-				r->p = NULL;
-				r->size = 0;
-				d->free_regions_size -= psz;
 				if (zero_fill)
 					memset(p, 0, sz);
 				else if (mopts.malloc_junk == 2 &&
 				    mopts.malloc_freeunmap)
 					memset(p, SOME_FREEJUNK, sz);
+				/* KERNEXIT(); */
 				return p;
 			} else if (r->size > psz)
 				big = r;
@@ -440,11 +456,13 @@ map(struct dir_info *d, size_t sz, int z
 			memset(p, SOME_FREEJUNK, sz);
 		return p;
 	}
+	if (d->free_regions_size > mopts.malloc_cache)
+		wrterror("malloc cache", NULL);
+	KERNENTER();
 	p = MMAP(sz);
+	KERNEXIT();
 	if (p != MAP_FAILED)
 		STATS_ADD(d->malloc_used, sz);
-	if (d->free_regions_size > mopts.malloc_cache)
-		wrterror("malloc cache", NULL);
	/* zero fill not needed */
 	return p;
 }
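To spell out the "slight reordering" in the diff above: the cache bookkeeping (r->p = NULL, r->size = 0, the free_regions_size adjustment) now happens before the point where the lock would be dropped. Below is a simplified sketch of the pattern that reordering is meant to make safe if that unlock section were ever enabled (it is commented out in the conservative patch). The structures and lock functions are stand-ins, not malloc.c's real dir_info code.

/*
 * Simplified sketch: once the lock is dropped, another thread may scan
 * the region cache, so the slot must be emptied and the accounting
 * updated first, or the same cached region could be handed out twice.
 * The types and lock stubs here are stand-ins, not the real code.
 */
#include <stddef.h>

struct region { void *p; size_t size; };

static struct region cache[64];
static size_t cache_bytes;

static void lock_malloc(void) { }		/* stand-in for _MALLOC_LOCK() */
static void unlock_malloc(void) { }		/* stand-in for _MALLOC_UNLOCK() */
static void scrub(void *p, size_t sz) { (void)p; (void)sz; }	/* mprotect/madvise/memset work */

static void *
take_cached_region(size_t i, size_t sz)
{
	void *p = cache[i].p;

	/* publish the state change while the lock is still held */
	cache[i].p = NULL;
	cache[i].size = 0;
	cache_bytes -= sz;

	unlock_malloc();	/* other threads may now take the lock and use the cache */
	scrub(p, sz);		/* slow work that never touches allocator state */
	lock_malloc();

	return p;
}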
Re: faster malloc in threads
On Fri, Jul 11, 2014 at 13:56, Otto Moerbeek wrote:
> On Fri, Jul 11, 2014 at 06:28:04AM -0400, Ted Unangst wrote:
> 
>> We don't need to hold the malloc lock when making syscalls like mmap
>> and munmap if we're just a little careful about the order of
>> operations. This will allow other threads to concurrently allocate
>> perhaps smaller chunks while the first thread is in the kernel.
>>
>> This makes a huge difference in a simple benchmark that allocates
>> chunks in one thread and pages in a second thread. The chunk thread
>> finishes almost immediately, instead of contending for the lock and
>> running as slowly as the page thread. Admittedly contrived benchmark,
>> but the changes are very simple so I think it's worth it.
>>
>> There are some other possibly expensive operations to tweak, but this
>> covers the smallest, simplest sections.
> 
> I very much like the idea, although it is tricky.
> 
> The realloc case seems wrong: if the hash table is extended during
> MQUERY/MMAPA, r points to garbage and the r->size assignment is
> wrong.
> 
> I also think there's one simple case that can be added: the MMAP call
> at the bottom of map().

thank you. agreed.

Index: stdlib/malloc.c
===
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.170
diff -u -p -r1.170 malloc.c
--- stdlib/malloc.c	9 Jul 2014 19:11:00 -	1.170
+++ stdlib/malloc.c	11 Jul 2014 12:09:28 -
@@ -93,6 +93,15 @@
 #define MQUERY(a, sz)	mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
     MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
 
+#define KERNENTER() if (__isthreaded) do { \
+	malloc_active--; \
+	_MALLOC_UNLOCK(); \
+} while (0)
+#define KERNEXIT() if (__isthreaded) do { \
+	_MALLOC_LOCK(); \
+	malloc_active++; \
+} while (0)
+
 struct region_info {
 	void *p;	/* page; low bits used to mark chunks */
 	uintptr_t size;	/* size for pages, or chunk_info pointer */
@@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
 	}
 
 	if (psz > mopts.malloc_cache) {
-		if (munmap(p, sz))
+		KERNENTER();
+		i = munmap(p, sz);
+		KERNEXIT();
+		if (i)
 			wrterror("munmap", p);
 		STATS_SUB(d->malloc_used, sz);
 		return;
@@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
 		return MAP_FAILED;
 	}
 	if (psz > d->free_regions_size) {
+		KERNENTER();
 		p = MMAP(sz);
+		KERNEXIT();
 		if (p != MAP_FAILED)
 			STATS_ADD(d->malloc_used, sz);
 		/* zero fill not needed */
@@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
 		if (r->p != NULL) {
 			if (r->size == psz) {
 				p = r->p;
+				r->p = NULL;
+				r->size = 0;
+				d->free_regions_size -= psz;
+				KERNENTER();
 				if (mopts.malloc_freeunmap)
 					mprotect(p, sz, PROT_READ | PROT_WRITE);
 				if (mopts.malloc_hint)
 					madvise(p, sz, MADV_NORMAL);
-				r->p = NULL;
-				r->size = 0;
-				d->free_regions_size -= psz;
 				if (zero_fill)
 					memset(p, 0, sz);
 				else if (mopts.malloc_junk == 2 &&
 				    mopts.malloc_freeunmap)
 					memset(p, SOME_FREEJUNK, sz);
+				KERNEXIT();
 				return p;
 			} else if (r->size > psz)
 				big = r;
@@ -440,7 +456,9 @@ map(struct dir_info *d, size_t sz, int z
 			memset(p, SOME_FREEJUNK, sz);
 		return p;
 	}
+	KERNENTER();
 	p = MMAP(sz);
+	KERNEXIT();
 	if (p != MAP_FAILED)
 		STATS_ADD(d->malloc_used, sz);
 	if (d->free_regions_size > mopts.malloc_cache)
Re: faster malloc in threads
On Fri, Jul 11, 2014 at 06:28:04AM -0400, Ted Unangst wrote:

> We don't need to hold the malloc lock when making syscalls like mmap
> and munmap if we're just a little careful about the order of
> operations. This will allow other threads to concurrently allocate
> perhaps smaller chunks while the first thread is in the kernel.
> 
> This makes a huge difference in a simple benchmark that allocates
> chunks in one thread and pages in a second thread. The chunk thread
> finishes almost immediately, instead of contending for the lock and
> running as slowly as the page thread. Admittedly contrived benchmark,
> but the changes are very simple so I think it's worth it.
> 
> There are some other possibly expensive operations to tweak, but this
> covers the smallest, simplest sections.

I very much like the idea, although it is tricky.

The realloc case seems wrong: if the hash table is extended during
MQUERY/MMAPA, r points to garbage and the r->size assignment is
wrong.

I also think there's one simple case that can be added: the MMAP call
at the bottom of map().

	-Otto

> 
> Index: malloc.c
> ===
> RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
> retrieving revision 1.170
> diff -u -p -r1.170 malloc.c
> --- malloc.c	9 Jul 2014 19:11:00 -	1.170
> +++ malloc.c	11 Jul 2014 10:23:10 -
> @@ -93,6 +93,15 @@
> #define MQUERY(a, sz)	mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
>     MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
> 
> +#define KERNENTER() if (__isthreaded) do { \
> +	malloc_active--; \
> +	_MALLOC_UNLOCK(); \
> +} while (0)
> +#define KERNEXIT() if (__isthreaded) do { \
> +	_MALLOC_LOCK(); \
> +	malloc_active++; \
> +} while (0)
> +
> struct region_info {
> 	void *p;	/* page; low bits used to mark chunks */
> 	uintptr_t size;	/* size for pages, or chunk_info pointer */
> @@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
> 	}
> 
> 	if (psz > mopts.malloc_cache) {
> -		if (munmap(p, sz))
> +		KERNENTER();
> +		i = munmap(p, sz);
> +		KERNEXIT();
> +		if (i)
> 			wrterror("munmap", p);
> 		STATS_SUB(d->malloc_used, sz);
> 		return;
> @@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
> 		return MAP_FAILED;
> 	}
> 	if (psz > d->free_regions_size) {
> +		KERNENTER();
> 		p = MMAP(sz);
> +		KERNEXIT();
> 		if (p != MAP_FAILED)
> 			STATS_ADD(d->malloc_used, sz);
> 		/* zero fill not needed */
> @@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
> 		if (r->p != NULL) {
> 			if (r->size == psz) {
> 				p = r->p;
> +				r->p = NULL;
> +				r->size = 0;
> +				d->free_regions_size -= psz;
> +				KERNENTER();
> 				if (mopts.malloc_freeunmap)
> 					mprotect(p, sz, PROT_READ | PROT_WRITE);
> 				if (mopts.malloc_hint)
> 					madvise(p, sz, MADV_NORMAL);
> -				r->p = NULL;
> -				r->size = 0;
> -				d->free_regions_size -= psz;
> 				if (zero_fill)
> 					memset(p, 0, sz);
> 				else if (mopts.malloc_junk == 2 &&
> 				    mopts.malloc_freeunmap)
> 					memset(p, SOME_FREEJUNK, sz);
> +				KERNEXIT();
> 				return p;
> 			} else if (r->size > psz)
> 				big = r;
> @@ -1317,11 +1333,13 @@ orealloc(void *p, size_t newsz, void *f)
> 
> 	STATS_INC(pool->cheap_realloc_tries);
> 	zapcacheregion(pool, hint, needed);
> +	KERNENTER();
> 	q = MQUERY(hint, needed);
> 	if (q == hint)
> 		q = MMAPA(hint, needed);
> 	else
> 		q = MAP_FAILED;
> +	KERNEXIT();
> 	if (q == hint) {
> 		STATS_ADD(pool->malloc_used, needed);
> 		if (mopts.malloc_junk == 2)
> 
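To make the realloc objection concrete, here is a simplified illustration of the hazard, not the actual malloc.c hash table code: r is captured while the lock is held, the lock is dropped around the slow MQUERY/MMAPA window, and another thread may grow (reallocate) the table in the meantime, so the later store goes through a dangling pointer. The types and stubs below are stand-ins.

/*
 * Simplified illustration of the stale-pointer hazard described above;
 * the structures and lock/syscall stubs are stand-ins, not malloc.c's
 * region hash table.
 */
#include <stddef.h>

struct entry { void *p; size_t size; };

struct table {
	struct entry *slots;	/* reallocated (and rehashed) when the table grows */
	size_t nslots;
};

static void lock_malloc(void) { }	/* stand-in for _MALLOC_LOCK() */
static void unlock_malloc(void) { }	/* stand-in for _MALLOC_UNLOCK() */
static void slow_kernel_call(void) { }	/* MQUERY()/MMAPA() in the real code */

static void
broken_grow_in_place(struct table *t, size_t i, size_t newsize)
{
	struct entry *r = &t->slots[i];	/* pointer into the current array */

	unlock_malloc();
	slow_kernel_call();	/* another thread may grow t->slots here */
	lock_malloc();

	r->size = newsize;	/* WRONG: r may point into the old, freed array */
}

Keeping the lock across that window (as the follow-up diff does by leaving orealloc() untouched) avoids the problem; re-looking the entry up after relocking would be another way, though with a table that rehashes on growth even the index is not stable, so the lookup would have to be by page address.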
faster malloc in threads
We don't need to hold the malloc lock when making syscalls like mmap
and munmap if we're just a little careful about the order of
operations. This will allow other threads to concurrently allocate
perhaps smaller chunks while the first thread is in the kernel.

This makes a huge difference in a simple benchmark that allocates
chunks in one thread and pages in a second thread. The chunk thread
finishes almost immediately, instead of contending for the lock and
running as slowly as the page thread. Admittedly contrived benchmark,
but the changes are very simple so I think it's worth it.

There are some other possibly expensive operations to tweak, but this
covers the smallest, simplest sections.

Index: malloc.c
===
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.170
diff -u -p -r1.170 malloc.c
--- malloc.c	9 Jul 2014 19:11:00 -	1.170
+++ malloc.c	11 Jul 2014 10:23:10 -
@@ -93,6 +93,15 @@
 #define MQUERY(a, sz)	mquery((a), (size_t)(sz), PROT_READ | PROT_WRITE, \
     MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, (off_t)0)
 
+#define KERNENTER() if (__isthreaded) do { \
+	malloc_active--; \
+	_MALLOC_UNLOCK(); \
+} while (0)
+#define KERNEXIT() if (__isthreaded) do { \
+	_MALLOC_LOCK(); \
+	malloc_active++; \
+} while (0)
+
 struct region_info {
 	void *p;	/* page; low bits used to mark chunks */
 	uintptr_t size;	/* size for pages, or chunk_info pointer */
@@ -312,7 +321,10 @@ unmap(struct dir_info *d, void *p, size_
 	}
 
 	if (psz > mopts.malloc_cache) {
-		if (munmap(p, sz))
+		KERNENTER();
+		i = munmap(p, sz);
+		KERNEXIT();
+		if (i)
 			wrterror("munmap", p);
 		STATS_SUB(d->malloc_used, sz);
 		return;
@@ -396,7 +408,9 @@ map(struct dir_info *d, size_t sz, int z
 		return MAP_FAILED;
 	}
 	if (psz > d->free_regions_size) {
+		KERNENTER();
 		p = MMAP(sz);
+		KERNEXIT();
 		if (p != MAP_FAILED)
 			STATS_ADD(d->malloc_used, sz);
 		/* zero fill not needed */
@@ -408,18 +422,20 @@ map(struct dir_info *d, size_t sz, int z
 		if (r->p != NULL) {
 			if (r->size == psz) {
 				p = r->p;
+				r->p = NULL;
+				r->size = 0;
+				d->free_regions_size -= psz;
+				KERNENTER();
 				if (mopts.malloc_freeunmap)
 					mprotect(p, sz, PROT_READ | PROT_WRITE);
 				if (mopts.malloc_hint)
 					madvise(p, sz, MADV_NORMAL);
-				r->p = NULL;
-				r->size = 0;
-				d->free_regions_size -= psz;
 				if (zero_fill)
 					memset(p, 0, sz);
 				else if (mopts.malloc_junk == 2 &&
 				    mopts.malloc_freeunmap)
 					memset(p, SOME_FREEJUNK, sz);
+				KERNEXIT();
 				return p;
 			} else if (r->size > psz)
 				big = r;
@@ -1317,11 +1333,13 @@ orealloc(void *p, size_t newsz, void *f)
 
 	STATS_INC(pool->cheap_realloc_tries);
 	zapcacheregion(pool, hint, needed);
+	KERNENTER();
 	q = MQUERY(hint, needed);
 	if (q == hint)
 		q = MMAPA(hint, needed);
 	else
 		q = MAP_FAILED;
+	KERNEXIT();
 	if (q == hint) {
 		STATS_ADD(pool->malloc_used, needed);
 		if (mopts.malloc_junk == 2)
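The benchmark itself was not posted; below is a rough sketch of the kind of test described in the mail above, with made-up sizes and iteration counts. The large allocation size is chosen so that each free should go past malloc's region cache and actually reach munmap().

/*
 * Rough sketch of the contrived benchmark described above: one thread
 * hammers small chunk allocations while another allocates and frees
 * regions large enough to bypass the region cache and hit
 * mmap()/munmap() every time.  Sizes and iteration counts are made up.
 */
#include <err.h>
#include <pthread.h>
#include <stdlib.h>

#define CHUNK_ITERS	1000000
#define PAGE_ITERS	50000

static void *
chunk_thread(void *arg)
{
	(void)arg;
	for (int i = 0; i < CHUNK_ITERS; i++) {
		void *p = malloc(16);		/* small chunk allocation */
		if (p == NULL)
			err(1, "malloc");
		free(p);
	}
	return NULL;
}

static void *
page_thread(void *arg)
{
	(void)arg;
	for (int i = 0; i < PAGE_ITERS; i++) {
		void *p = malloc(1024 * 1024);	/* large enough to skip the cache */
		if (p == NULL)
			err(1, "malloc");
		free(p);
	}
	return NULL;
}

int
main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, chunk_thread, NULL);
	pthread_create(&t2, NULL, page_thread, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}

With the diff applied and more than one CPU, the chunk thread should no longer have to wait out the page thread's trips into the kernel.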