Nice, thanks for testing. It's good to know it's reliable. Were you
watching pool stats while running the tests?

Either way, I'm going to commit this so it will get broader testing.

Cheers,
dlg

On 1 Nov 2016 4:14 a.m., "Simon Mages" <mages.si...@googlemail.com> wrote:

> Hi,
>
> today I did some performance measurements on OpenBSD-current, with
> and without your diff.
>
> I use a custom HTTP proxy. Performance in this case means requests/s
> and bandwidth. To measure requests/s I use requests with a very small
> response; to measure bandwidth I use big responses. The proxy uses
> socket splicing to improve the performance for bigger chunks of data.
> The following results are an average over multiple test runs.
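>
> (For context: splicing means the kernel moves the bytes between the
> two sockets directly, so big responses never cross into userland.
> Roughly, the proxy arms it like this once both sides are connected:
>
>         /* splice data arriving on "from" into "to", in-kernel */
>         if (setsockopt(from, SOL_SOCKET, SO_SPLICE, &to,
>             sizeof(to)) == -1)
>                 err(1, "setsockopt SO_SPLICE");
>
> where "from" and "to" are the two connected socket descriptors.)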
>
> without your diff:
> requests/s = 3929
> bandwidth in Mbit/s = 1196
>
> with your diff:
> requests/s = 4093
> bandwidth in Mbit/s = 1428
>
> Just ask if you want more details on the test setup.
>
> BR
>
> Simon
>
> 2016-10-27 5:54 GMT+02:00, David Gwynne <da...@gwynne.id.au>:
> > On Tue, Oct 25, 2016 at 10:35:45AM +1000, David Gwynne wrote:
> >> On Mon, Oct 24, 2016 at 04:24:13PM +1000, David Gwynne wrote:
> >> > ive posted this before as part of a much bigger diff, but smaller
> >> > is better.
> >> >
> >> > it basically lets things ask for per-cpu item caches to be enabled
> >> > on pools. the most obvious use case for this is the mbuf pools.
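> >> >
> >> > to be concrete about the api: a pool opts in with one extra call
> >> > after pool_init(), the same way mbcpuinit() does for the mbuf
> >> > pools in the diff below. a minimal sketch with a made-up pool:
> >> >
> >> >         struct pool mypool;
> >> >
> >> >         pool_init(&mypool, sizeof(struct mbuf), 0, IPL_NET, 0,
> >> >             "mypl", NULL);
> >> >         pool_cache_init(&mypool);   /* turn on the per-cpu cache */
> >> >
> >> > callers of pool_get and pool_put dont change at all.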
> >> >
> >> > the caches are modelled on whats described in the "Magazines and
> >> > Vmem: Extending the Slab Allocator to Many CPUs and Arbitrary
> >> > Resources" paper by Jeff Bonwick and Jonathan Adams. pools are
> >> > modelled on slabs, which bonwick described in a previous paper.
> >> >
> >> > the main inspiration the paper provided was around how many objects
> >> > to cache on each cpu, and how often to move sets of objects between
> >> > the cpu caches and a global list of objects. unlike the paper we
> >> > do not care about maintaining constructed objects on the free lists,
> >> > so we reuse the objects themselves to build the free list.
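> >> >
> >> > concretely, a freed item is big enough to hold the list header
> >> > itself (pool_cache_init KASSERTs that), so putting an item back
> >> > into a cpus cache just overlays a struct pool_list on the freed
> >> > memory. from pool_cache_put in the diff below:
> >> >
> >> >         struct pool_list *pl = v;   /* header lives in the item */
> >> >
> >> >         pl->pl_next = pc->pc_actv;  /* chain onto the active list */
> >> >         pl->pl_nitems = ++nitems;
> >> >
> >> >         pc->pc_actv = pl;
> >> >
> >> > no separate magazine structures need to be allocated or freed.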
> >> >
> >> > id like to get this in so we can tinker with it in tree. the things
> >> > i think we need to tinker with are what poisoning we can get away
> >> > with on the per cpu caches, and what limits we can enforce at the
> >> > pool level.
> >> >
> >> > i think poisoning will be relatively simple to add. the limits one
> >> > is more challenging because we dont want the pools to have to
> >> > coordinate between cpus for every get or put operation. my thought
> >> > there was to limit the number of pages that a pool can allocate
> >> > from its backend rather than limit the items the pool can provide.
> >> > limiting the pages could also be done at a lower level. eg, the
> >> > mbuf clusters could share a common backend that limits the pages
> >> > the pools can get, rather than have the cluster pools account for
> >> > pages separately.
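> >> >
> >> > a sketch of that last idea. none of this is in the diff and the
> >> > names are made up; it just shows a backend allocator whose page
> >> > budget is shared by every pool that points at it:
> >> >
> >> >         /* hypothetical: page-limited backend for the cluster pools */
> >> >         struct mutex clpg_mtx = MUTEX_INITIALIZER(IPL_VM);
> >> >         u_int clpg_inuse;
> >> >         u_int clpg_max = 1024;          /* page budget, tunable */
> >> >
> >> >         void *
> >> >         clpg_page_alloc(struct pool *pp, int flags, int *slowdown)
> >> >         {
> >> >                 mtx_enter(&clpg_mtx);
> >> >                 if (clpg_inuse >= clpg_max) {
> >> >                         mtx_leave(&clpg_mtx);
> >> >                         return (NULL);  /* over the shared budget */
> >> >                 }
> >> >                 clpg_inuse++;
> >> >                 mtx_leave(&clpg_mtx);
> >> >
> >> >                 return (pool_page_alloc(pp, flags, slowdown));
> >> >         }
> >> >
> >> >         void
> >> >         clpg_page_free(struct pool *pp, void *v)
> >> >         {
> >> >                 pool_page_free(pp, v);
> >> >
> >> >                 mtx_enter(&clpg_mtx);
> >> >                 clpg_inuse--;
> >> >                 mtx_leave(&clpg_mtx);
> >> >         }
> >> >
> >> > the cluster pools would then share one pool_allocator wrapping
> >> > these, instead of accounting for pages per pool.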
> >> >
> >> > anyway, either way i would like to get this in so we can work on
> >> > this stuff.
> >> >
> >> > ok?
> >>
> >> this adds per-cpu caches to the mbuf pools so people can actually
> >> try and see if the code works or not.
> >
> > this fixes a crash hrvoje and i found independently. avoid holding
> > a mutex when calling yield().
> >
> > also some whitespace fixes.
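> >
> > the gist of it: pool_cache_info was sleeping in yield() with
> > pr_cache_mtx held. the per-cpu counters are now read without the
> > mutex, seqlock style: pool_cache_enter/leave bump pc_gen so it is
> > odd while a cpu is touching its cache, and the reader retries:
> >
> >         do {
> >                 while ((gen = pc->pc_gen) & 1)
> >                         yield();        /* writer active; ok to sleep */
> >
> >                 nget = pc->pc_gets;
> >                 nput = pc->pc_puts;
> >         } while (gen != pc->pc_gen);    /* reread if a writer raced */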
> >
> > Index: kern/subr_pool.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/subr_pool.c,v
> > retrieving revision 1.198
> > diff -u -p -r1.198 subr_pool.c
> > --- kern/subr_pool.c  15 Sep 2016 02:00:16 -0000      1.198
> > +++ kern/subr_pool.c  27 Oct 2016 03:51:10 -0000
> > @@ -42,6 +42,7 @@
> >  #include <sys/sysctl.h>
> >  #include <sys/task.h>
> >  #include <sys/timeout.h>
> > +#include <sys/percpu.h>
> >
> >  #include <uvm/uvm_extern.h>
> >
> > @@ -96,6 +97,33 @@ struct pool_item {
> >  };
> >  #define POOL_IMAGIC(ph, pi) ((u_long)(pi) ^ (ph)->ph_magic)
> >
> > +#ifdef MULTIPROCESSOR
> > +struct pool_list {
> > +     struct pool_list        *pl_next;       /* next in list */
> > +     unsigned long            pl_cookie;
> > +     struct pool_list        *pl_nextl;      /* next list */
> > +     unsigned long            pl_nitems;     /* items in list */
> > +};
> > +
> > +struct pool_cache {
> > +     struct pool_list        *pc_actv;
> > +     unsigned long            pc_nactv;      /* cache pc_actv nitems */
> > +     struct pool_list        *pc_prev;
> > +
> > +     uint64_t                 pc_gen;        /* generation number */
> > +     uint64_t                 pc_gets;
> > +     uint64_t                 pc_puts;
> > +     uint64_t                 pc_fails;
> > +
> > +     int                      pc_nout;
> > +};
> > +
> > +void *pool_cache_get(struct pool *);
> > +void  pool_cache_put(struct pool *, void *);
> > +void  pool_cache_destroy(struct pool *);
> > +#endif
> > +void  pool_cache_info(struct pool *, struct kinfo_pool *);
> > +
> >  #ifdef POOL_DEBUG
> >  int  pool_debug = 1;
> >  #else
> > @@ -355,6 +383,11 @@ pool_destroy(struct pool *pp)
> >       struct pool_item_header *ph;
> >       struct pool *prev, *iter;
> >
> > +#ifdef MULTIPROCESSOR
> > +     if (pp->pr_cache != NULL)
> > +             pool_cache_destroy(pp);
> > +#endif
> > +
> >  #ifdef DIAGNOSTIC
> >       if (pp->pr_nout != 0)
> >               panic("%s: pool busy: still out: %u", __func__, pp->pr_nout);
> > @@ -421,6 +454,14 @@ pool_get(struct pool *pp, int flags)
> >       void *v = NULL;
> >       int slowdown = 0;
> >
> > +#ifdef MULTIPROCESSOR
> > +     if (pp->pr_cache != NULL) {
> > +             v = pool_cache_get(pp);
> > +             if (v != NULL)
> > +                     goto good;
> > +     }
> > +#endif
> > +
> >       KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
> >
> >       mtx_enter(&pp->pr_mtx);
> > @@ -453,6 +494,9 @@ pool_get(struct pool *pp, int flags)
> >               v = mem.v;
> >       }
> >
> > +#ifdef MULTIPROCESSOR
> > +good:
> > +#endif
> >       if (ISSET(flags, PR_ZERO))
> >               memset(v, 0, pp->pr_size);
> >
> > @@ -631,6 +675,13 @@ pool_put(struct pool *pp, void *v)
> >               panic("%s: NULL item", __func__);
> >  #endif
> >
> > +#ifdef MULTIPROCESSOR
> > +     if (pp->pr_cache != NULL && TAILQ_EMPTY(&pp->pr_requests)) {
> > +             pool_cache_put(pp, v);
> > +             return;
> > +     }
> > +#endif
> > +
> >       mtx_enter(&pp->pr_mtx);
> >
> >       splassert(pp->pr_ipl);
> > @@ -1333,6 +1384,8 @@ sysctl_dopool(int *name, u_int namelen,
> >               pi.pr_nidle = pp->pr_nidle;
> >               mtx_leave(&pp->pr_mtx);
> >
> > +             pool_cache_info(pp, &pi);
> > +
> >               rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi));
> >               break;
> >       }
> > @@ -1499,3 +1552,265 @@ pool_multi_free_ni(struct pool *pp, void
> >       km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
> >       KERNEL_UNLOCK();
> >  }
> > +
> > +#ifdef MULTIPROCESSOR
> > +
> > +struct pool pool_caches; /* per cpu cache entries */
> > +
> > +void
> > +pool_cache_init(struct pool *pp)
> > +{
> > +     struct cpumem *cm;
> > +     struct pool_cache *pc;
> > +     struct cpumem_iter i;
> > +
> > +     if (pool_caches.pr_size == 0) {
> > +             pool_init(&pool_caches, sizeof(struct pool_cache), 64,
> > +                 IPL_NONE, PR_WAITOK, "plcache", NULL);
> > +     }
> > +
> > +     KASSERT(pp->pr_size >= sizeof(*pc));
> > +
> > +     cm = cpumem_get(&pool_caches);
> > +
> > +     mtx_init(&pp->pr_cache_mtx, pp->pr_ipl);
> > +     pp->pr_cache_list = NULL;
> > +     pp->pr_cache_nlist = 0;
> > +     pp->pr_cache_items = 8;
> > +     pp->pr_cache_contention = 0;
> > +
> > +     CPUMEM_FOREACH(pc, &i, cm) {
> > +             pc->pc_actv = NULL;
> > +             pc->pc_nactv = 0;
> > +             pc->pc_prev = NULL;
> > +
> > +             pc->pc_gets = 0;
> > +             pc->pc_puts = 0;
> > +             pc->pc_fails = 0;
> > +             pc->pc_nout = 0;
> > +     }
> > +
> > +     pp->pr_cache = cm;
> > +}
> > +
> > +static inline void
> > +pool_list_enter(struct pool *pp)
> > +{
> > +     if (mtx_enter_try(&pp->pr_cache_mtx) == 0) {
> > +             mtx_enter(&pp->pr_cache_mtx);
> > +             pp->pr_cache_contention++;
> > +     }
> > +}
> > +
> > +static inline void
> > +pool_list_leave(struct pool *pp)
> > +{
> > +     mtx_leave(&pp->pr_cache_mtx);
> > +}
> > +
> > +static inline struct pool_list *
> > +pool_list_alloc(struct pool *pp, struct pool_cache *pc)
> > +{
> > +     struct pool_list *pl;
> > +
> > +     pool_list_enter(pp);
> > +     pl = pp->pr_cache_list;
> > +     if (pl != NULL) {
> > +             pp->pr_cache_list = pl->pl_nextl;
> > +             pp->pr_cache_nlist--;
> > +     }
> > +
> > +     pp->pr_cache_nout += pc->pc_nout;
> > +     pc->pc_nout = 0;
> > +     pool_list_leave(pp);
> > +
> > +     return (pl);
> > +}
> > +
> > +static inline void
> > +pool_list_free(struct pool *pp, struct pool_cache *pc, struct pool_list *pl)
> > +{
> > +     pool_list_enter(pp);
> > +     pl->pl_nextl = pp->pr_cache_list;
> > +     pp->pr_cache_list = pl;
> > +     pp->pr_cache_nlist++;
> > +
> > +     pp->pr_cache_nout += pc->pc_nout;
> > +     pc->pc_nout = 0;
> > +     pool_list_leave(pp);
> > +}
> > +
> > +static inline struct pool_cache *
> > +pool_cache_enter(struct pool *pp, int *s)
> > +{
> > +     struct pool_cache *pc;
> > +
> > +     pc = cpumem_enter(pp->pr_cache);
> > +     *s = splraise(pp->pr_ipl);
> > +     pc->pc_gen++;
> > +
> > +     return (pc);
> > +}
> > +
> > +static inline void
> > +pool_cache_leave(struct pool *pp, struct pool_cache *pc, int s)
> > +{
> > +     pc->pc_gen++;
> > +     splx(s);
> > +     cpumem_leave(pp->pr_cache, pc);
> > +}
> > +
> > +void *
> > +pool_cache_get(struct pool *pp)
> > +{
> > +     struct pool_cache *pc;
> > +     struct pool_list *pl;
> > +     int s;
> > +
> > +     pc = pool_cache_enter(pp, &s);
> > +
> > +     if (pc->pc_actv != NULL) {
> > +             pl = pc->pc_actv;
> > +     } else if (pc->pc_prev != NULL) {
> > +             pl = pc->pc_prev;
> > +             pc->pc_prev = NULL;
> > +     } else if ((pl = pool_list_alloc(pp, pc)) == NULL) {
> > +             pc->pc_fails++;
> > +             goto done;
> > +     }
> > +
> > +     pc->pc_actv = pl->pl_next;
> > +     pc->pc_nactv = pl->pl_nitems - 1;
> > +     pc->pc_gets++;
> > +     pc->pc_nout++;
> > +done:
> > +     pool_cache_leave(pp, pc, s);
> > +
> > +     return (pl);
> > +}
> > +
> > +void
> > +pool_cache_put(struct pool *pp, void *v)
> > +{
> > +     struct pool_cache *pc;
> > +     struct pool_list *pl = v;
> > +     unsigned long cache_items = pp->pr_cache_items;
> > +     unsigned long nitems;
> > +     int s;
> > +
> > +     pc = pool_cache_enter(pp, &s);
> > +
> > +     nitems = pc->pc_nactv;
> > +     if (nitems >= cache_items) {
> > +             if (pc->pc_prev != NULL)
> > +                     pool_list_free(pp, pc, pc->pc_prev);
> > +
> > +             pc->pc_prev = pc->pc_actv;
> > +
> > +             pc->pc_actv = NULL;
> > +             pc->pc_nactv = 0;
> > +             nitems = 0;
> > +     }
> > +
> > +     pl->pl_next = pc->pc_actv;
> > +     pl->pl_nitems = ++nitems;
> > +
> > +     pc->pc_actv = pl;
> > +     pc->pc_nactv = nitems;
> > +
> > +     pc->pc_puts++;
> > +     pc->pc_nout--;
> > +
> > +     pool_cache_leave(pp, pc, s);
> > +}
> > +
> > +struct pool_list *
> > +pool_list_put(struct pool *pp, struct pool_list *pl)
> > +{
> > +     struct pool_list *rpl, *npl;
> > +
> > +     if (pl == NULL)
> > +             return (NULL);
> > +
> > +     rpl = pl->pl_nextl;
> > +
> > +     do {
> > +             npl = pl->pl_next;
> > +             pool_put(pp, pl);
> > +             pl = npl;
> > +     } while (pl != NULL);
> > +
> > +     return (rpl);
> > +}
> > +
> > +void
> > +pool_cache_destroy(struct pool *pp)
> > +{
> > +     struct pool_cache *pc;
> > +     struct pool_list *pl;
> > +     struct cpumem_iter i;
> > +     struct cpumem *cm;
> > +
> > +     cm = pp->pr_cache;
> > +     pp->pr_cache = NULL; /* make pool_put avoid the cache */
> > +
> > +     CPUMEM_FOREACH(pc, &i, cm) {
> > +             pool_list_put(pp, pc->pc_actv);
> > +             pool_list_put(pp, pc->pc_prev);
> > +     }
> > +
> > +     cpumem_put(&pool_caches, cm);
> > +
> > +     pl = pp->pr_cache_list;
> > +     while (pl != NULL)
> > +             pl = pool_list_put(pp, pl);
> > +}
> > +
> > +void
> > +pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
> > +{
> > +     struct pool_cache *pc;
> > +     struct cpumem_iter i;
> > +
> > +     if (pp->pr_cache == NULL)
> > +             return;
> > +
> > +     /* loop through the caches twice to collect stats */
> > +
> > +     /* once without the mtx so we can yield while reading nget/nput */
> > +     CPUMEM_FOREACH(pc, &i, pp->pr_cache) {
> > +             uint64_t gen, nget, nput;
> > +
> > +             do {
> > +                     while ((gen = pc->pc_gen) & 1)
> > +                             yield();
> > +
> > +                     nget = pc->pc_gets;
> > +                     nput = pc->pc_puts;
> > +             } while (gen != pc->pc_gen);
> > +
> > +             pi->pr_nget += nget;
> > +             pi->pr_nput += nput;
> > +     }
> > +
> > +     /* and once with the mtx so we can get consistent nout values */
> > +     mtx_enter(&pp->pr_cache_mtx);
> > +     CPUMEM_FOREACH(pc, &i, pp->pr_cache)
> > +             pi->pr_nout += pc->pc_nout;
> > +
> > +     pi->pr_nout += pp->pr_cache_nout;
> > +     mtx_leave(&pp->pr_cache_mtx);
> > +}
> > +#else /* MULTIPROCESSOR */
> > +void
> > +pool_cache_init(struct pool *pp)
> > +{
> > +     /* nop */
> > +}
> > +
> > +void
> > +pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
> > +{
> > +     /* nop */
> > +}
> > +#endif /* MULTIPROCESSOR */
> > Index: kern/uipc_mbuf.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> > retrieving revision 1.237
> > diff -u -p -r1.237 uipc_mbuf.c
> > --- kern/uipc_mbuf.c  27 Oct 2016 03:29:55 -0000      1.237
> > +++ kern/uipc_mbuf.c  27 Oct 2016 03:51:10 -0000
> > @@ -186,7 +186,15 @@ mbinit(void)
> >  void
> >  mbcpuinit()
> >  {
> > +     int i;
> > +
> >       mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT, M_DEVBUF);
> > +
> > +     pool_cache_init(&mbpool);
> > +     pool_cache_init(&mtagpool);
> > +
> > +     for (i = 0; i < nitems(mclsizes); i++)
> > +             pool_cache_init(&mclpools[i]);
> >  }
> >
> >  void
> > Index: sys/pool.h
> > ===================================================================
> > RCS file: /cvs/src/sys/sys/pool.h,v
> > retrieving revision 1.63
> > diff -u -p -r1.63 pool.h
> > --- sys/pool.h        15 Sep 2016 02:00:16 -0000      1.63
> > +++ sys/pool.h        27 Oct 2016 03:51:10 -0000
> > @@ -84,6 +84,9 @@ struct pool_allocator {
> >
> >  TAILQ_HEAD(pool_pagelist, pool_item_header);
> >
> > +struct pool_list;
> > +struct cpumem;
> > +
> >  struct pool {
> >       struct mutex    pr_mtx;
> >       SIMPLEQ_ENTRY(pool)
> > @@ -124,6 +127,15 @@ struct pool {
> >       RBT_HEAD(phtree, pool_item_header)
> >                       pr_phtree;
> >
> > +     struct cpumem * pr_cache;
> > +     struct mutex    pr_cache_mtx;
> > +     struct pool_list *
> > +                     pr_cache_list;
> > +     u_int           pr_cache_nlist;
> > +     u_int           pr_cache_items;
> > +     u_int           pr_cache_contention;
> > +     int             pr_cache_nout;
> > +
> >       u_int           pr_align;
> >       u_int           pr_maxcolors;   /* Cache coloring */
> >       int             pr_phoffset;    /* Offset in page of page header */
> > @@ -175,6 +187,7 @@ struct pool_request {
> >
> >  void         pool_init(struct pool *, size_t, u_int, int, int,
> >                   const char *, struct pool_allocator *);
> > +void         pool_cache_init(struct pool *);
> >  void         pool_destroy(struct pool *);
> >  void         pool_setlowat(struct pool *, int);
> >  void         pool_sethiwat(struct pool *, int);
> >
> >
>
