> On 22 Nov 2016, at 21:20, Mark Kettenis <mark.kette...@xs4all.nl> wrote:
> 
>> Date: Tue, 22 Nov 2016 12:42:39 +1000
>> From: David Gwynne <da...@gwynne.id.au>
>> 
>> right now pools that make up mbufs are each limited individually.
>> 
>> the following diff instead has the mbuf layer have a global limit
>> on the amount of memory that can be allocated to the pools. this
>> is enforced by wrapping the multi page pool allocator with something
>> that checks the mbuf memory limit first.
>> 
>> this means all mbufs will use a max of 2k * nmbclust bytes instead
>> of each pool being able to use that amount each.
>> 
>> ok?
> 
> Mostly makes sense to me.  Not sure the complexity of copying the
> supported page sizes from the multi-page pool allocator is worth the
> additional complication.  I'd probably just initialize it the same way
> using POOL_ALLOC_SIZES(PAGE_SIZE, 1UL<<31, POOL_ALLOC_ALIGNED).

i'd have to remember to change both if the one in subr_pool gets changed though.
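
fwiw that version would look something like the sketch below (just a sketch,
assuming the POOL_ALLOC_SIZES and POOL_ALLOC_ALIGNED macros in sys/pool.h keep
their current form):

        struct pool_allocator m_pool_allocator = {
                m_pool_alloc,
                m_pool_free,
                POOL_ALLOC_SIZES(PAGE_SIZE, 1UL<<31, POOL_ALLOC_ALIGNED),
        };

it does drop the pa_pagesz copy from mbinit(), but the supported sizes end up
hardcoded in two places instead of one.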

> Wouldn't it make sense to use atomic operations to keep track of the
> amount of memory that was allocated?

mutexes are full of atomic ops. to conditionally add to mbuf_mem_alloc i'd have
to use a cas loop, which isn't easy to read. eg, instead of:

        if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
                return (NULL);

        mtx_enter(&m_pool_mtx);
        if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
                avail = 0;
        else
                mbuf_mem_alloc += pp->pr_pgsize;
        mtx_leave(&m_pool_mtx);


i'd need:

  unsigned int cur, new;

  cur = mbuf_mem_alloc;
  for (;;) {
    new = cur + pp->pr_pgsize;
    if (new > mbuf_mem_limit)
      return (NULL);

    /* atomic_cas_uint returns the value that was actually in memory */
    new = atomic_cas_uint(&mbuf_mem_alloc, cur, new);
    if (new == cur)
      break;    /* the swap happened, the pages are accounted for */
    cur = new;  /* lost the race, retry with the observed value */
  }

or something.

the mtx may be slow, but it is a slow path i'd like to keep simple.

> 
> Long run I suppose we want to drop nmbclust and let users tune the
> total amount of memory available for clusters and set the initial
> amount to a percentage of physical memory?

i do want to move to specifying memory in terms of bytes instead of 2k clusters.

how much should be available by default is a complicated issue.
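
fwiw, a starting point could look something like the sketch below. it is
entirely hypothetical: the mbuf_mem_limit_init name and the 1/32 fraction are
made up for illustration, and physmem is in pages so ptoa() is used to turn it
into bytes.

        /* hypothetical sketch: size the default limit off physical memory */
        void
        mbuf_mem_limit_init(void)
        {
                unsigned long bytes = ptoa((unsigned long)physmem) / 32;

                /* don't go below the historical nmbclust * MCLBYTES floor */
                if (bytes < (unsigned long)nmbclust * MCLBYTES)
                        bytes = (unsigned long)nmbclust * MCLBYTES;

                mbuf_mem_limit = bytes;
        }

whatever the fraction ends up being, it would also need clamping so the
unsigned int accounting doesn't overflow on machines with lots of memory.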

> 
> 
>> Index: sys/pool.h
>> ===================================================================
>> RCS file: /cvs/src/sys/sys/pool.h,v
>> retrieving revision 1.68
>> diff -u -p -r1.68 pool.h
>> --- sys/pool.h       21 Nov 2016 01:44:06 -0000      1.68
>> +++ sys/pool.h       22 Nov 2016 02:31:47 -0000
>> @@ -205,6 +205,7 @@ struct pool {
>> #ifdef _KERNEL
>> 
>> extern struct pool_allocator pool_allocator_single;
>> +extern struct pool_allocator pool_allocator_multi;
>> 
>> struct pool_request {
>>      TAILQ_ENTRY(pool_request) pr_entry;
>> Index: sys/mbuf.h
>> ===================================================================
>> RCS file: /cvs/src/sys/sys/mbuf.h,v
>> retrieving revision 1.222
>> diff -u -p -r1.222 mbuf.h
>> --- sys/mbuf.h       24 Oct 2016 04:38:44 -0000      1.222
>> +++ sys/mbuf.h       22 Nov 2016 02:31:47 -0000
>> @@ -416,6 +416,7 @@ struct mbuf_queue {
>> };
>> 
>> #ifdef       _KERNEL
>> +struct pool;
>> 
>> extern       int nmbclust;                   /* limit on the # of clusters */
>> extern       int mblowat;                    /* mbuf low water mark */
>> @@ -444,6 +445,7 @@ int      m_leadingspace(struct mbuf *);
>> int  m_trailingspace(struct mbuf *);
>> struct mbuf *m_clget(struct mbuf *, int, u_int);
>> void m_extref(struct mbuf *, struct mbuf *);
>> +void        m_pool_init(struct pool *, u_int, u_int, const char *);
>> void m_extfree_pool(caddr_t, u_int, void *);
>> void m_adj(struct mbuf *, int);
>> int  m_copyback(struct mbuf *, int, int, const void *, int);
>> Index: kern/uipc_mbuf.c
>> ===================================================================
>> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
>> retrieving revision 1.238
>> diff -u -p -r1.238 uipc_mbuf.c
>> --- kern/uipc_mbuf.c 9 Nov 2016 08:55:11 -0000       1.238
>> +++ kern/uipc_mbuf.c 22 Nov 2016 02:31:47 -0000
>> @@ -133,6 +133,19 @@ void    m_extfree(struct mbuf *);
>> void nmbclust_update(void);
>> void m_zero(struct mbuf *);
>> 
>> +struct mutex m_pool_mtx = MUTEX_INITIALIZER(IPL_NET);
>> +unsigned int mbuf_mem_limit; /* how much memory we can allocate */
>> +unsigned int mbuf_mem_alloc; /* how much memory has been allocated */
>> +
>> +void        *m_pool_alloc(struct pool *, int, int *);
>> +void        m_pool_free(struct pool *, void *);
>> +
>> +struct pool_allocator m_pool_allocator = {
>> +    m_pool_alloc,
>> +    m_pool_free,
>> +    0 /* will be copied from pool_allocator_multi */
>> +};
>> +
>> static void (*mextfree_fns[4])(caddr_t, u_int, void *);
>> static u_int num_extfree_fns;
>> 
>> @@ -148,6 +161,11 @@ mbinit(void)
>>      int i;
>>      unsigned int lowbits;
>> 
>> +    m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz;
>> +
>> +    nmbclust_update();
>> +    mbuf_mem_alloc = 0;
>> +
>> #if DIAGNOSTIC
>>      if (mclsizes[0] != MCLBYTES)
>>              panic("mbinit: the smallest cluster size != MCLBYTES");
>> @@ -155,9 +173,7 @@ mbinit(void)
>>              panic("mbinit: the largest cluster size != MAXMCLBYTES");
>> #endif
>> 
>> -    pool_init(&mbpool, MSIZE, 0, IPL_NET, 0, "mbufpl", NULL);
>> -    pool_set_constraints(&mbpool, &kp_dma_contig);
>> -    pool_setlowat(&mbpool, mblowat);
>> +    m_pool_init(&mbpool, MSIZE, 64, "mbufpl");
>> 
>>      pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
>>          IPL_NET, 0, "mtagpl", NULL);
>> @@ -171,47 +187,32 @@ mbinit(void)
>>                      snprintf(mclnames[i], sizeof(mclnames[0]), "mcl%dk",
>>                          mclsizes[i] >> 10);
>>              }
>> -            pool_init(&mclpools[i], mclsizes[i], 64, IPL_NET, 0,
>> -                mclnames[i], NULL);
>> -            pool_set_constraints(&mclpools[i], &kp_dma_contig);
>> -            pool_setlowat(&mclpools[i], mcllowat);
>> +
>> +            m_pool_init(&mclpools[i], mclsizes[i], 64, mclnames[i]);
>>      }
>> 
>>      (void)mextfree_register(m_extfree_pool);
>>      KASSERT(num_extfree_fns == 1);
>> -
>> -    nmbclust_update();
>> }
>> 
>> void
>> mbcpuinit()
>> {
>> +    int i;
>> +
>>      mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT, M_DEVBUF);
>> +
>> +    pool_cache_init(&mbpool);
>> +    pool_cache_init(&mtagpool);
>> +
>> +    for (i = 0; i < nitems(mclsizes); i++)
>> +            pool_cache_init(&mclpools[i]);
>> }
>> 
>> void
>> nmbclust_update(void)
>> {
>> -    unsigned int i, n;
>> -
>> -    /*
>> -     * Set the hard limit on the mclpools to the number of
>> -     * mbuf clusters the kernel is to support.  Log the limit
>> -     * reached message max once a minute.
>> -     */
>> -    for (i = 0; i < nitems(mclsizes); i++) {
>> -            n = (unsigned long long)nmbclust * MCLBYTES / mclsizes[i];
>> -            (void)pool_sethardlimit(&mclpools[i], n, mclpool_warnmsg, 60);
>> -            /*
>> -             * XXX this needs to be reconsidered.
>> -             * Setting the high water mark to nmbclust is too high
>> -             * but we need to have enough spare buffers around so that
>> -             * allocations in interrupt context don't fail or mclgeti()
>> -             * drivers may end up with empty rings.
>> -             */
>> -            pool_sethiwat(&mclpools[i], n);
>> -    }
>> -    pool_sethiwat(&mbpool, nmbclust);
>> +    mbuf_mem_limit = nmbclust * MCLBYTES;
>> }
>> 
>> /*
>> @@ -1377,6 +1378,52 @@ m_dup_pkt(struct mbuf *m0, unsigned int 
>> fail:
>>      m_freem(m);
>>      return (NULL);
>> +}
>> +
>> +void *
>> +m_pool_alloc(struct pool *pp, int flags, int *slowdown)
>> +{
>> +    void *v = NULL;
>> +    int avail = 1;
>> +
>> +    if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
>> +            return (NULL);
>> +
>> +    mtx_enter(&m_pool_mtx);
>> +    if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
>> +            avail = 0;
>> +    else
>> +            mbuf_mem_alloc += pp->pr_pgsize;
>> +    mtx_leave(&m_pool_mtx);
>> +
>> +    if (avail) {
>> +            v = (*pool_allocator_multi.pa_alloc)(pp, flags, slowdown);
>> +
>> +            if (v == NULL) {
>> +                    mtx_enter(&m_pool_mtx);
>> +                    mbuf_mem_alloc -= pp->pr_pgsize;
>> +                    mtx_leave(&m_pool_mtx);
>> +            }
>> +    }
>> +
>> +    return (v);
>> +}
>> +
>> +void
>> +m_pool_free(struct pool *pp, void *v)
>> +{
>> +    (*pool_allocator_multi.pa_free)(pp, v);
>> +
>> +    mtx_enter(&m_pool_mtx);
>> +    mbuf_mem_alloc -= pp->pr_pgsize;
>> +    mtx_leave(&m_pool_mtx);
>> +}
>> +
>> +void
>> +m_pool_init(struct pool *pp, u_int size, u_int align, const char *wmesg)
>> +{
>> +    pool_init(pp, size, align, IPL_NET, 0, wmesg, &m_pool_allocator);
>> +    pool_set_constraints(pp, &kp_dma_contig);
>> }
>> 
>> #ifdef DDB
>> Index: dev/pci/if_myx.c
>> ===================================================================
>> RCS file: /cvs/src/sys/dev/pci/if_myx.c,v
>> retrieving revision 1.99
>> diff -u -p -r1.99 if_myx.c
>> --- dev/pci/if_myx.c 31 Oct 2016 01:38:57 -0000      1.99
>> +++ dev/pci/if_myx.c 22 Nov 2016 02:31:47 -0000
>> @@ -294,8 +294,6 @@ myx_attach(struct device *parent, struct
>> 
>>      /* this is sort of racy */
>>      if (myx_mcl_pool == NULL) {
>> -            extern struct kmem_pa_mode kp_dma_contig;
>> -
>>              myx_mcl_pool = malloc(sizeof(*myx_mcl_pool), M_DEVBUF,
>>                  M_WAITOK);
>>              if (myx_mcl_pool == NULL) {
>> @@ -303,9 +301,9 @@ myx_attach(struct device *parent, struct
>>                          DEVNAME(sc));
>>                      goto unmap;
>>              }
>> -            pool_init(myx_mcl_pool, MYX_RXBIG_SIZE, MYX_BOUNDARY, IPL_NET,
>> -                0, "myxmcl", NULL);
>> -            pool_set_constraints(myx_mcl_pool, &kp_dma_contig);
>> +
>> +            m_pool_init(myx_mcl_pool, MYX_RXBIG_SIZE, MYX_BOUNDARY,
>> +                "myxmcl");
>>      }
>> 
>>      if (myx_pcie_dc(sc, pa) != 0)
>> 
>> 
