> On 22 Nov 2016, at 21:20, Mark Kettenis <mark.kette...@xs4all.nl> wrote: > >> Date: Tue, 22 Nov 2016 12:42:39 +1000 >> From: David Gwynne <da...@gwynne.id.au> >> >> right now pools that make up mbufs are each limited individually. >> >> the following diff instead has the mbuf layer have a global limit >> on the amount of memory that can be allocated to the pools. this >> is enforced by wrapping the multi page pool allocator with something >> that checks the mbuf memory limit first. >> >> this means all mbufs will use a max of 2k * nmbclust bytes instead >> of each pool being able to use that amount each. >> >> ok? > > Mostly makes sense to me. Not sure the complexity of copying the > supported page sizes from the multi-page pool allocator is worth the > additional complication. I'd probably just initialize it the same way > using POOL_ALLOC_SIZES(PAGE_SIZE, 1UL<<31, POOL_ALLOC_ALIGNED).
id have to remember to change both if the one in subr_pool gets changed though. > Wouldn't it make sense to use atomic operations to keep track of the > amount of memory that was allocated? mutexes are full of atomic ops. to conditionally add to mbuf_mem_alloc id have to use a cas loop, which isnt easy to read. eg, instead of: if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit) return (NULL); mtx_enter(&m_pool_mtx); if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit) avail = 0; else mbuf_mem_alloc += pp->pr_pgsize; mtx_leave(&m_pool_mtx); id need: unsigned int cur, new; cur = mbuf_mem_alloc; for (;;) { new = cur + pp->pr_pgsize; if (new > mbuf_mem_limit) return (NULL); new = atomic_cas_uint(&mbuf_mem_alloc, cur, new); if (new == cur) break; cur = new; } or something. the mtx may be slow, but it is a slow path id like to keep simple. > > Long run I suppose we want to drop nmbclust and let users tune the > total amount of memory available for clusters and set the initial > amount to a percentage of physical memory? i do want to move to specifying memory in terms of bytes instead of 2k clusters. how much should be available by default is a complicated issue. 
> > >> Index: sys/pool.h >> =================================================================== >> RCS file: /cvs/src/sys/sys/pool.h,v >> retrieving revision 1.68 >> diff -u -p -r1.68 pool.h >> --- sys/pool.h 21 Nov 2016 01:44:06 -0000 1.68 >> +++ sys/pool.h 22 Nov 2016 02:31:47 -0000 >> @@ -205,6 +205,7 @@ struct pool { >> #ifdef _KERNEL >> >> extern struct pool_allocator pool_allocator_single; >> +extern struct pool_allocator pool_allocator_multi; >> >> struct pool_request { >> TAILQ_ENTRY(pool_request) pr_entry; >> Index: sys/mbuf.h >> =================================================================== >> RCS file: /cvs/src/sys/sys/mbuf.h,v >> retrieving revision 1.222 >> diff -u -p -r1.222 mbuf.h >> --- sys/mbuf.h 24 Oct 2016 04:38:44 -0000 1.222 >> +++ sys/mbuf.h 22 Nov 2016 02:31:47 -0000 >> @@ -416,6 +416,7 @@ struct mbuf_queue { >> }; >> >> #ifdef _KERNEL >> +struct pool; >> >> extern int nmbclust; /* limit on the # of clusters */ >> extern int mblowat; /* mbuf low water mark */ >> @@ -444,6 +445,7 @@ int m_leadingspace(struct mbuf *); >> int m_trailingspace(struct mbuf *); >> struct mbuf *m_clget(struct mbuf *, int, u_int); >> void m_extref(struct mbuf *, struct mbuf *); >> +void m_pool_init(struct pool *, u_int, u_int, const char *); >> void m_extfree_pool(caddr_t, u_int, void *); >> void m_adj(struct mbuf *, int); >> int m_copyback(struct mbuf *, int, int, const void *, int); >> Index: kern/uipc_mbuf.c >> =================================================================== >> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v >> retrieving revision 1.238 >> diff -u -p -r1.238 uipc_mbuf.c >> --- kern/uipc_mbuf.c 9 Nov 2016 08:55:11 -0000 1.238 >> +++ kern/uipc_mbuf.c 22 Nov 2016 02:31:47 -0000 >> @@ -133,6 +133,19 @@ void m_extfree(struct mbuf *); >> void nmbclust_update(void); >> void m_zero(struct mbuf *); >> >> +struct mutex m_pool_mtx = MUTEX_INITIALIZER(IPL_NET); >> +unsigned int mbuf_mem_limit; /* how much memory can we allocated */ >> +unsigned int 
mbuf_mem_alloc; /* how much memory has been allocated */ >> + >> +void *m_pool_alloc(struct pool *, int, int *); >> +void m_pool_free(struct pool *, void *); >> + >> +struct pool_allocator m_pool_allocator = { >> + m_pool_alloc, >> + m_pool_free, >> + 0 /* will be copied from pool_allocator_multi */ >> +}; >> + >> static void (*mextfree_fns[4])(caddr_t, u_int, void *); >> static u_int num_extfree_fns; >> >> @@ -148,6 +161,11 @@ mbinit(void) >> int i; >> unsigned int lowbits; >> >> + m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz; >> + >> + nmbclust_update(); >> + mbuf_mem_alloc = 0; >> + >> #if DIAGNOSTIC >> if (mclsizes[0] != MCLBYTES) >> panic("mbinit: the smallest cluster size != MCLBYTES"); >> @@ -155,9 +173,7 @@ mbinit(void) >> panic("mbinit: the largest cluster size != MAXMCLBYTES"); >> #endif >> >> - pool_init(&mbpool, MSIZE, 0, IPL_NET, 0, "mbufpl", NULL); >> - pool_set_constraints(&mbpool, &kp_dma_contig); >> - pool_setlowat(&mbpool, mblowat); >> + m_pool_init(&mbpool, MSIZE, 64, "mbufpl"); >> >> pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0, >> IPL_NET, 0, "mtagpl", NULL); >> @@ -171,47 +187,32 @@ mbinit(void) >> snprintf(mclnames[i], sizeof(mclnames[0]), "mcl%dk", >> mclsizes[i] >> 10); >> } >> - pool_init(&mclpools[i], mclsizes[i], 64, IPL_NET, 0, >> - mclnames[i], NULL); >> - pool_set_constraints(&mclpools[i], &kp_dma_contig); >> - pool_setlowat(&mclpools[i], mcllowat); >> + >> + m_pool_init(&mclpools[i], mclsizes[i], 64, mclnames[i]); >> } >> >> (void)mextfree_register(m_extfree_pool); >> KASSERT(num_extfree_fns == 1); >> - >> - nmbclust_update(); >> } >> >> void >> mbcpuinit() >> { >> + int i; >> + >> mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT, M_DEVBUF); >> + >> + pool_cache_init(&mbpool); >> + pool_cache_init(&mtagpool); >> + >> + for (i = 0; i < nitems(mclsizes); i++) >> + pool_cache_init(&mclpools[i]); >> } >> >> void >> nmbclust_update(void) >> { >> - unsigned int i, n; >> - >> - /* >> - * Set the hard 
limit on the mclpools to the number of >> - * mbuf clusters the kernel is to support. Log the limit >> - * reached message max once a minute. >> - */ >> - for (i = 0; i < nitems(mclsizes); i++) { >> - n = (unsigned long long)nmbclust * MCLBYTES / mclsizes[i]; >> - (void)pool_sethardlimit(&mclpools[i], n, mclpool_warnmsg, 60); >> - /* >> - * XXX this needs to be reconsidered. >> - * Setting the high water mark to nmbclust is too high >> - * but we need to have enough spare buffers around so that >> - * allocations in interrupt context don't fail or mclgeti() >> - * drivers may end up with empty rings. >> - */ >> - pool_sethiwat(&mclpools[i], n); >> - } >> - pool_sethiwat(&mbpool, nmbclust); >> + mbuf_mem_limit = nmbclust * MCLBYTES; >> } >> >> /* >> @@ -1377,6 +1378,52 @@ m_dup_pkt(struct mbuf *m0, unsigned int >> fail: >> m_freem(m); >> return (NULL); >> +} >> + >> +void * >> +m_pool_alloc(struct pool *pp, int flags, int *slowdown) >> +{ >> + void *v = NULL; >> + int avail = 1; >> + >> + if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit) >> + return (NULL); >> + >> + mtx_enter(&m_pool_mtx); >> + if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit) >> + avail = 0; >> + else >> + mbuf_mem_alloc += pp->pr_pgsize; >> + mtx_leave(&m_pool_mtx); >> + >> + if (avail) { >> + v = (*pool_allocator_multi.pa_alloc)(pp, flags, slowdown); >> + >> + if (v == NULL) { >> + mtx_enter(&m_pool_mtx); >> + mbuf_mem_alloc -= pp->pr_pgsize; >> + mtx_leave(&m_pool_mtx); >> + } >> + } >> + >> + return (v); >> +} >> + >> +void >> +m_pool_free(struct pool *pp, void *v) >> +{ >> + (*pool_allocator_multi.pa_free)(pp, v); >> + >> + mtx_enter(&m_pool_mtx); >> + mbuf_mem_alloc -= pp->pr_pgsize; >> + mtx_leave(&m_pool_mtx); >> +} >> + >> +void >> +m_pool_init(struct pool *pp, u_int size, u_int align, const char *wmesg) >> +{ >> + pool_init(pp, size, align, IPL_NET, 0, wmesg, &m_pool_allocator); >> + pool_set_constraints(pp, &kp_dma_contig); >> } >> >> #ifdef DDB >> Index: dev/pci/if_myx.c >> 
=================================================================== >> RCS file: /cvs/src/sys/dev/pci/if_myx.c,v >> retrieving revision 1.99 >> diff -u -p -r1.99 if_myx.c >> --- dev/pci/if_myx.c 31 Oct 2016 01:38:57 -0000 1.99 >> +++ dev/pci/if_myx.c 22 Nov 2016 02:31:47 -0000 >> @@ -294,8 +294,6 @@ myx_attach(struct device *parent, struct >> >> /* this is sort of racy */ >> if (myx_mcl_pool == NULL) { >> - extern struct kmem_pa_mode kp_dma_contig; >> - >> myx_mcl_pool = malloc(sizeof(*myx_mcl_pool), M_DEVBUF, >> M_WAITOK); >> if (myx_mcl_pool == NULL) { >> @@ -303,9 +301,9 @@ myx_attach(struct device *parent, struct >> DEVNAME(sc)); >> goto unmap; >> } >> - pool_init(myx_mcl_pool, MYX_RXBIG_SIZE, MYX_BOUNDARY, IPL_NET, >> - 0, "myxmcl", NULL); >> - pool_set_constraints(myx_mcl_pool, &kp_dma_contig); >> + >> + m_pool_init(myx_mcl_pool, MYX_RXBIG_SIZE, MYX_BOUNDARY, >> + "myxmcl"); >> } >> >> if (myx_pcie_dc(sc, pa) != 0) >> >>