The branch main has been updated by glebius: URL: https://cgit.FreeBSD.org/src/commit/?id=fec8a8c7cbe4384c7e61d376f3aa5be5ac895915
commit fec8a8c7cbe4384c7e61d376f3aa5be5ac895915 Author: Gleb Smirnoff <gleb...@freebsd.org> AuthorDate: 2022-01-03 18:15:22 +0000 Commit: Gleb Smirnoff <gleb...@freebsd.org> CommitDate: 2022-01-03 18:17:46 +0000 inpcb: use global UMA zones for protocols Provide structure inpcbstorage, that holds zones and lock names for a protocol. Initialize it with global protocol init using macro INPCBSTORAGE_DEFINE(). Then, at VNET protocol init supply it as the main argument to the in_pcbinfo_init(). Each VNET pcbinfo uses its private hash, but they all use same zone to allocate and SMR section to synchronize. Note: there is kern.ipc.maxsockets sysctl, which controls UMA limit on the socket zone, which was always global. Historically same maxsockets value is applied also to every PCB zone. Important fact: you can't create a pcb without a socket! A pcb may outlive its socket, however. Given that there are multiple protocols, and only one socket zone, the per pcb zone limits seem to have little value. Under very special conditions it may trigger a little bit earlier than socket zone limit, but in most setups the socket zone limit will be triggered earlier. When VIMAGE was added to the kernel PCB zones became per-VNET. This magnified existing disbalance further: now we have multiple pcb zones in multiple vnets limited to maxsockets, but every pcb requires a socket allocated from the global zone also limited by maxsockets. IMHO, this per pcb zone limit doesn't bring any value, so this patch drops it. If anybody explains value of this limit, it can be restored very easy - just 2 lines change to in_pcbstorage_init(). Differential revision: https://reviews.freebsd.org/D33542 --- sys/netinet/in_pcb.c | 59 ++++++++++++++++++++++++++++++++---------------- sys/netinet/in_pcb.h | 45 ++++++++++++++++++++++++++++++++---- sys/netinet/ip_divert.c | 23 ++----------------- sys/netinet/raw_ip.c | 28 ++--------------------- sys/netinet/tcp_subr.c | 27 +++------------------- sys/netinet/udp_usrreq.c | 39 +++++--------------------------- 6 files changed, 93 insertions(+), 128 deletions(-) diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 03250d5101ff..97471ae7800c 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -518,19 +518,16 @@ abort_with_hash_wlock: CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); /* - * Initialize an inpcbinfo -- we should be able to reduce the number of - * arguments in time. + * Initialize an inpcbinfo - a per-VNET instance of connections db. */ -static void inpcb_dtor(void *, int, void *); -static void inpcb_fini(void *, int); void -in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, - u_int hash_nelements, int porthash_nelements, char *inpcbzone_name, - uma_init inpcbzone_init) +in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, + u_int hash_nelements, u_int porthash_nelements) { - mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF); - mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF); + mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); + mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, + NULL, MTX_DEF); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif @@ -543,16 +540,9 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); - pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), - NULL, inpcb_dtor, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, - UMA_ZONE_SMR); - uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); - uma_zone_set_warning(pcbinfo->ipi_zone, - "kern.ipc.maxsockets limit reached"); + pcbinfo->ipi_zone = pcbstor->ips_zone; + pcbinfo->ipi_portzone = pcbstor->ips_portzone; pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); - pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name, - sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr); } /* @@ -570,12 +560,41 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); - uma_zdestroy(pcbinfo->ipi_zone); - uma_zdestroy(pcbinfo->ipi_portzone); mtx_destroy(&pcbinfo->ipi_hash_lock); mtx_destroy(&pcbinfo->ipi_lock); } +/* + * Initialize a pcbstorage - per protocol zones to allocate inpcbs. + */ +static void inpcb_dtor(void *, int, void *); +static void inpcb_fini(void *, int); +void +in_pcbstorage_init(void *arg) +{ + struct inpcbstorage *pcbstor = arg; + + pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, + sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit, + inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR); + pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, + sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + uma_zone_set_smr(pcbstor->ips_portzone, + uma_zone_get_smr(pcbstor->ips_zone)); +} + +/* + * Destroy a pcbstorage - used by unloadable protocols. + */ +void +in_pcbstorage_destroy(void *arg) +{ + struct inpcbstorage *pcbstor = arg; + + uma_zdestroy(pcbstor->ips_zone); + uma_zdestroy(pcbstor->ips_portzone); +} + /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 0e87a68e81fa..1f7efd19e868 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -375,8 +375,8 @@ void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *); #ifdef _KERNEL /* - * Global data structure for each high-level protocol (UDP, TCP, ...) in both - * IPv4 and IPv6. Holds inpcb lists and information for managing them. + * Per-VNET pcb database for each high-level protocol (UDP, TCP, ...) in both + * IPv4 and IPv6. * * The pcbs are protected with SMR section and thus all lists in inpcbinfo * are CK-lists. Locking is required to insert a pcb into database. Two @@ -445,6 +445,41 @@ struct inpcbinfo { struct vnet *ipi_vnet; /* (c) */ }; +/* + * Global allocation storage for each high-level protocol (UDP, TCP, ...). + * Each corresponding per-VNET inpcbinfo points into this one. + */ +struct inpcbstorage { + uma_zone_t ips_zone; + uma_zone_t ips_portzone; + uma_init ips_pcbinit; + const char * ips_zone_name; + const char * ips_portzone_name; + const char * ips_infolock_name; + const char * ips_hashlock_name; +}; + +#define INPCBSTORAGE_DEFINE(prot, lname, zname, iname, hname) \ +static int \ +prot##_inpcb_init(void *mem, int size __unused, int flags __unused) \ +{ \ + struct inpcb *inp = mem; \ + \ + rw_init_flags(&inp->inp_lock, lname, RW_RECURSE | RW_DUPOK); \ + return (0); \ +} \ +static struct inpcbstorage prot = { \ + .ips_pcbinit = prot##_inpcb_init, \ + .ips_zone_name = zname, \ + .ips_portzone_name = zname " ports", \ + .ips_infolock_name = iname, \ + .ips_hashlock_name = hname, \ +}; \ +SYSINIT(prot##_inpcbstorage_init, SI_SUB_PROTO_DOMAIN, \ + SI_ORDER_SECOND, in_pcbstorage_init, &prot); \ +SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN, \ + SI_ORDER_SECOND, in_pcbstorage_destroy, &prot) + /* * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group * (or unique address:port combination) can be re-used at most @@ -688,9 +723,11 @@ VNET_DECLARE(int, ipport_tcpallocs); #define V_ipport_stoprandom VNET(ipport_stoprandom) #define V_ipport_tcpallocs VNET(ipport_tcpallocs) +void in_pcbinfo_init(struct inpcbinfo *, struct inpcbstorage *, + u_int, u_int); void in_pcbinfo_destroy(struct inpcbinfo *); -void in_pcbinfo_init(struct inpcbinfo *, const char *, u_int, int, char *, - uma_init); +void in_pcbstorage_init(void *); +void in_pcbstorage_destroy(void *); int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi); diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index 8f972f49b604..6c4d85b03e6a 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -117,8 +117,6 @@ VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo); static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ -static eventhandler_tag ip_divert_event_tag; - static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m, struct sockaddr_in *sin); static int div_output_outbound(int family, struct socket *so, struct mbuf *m); @@ -126,21 +124,7 @@ static int div_output_outbound(int family, struct socket *so, struct mbuf *m); /* * Initialize divert connection block queue. */ -static void -div_zone_change(void *tag) -{ - - uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); -} - -static int -div_inpcb_init(void *mem, int size, int flags) -{ - struct inpcb *inp = mem; - - INP_LOCK_INIT(inp, "inp", "divinp"); - return (0); -} +INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash"); static void div_init(void *arg __unused) @@ -151,7 +135,7 @@ div_init(void *arg __unused) * allocate one-entry hash lists than it is to check all over the * place for hashbase == NULL. */ - in_pcbinfo_init(&V_divcbinfo, "div", 1, 1, "divcb", div_inpcb_init); + in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1); } VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL); @@ -794,8 +778,6 @@ div_modevent(module_t mod, int type, void *unused) if (err != 0) return (err); ip_divert_ptr = divert_packet; - ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change, - div_zone_change, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_QUIESCE: /* @@ -829,7 +811,6 @@ div_modevent(module_t mod, int type, void *unused) #ifndef VIMAGE div_destroy(NULL); #endif - EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag); break; default: err = EOPNOTSUPP; diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index b7d338306b49..7c495745806e 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -182,37 +182,13 @@ rip_delhash(struct inpcb *inp) } #endif /* INET */ -/* - * Raw interface to IP protocol. - */ - -/* - * Initialize raw connection block q. - */ -static void -rip_zone_change(void *tag) -{ - - uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); -} - -static int -rip_inpcb_init(void *mem, int size, int flags) -{ - struct inpcb *inp = mem; - - INP_LOCK_INIT(inp, "inp", "rawinp"); - return (0); -} +INPCBSTORAGE_DEFINE(ripcbstor, "rawinp", "ripcb", "rip", "riphash"); static void rip_init(void *arg __unused) { - in_pcbinfo_init(&V_ripcbinfo, "rip", INP_PCBHASH_RAW_SIZE, 1, "ripcb", - rip_inpcb_init); - EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, - EVENTHANDLER_PRI_ANY); + in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1); } VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL); diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 3a02988e5b6d..47b6ff173afe 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1146,26 +1146,7 @@ static struct mtx isn_mtx; #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) -/* - * TCP initialization. - */ -static void -tcp_zone_change(void *tag) -{ - - uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); - uma_zone_set_max(V_tcpcb_zone, maxsockets); - tcp_tw_zone_change(); -} - -static int -tcp_inpcb_init(void *mem, int size, int flags) -{ - struct inpcb *inp = mem; - - INP_LOCK_INIT(inp, "inp", "tcpinp"); - return (0); -} +INPCBSTORAGE_DEFINE(tcpcbstor, "tcpinp", "tcp_inpcb", "tcp", "tcphash"); /* * Take a value and get the next power of 2 that doesn't overflow. @@ -1439,8 +1420,8 @@ tcp_vnet_init(void *arg __unused) printf("%s: WARNING: unable to initialise TCP stats\n", __func__); #endif - in_pcbinfo_init(&V_tcbinfo, "tcp", tcp_tcbhashsize, tcp_tcbhashsize, - "tcp_inpcb", tcp_inpcb_init); + in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize, + tcp_tcbhashsize); /* * These have to be type stable for the benefit of the timers. @@ -1526,8 +1507,6 @@ tcp_init(void *arg __unused) ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); - EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, - EVENTHANDLER_PRI_ANY); tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK); diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 582486ffbc92..ad5a2df7d4aa 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -170,33 +170,9 @@ static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *, int); #endif -static void -udp_zone_change(void *tag) -{ - - uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); - uma_zone_set_max(V_udpcb_zone, maxsockets); -} - -static int -udp_inpcb_init(void *mem, int size, int flags) -{ - struct inpcb *inp; - - inp = mem; - INP_LOCK_INIT(inp, "inp", "udpinp"); - return (0); -} - -static int -udplite_inpcb_init(void *mem, int size, int flags) -{ - struct inpcb *inp; - - inp = mem; - INP_LOCK_INIT(inp, "inp", "udpliteinp"); - return (0); -} +INPCBSTORAGE_DEFINE(udpcbstor, "udpinp", "udp_inpcb", "udp", "udphash"); +INPCBSTORAGE_DEFINE(udplitecbstor, "udpliteinp", "udplite_inpcb", "udplite", + "udplitehash"); static void udp_init(void *arg __unused) @@ -209,18 +185,15 @@ udp_init(void *arg __unused) * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. */ - in_pcbinfo_init(&V_udbinfo, "udp", UDBHASHSIZE, UDBHASHSIZE, - "udp_inpcb", udp_inpcb_init); + in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_udpcb_zone, maxsockets); uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached"); - EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, - EVENTHANDLER_PRI_ANY); /* Additional pcbinfo for UDP-Lite */ - in_pcbinfo_init(&V_ulitecbinfo, "udplite", UDBHASHSIZE, - UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init); + in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE, + UDBHASHSIZE); } VNET_SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL);