svn commit: r368819 - in head: share/man/man4 sys/netinet sys/netinet6
Author: gallatin Date: Sat Dec 19 22:04:46 2020 New Revision: 368819 URL: https://svnweb.freebsd.org/changeset/base/368819 Log: Filter TCP connections to SO_REUSEPORT_LB listen sockets by NUMA domain In order to efficiently serve web traffic on a NUMA machine, one must avoid as many NUMA domain crossings as possible. With SO_REUSEPORT_LB, a number of workers can share a listen socket. However, even if a worker sets affinity to a core or set of cores on a NUMA domain, it will receive connections associated with all NUMA domains in the system. This will lead to cross-domain traffic when the server writes to the socket or calls sendfile(), and memory is allocated on the server's local NUMA node, but transmitted on the NUMA node associated with the TCP connection. Similarly, when the server reads from the socket, he will likely be reading memory allocated on the NUMA domain associated with the TCP connection. This change provides a new socket ioctl, TCP_REUSPORT_LB_NUMA. A server can now tell the kernel to filter traffic so that only incoming connections associated with the desired NUMA domain are given to the server. (Of course, in the case where there are no servers sharing the listen socket on some domain, then as a fallback, traffic will be hashed as normal to all servers sharing the listen socket regardless of domain). This allows a server to deal only with traffic that is local to its NUMA domain, and avoids cross-domain traffic in most cases. This patch, and a corresponding small patch to nginx to use TCP_REUSPORT_LB_NUMA allows us to serve 190Gb/s of kTLS encrypted https media content from dual-socket Xeons with only 13% (as measured by pcm.x) cross domain traffic on the memory controller. 
Reviewed by: jhb, bz (earlier version), bcr (man page) Tested by: gonzo Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D21636 Modified: head/share/man/man4/tcp.4 head/sys/netinet/in_pcb.c head/sys/netinet/in_pcb.h head/sys/netinet/tcp.h head/sys/netinet/tcp_usrreq.c head/sys/netinet6/in6_pcb.c head/sys/netinet6/in6_pcb.h Modified: head/share/man/man4/tcp.4 == --- head/share/man/man4/tcp.4 Sat Dec 19 21:46:09 2020(r368818) +++ head/share/man/man4/tcp.4 Sat Dec 19 22:04:46 2020(r368819) @@ -34,7 +34,7 @@ .\" From: @(#)tcp.48.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd November 25, 2020 +.Dd December 19, 2020 .Dt TCP 4 .Os .Sh NAME @@ -314,6 +314,21 @@ Enable in-kernel TLS for data read from this socket. See .Xr ktls 4 for more details. +.It Dv TCP_REUSPORT_LB_NUMA +Changes NUMA affinity filtering for an established TCP listen +socket. +This option takes a single integer argument which specifies +the NUMA domain to filter on for this listen socket. +The argument can also have the follwing special values: +.Bl -tag -width "Dv TCP_REUSPORT_LB_NUMA" +.It Dv TCP_REUSPORT_LB_NUMA_NODOM +Remove NUMA filtering for this listen socket. +.It Dv TCP_REUSPORT_LB_NUMA_CURDOM +Filter traffic associated with the domain where the calling thread is +currently executing. +This is typically used after a process or thread inherits a listen +socket from its parent, and sets its CPU affinity to a particular core. 
+.El .El .Pp The option level for the Modified: head/sys/netinet/in_pcb.c == --- head/sys/netinet/in_pcb.c Sat Dec 19 21:46:09 2020(r368818) +++ head/sys/netinet/in_pcb.c Sat Dec 19 22:04:46 2020(r368819) @@ -75,6 +75,7 @@ __FBSDID("$FreeBSD$"); #endif #include +#include #include #include @@ -150,7 +151,8 @@ static void in_pcbremlists(struct inpcb *inp); static struct inpcb*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, - int lookupflags, struct ifnet *ifp); + int lookupflags, struct ifnet *ifp, + uint8_t numa_domain); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ @@ -248,7 +250,8 @@ SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, -uint16_t port, const union in_dependaddr *addr, int size) +uint16_t port, const union in_dependaddr *addr, int size, +uint8_t numa_domain) { struct inpcblbgroup *grp; size_t bytes; @@ -259,6 +262,7 @@ in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_ch return (NULL); grp->il_vflag = vflag; grp->il_lport = port; + grp->il_numa_domain = numa_domain; grp->il_dependladdr = *addr; grp->il_inpsiz = size;
svn commit: r368818 - head/sys/kern
Author: gallatin Date: Sat Dec 19 21:46:09 2020 New Revision: 368818 URL: https://svnweb.freebsd.org/changeset/base/368818 Log: Optionally bind ktls threads to NUMA domains When ktls_bind_thread is 2, we pick a ktls worker thread that is bound to the same domain as the TCP connection associated with the socket. We use roughly the same code as netinet/tcp_hpts.c to do this. This allows crypto to run on the same domain as the TCP connection is associated with. Assuming TCP_REUSPORT_LB_NUMA (D21636) is in place & in use, this ensures that the crypto source and destination buffers are local to the same NUMA domain as we're running crypto on. This change (when TCP_REUSPORT_LB_NUMA, D21636, is used) reduces cross-domain traffic from over 37% down to about 13% as measured by pcm.x on a dual-socket Xeon using nginx and a Netflix workload. Reviewed by: jhb Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D21648 Modified: head/sys/kern/uipc_ktls.c Modified: head/sys/kern/uipc_ktls.c == --- head/sys/kern/uipc_ktls.c Sat Dec 19 14:54:28 2020(r368817) +++ head/sys/kern/uipc_ktls.c Sat Dec 19 21:46:09 2020(r368818) @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -83,6 +84,12 @@ struct ktls_wq { boolrunning; } __aligned(CACHE_LINE_SIZE); +struct ktls_domain_info { + int count; + int cpu[MAXCPU]; +}; + +struct ktls_domain_info ktls_domains[MAXMEMDOM]; static struct ktls_wq *ktls_wq; static struct proc *ktls_proc; LIST_HEAD(, ktls_crypto_backend) ktls_backends; @@ -316,6 +323,9 @@ static u_int ktls_get_cpu(struct socket *so) { struct inpcb *inp; +#ifdef NUMA + struct ktls_domain_info *di; +#endif u_int cpuid; inp = sotoinpcb(so); @@ -330,7 +340,13 @@ ktls_get_cpu(struct socket *so) * serialization provided by having the same connection use * the same queue. 
*/ - cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; +#ifdef NUMA + if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) { + di = _domains[inp->inp_numa_domain]; + cpuid = di->cpu[inp->inp_flowid % di->count]; + } else +#endif + cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; return (cpuid); } #endif @@ -341,7 +357,7 @@ ktls_init(void *dummy __unused) struct thread *td; struct pcpu *pc; cpuset_t mask; - int error, i; + int count, domain, error, i; ktls_tasks_active = counter_u64_alloc(M_WAITOK); ktls_cnt_tx_queued = counter_u64_alloc(M_WAITOK); @@ -397,7 +413,11 @@ ktls_init(void *dummy __unused) if (ktls_bind_threads) { if (ktls_bind_threads > 1) { pc = pcpu_find(i); - CPU_COPY(_domain[pc->pc_domain], ); + domain = pc->pc_domain; + CPU_COPY(_domain[domain], ); + count = ktls_domains[domain].count; + ktls_domains[domain].cpu[count] = i; + ktls_domains[domain].count++; } else { CPU_SETOF(i, ); } @@ -410,6 +430,18 @@ ktls_init(void *dummy __unused) ktls_cpuid_lookup[ktls_number_threads] = i; ktls_number_threads++; } + + /* +* If we somehow have an empty domain, fall back to choosing +* among all KTLS threads. +*/ + for (i = 0; i < vm_ndomains; i++) { + if (ktls_domains[i].count == 0) { + ktls_bind_threads = 0; + break; + } + } + printf("KTLS: Initialized %d threads\n", ktls_number_threads); } SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL); @@ -2093,6 +2125,10 @@ ktls_work_thread(void *ctx) STAILQ_HEAD(, mbuf) local_m_head; STAILQ_HEAD(, socket) local_so_head; + if (ktls_bind_threads > 1) { + curthread->td_domain.dr_policy = + DOMAINSET_PREF(PCPU_GET(domain)); + } #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r368721 - head/stand/efi/loader
On 12/17/20 2:49 PM, Kyle Evans wrote: On Thu, Dec 17, 2020 at 1:47 PM Andrew Gallatin wrote: On 12/17/20 12:02 PM, Warner Losh wrote: Author: imp Date: Thu Dec 17 17:02:09 2020 New Revision: 368721 URL: https://urldefense.com/v3/__https://svnweb.freebsd.org/changeset/base/368721__;!!OToaGQ!5c1mLnhtRtEV6Cv_MTWpzXWaGZEYYDp4TJ6wVDzjVZiehAItts7ZWC15uNnQYRa5Fg$ Log: Drop EFI_STAGING_SIZE back down to 64M vmware can't cope with anything larger than 64MB. Drop this back to 64MB everywhere but arm. There were all kinds of booting problems before this was bumped up. In fact, I still have EFI_STAGING_SIZE=128 in src.conf because I needed it to be able to boot when using Nvidia graphics. By reducing this, I feel like we're just playing whack-a-mole. IIRC those have long since become OBE as we'll now grow the staging area to accommodate nvidia. Ah, OK. cool! Thanks & sorry for the noise. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r368721 - head/stand/efi/loader
On 12/17/20 12:02 PM, Warner Losh wrote: Author: imp Date: Thu Dec 17 17:02:09 2020 New Revision: 368721 URL: https://urldefense.com/v3/__https://svnweb.freebsd.org/changeset/base/368721__;!!OToaGQ!5c1mLnhtRtEV6Cv_MTWpzXWaGZEYYDp4TJ6wVDzjVZiehAItts7ZWC15uNnQYRa5Fg$ Log: Drop EFI_STAGING_SIZE back down to 64M vmware can't cope with anything larger than 64MB. Drop this back to 64MB everywhere but arm. There were all kinds of booting problems before this was bumped up. In fact, I still have EFI_STAGING_SIZE=128 in src.conf because I needed it to be able to boot when using Nvidia graphics. By reducing this, I feel like we're just playing whack-a-mole. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r367797 - head/sys/net
Author: gallatin Date: Wed Nov 18 14:55:49 2020 New Revision: 367797 URL: https://svnweb.freebsd.org/changeset/base/367797 Log: LACP: When suppressing distributing, return ENOBUFS When links come and go, lacp goes into a "suppress distributing" mode where it drops traffic for 3 seconds. When in this mode, lagg/lacp historically drops traffic with ENETDOWN. That return value causes TCP to close any connection where it gets that value back from the lower parts of the stack. This means that any TCP connection with active traffic during a 3-second window when an LACP link comes or goes would get closed. TCP treats return values of ENOBUFS as transient errors, and re-schedules transmission later. So rather than returning ENETDOWN, let's return ENOBUFS instead. This allows TCP connections to be preserved. I've tested this by repeatedly bouncing links on a Netflix CDN server under a moderate (20Gb/s) load and observed ENOBUFS reported back to the TCP stack (as reported by a RACK TCP sysctl). Reviewed by: jhb, jtl, rrs Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D27188 Modified: head/sys/net/ieee8023ad_lacp.c head/sys/net/ieee8023ad_lacp.h head/sys/net/if_lagg.c Modified: head/sys/net/ieee8023ad_lacp.c == --- head/sys/net/ieee8023ad_lacp.c Wed Nov 18 14:54:55 2020 (r367796) +++ head/sys/net/ieee8023ad_lacp.c Wed Nov 18 14:55:49 2020 (r367797) @@ -832,7 +832,8 @@ lacp_stop(struct lagg_softc *sc) } struct lagg_port * -lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t hash, uint8_t numa_domain) +lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t hash, +uint8_t numa_domain, int *err) { struct lacp_softc *lsc = LACP_SOFTC(sc); struct lacp_portmap *pm; @@ -842,12 +843,14 @@ lacp_select_tx_port_by_hash(struct lagg_softc *sc, uin if (__predict_false(lsc->lsc_suppress_distributing)) { LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); + *err = ENOBUFS; return (NULL); } pm = >lsc_pmap[lsc->lsc_activemap]; if (pm->pm_count == 0) { 
LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__)); + *err = ENETDOWN; return (NULL); } @@ -879,7 +882,7 @@ lacp_select_tx_port_by_hash(struct lagg_softc *sc, uin } struct lagg_port * -lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) +lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m, int *err) { struct lacp_softc *lsc = LACP_SOFTC(sc); uint32_t hash; @@ -892,7 +895,7 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey); numa_domain = m->m_pkthdr.numa_domain; - return (lacp_select_tx_port_by_hash(sc, hash, numa_domain)); + return (lacp_select_tx_port_by_hash(sc, hash, numa_domain, err)); } /* Modified: head/sys/net/ieee8023ad_lacp.h == --- head/sys/net/ieee8023ad_lacp.h Wed Nov 18 14:54:55 2020 (r367796) +++ head/sys/net/ieee8023ad_lacp.h Wed Nov 18 14:55:49 2020 (r367797) @@ -292,8 +292,10 @@ struct lacp_softc { #define LACP_LOCK_ASSERT(_lsc) mtx_assert(&(_lsc)->lsc_mtx, MA_OWNED) struct mbuf*lacp_input(struct lagg_port *, struct mbuf *); -struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *); -struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t, uint8_t); +struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *, +int *); +struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t, +uint8_t, int *); void lacp_attach(struct lagg_softc *); void lacp_detach(void *); void lacp_init(struct lagg_softc *); Modified: head/sys/net/if_lagg.c == --- head/sys/net/if_lagg.c Wed Nov 18 14:54:55 2020(r367796) +++ head/sys/net/if_lagg.c Wed Nov 18 14:55:49 2020(r367797) @@ -1763,6 +1763,7 @@ lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid struct lagg_port *lp; struct lagg_lb *lb; uint32_t hash, p; + int err; sc = ifp->if_softc; @@ -1783,7 +1784,7 @@ lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid flowtype == M_HASHTYPE_NONE) return (NULL); hash = flowid >> sc->flowid_shift; - return 
(lacp_select_tx_port_by_hash(sc, hash, numa_domain)); + return (lacp_select_tx_port_by_hash(sc, hash, numa_domain, )); default: return (NULL); } @@ -2580,12 +2581,13 @@ static int
Re: svn commit: r367288 - head/sys/compat/linux
On 11/2/20 8:19 PM, Conrad Meyer wrote: Log: linux(4): Emulate Linux SOL_SOCKET:SO_PASSCRED This is required by some major linux applications, such as Chrome and Firefox. (As well as Electron-using applications, which are essentially a bundled version of Chrome.) Awesome! Does this get electron apps working? Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r365071 - in head/sys: net net/altq net/route net80211 netgraph netgraph/atm netgraph/atm/ccatm netgraph/atm/sscfu netgraph/atm/sscop netgraph/atm/uni netgraph/bluetooth/common netgrap
On 2020-09-02 22:42, Alexey Dokuchaev wrote: I want to understand which rules have to be followed (and why). In general, FreeBSD code we write should follow style(9); it specifically mentions "do not add whitespace at the end of a line" and "... followed by one blank line" but doesn't go as far as explicitly forbidding multiple consecutive newlines. To me it's pretty obvious, and while others might have different sens esthe'tique, usually it is lack thereof (no offense) or mere ignorance. ./danfe P.S. Old-school tools like indent(1) or `uncrustify' were never widely popular, I guess, because they did not possess enough knowledge of the language to always produce correct results. Perhaps new era tools, like clang-format, could bring this to a whole new level. I do the upstream sync between the Netflix tree and FreeBSD-current about every 3 weeks (unless glebius beats me to the punch and does it first :). I anticipate that this blank line sweep will cause lots of conflicts for us. I understand this is progress, and I don't object, and I'm not asking for a revert, but please understand that cleanups like this do have hidden costs. I expect that other commercial entities who contribute to FreeBSD will have the same issue, and I also anticipate it will cause problems with MFCs. Rather than doing more sweeps like this, is it possible to come up with a clang-format rule that's 95% of style(9), do just one more sweep of the tree to apply that rule, add that rule as a pre-commit hook, and be done forever with style(9) related changes? Thanks, Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r365331 - head/sys/kern
Author: gallatin Date: Fri Sep 4 17:36:15 2020 New Revision: 365331 URL: https://svnweb.freebsd.org/changeset/base/365331 Log: ktls: Check for a NULL send tag in ktls_cleanup() When using ifnet ktls, and when ktls_reset_send_tag() fails to allocate a replacement tag, it leaves the tls session's snd_tag pointer NULL. ktls_cleanup() tries to release the send tag, and will trip over this NULL pointer and panic unless NULL is checked for. Reviewed by: jhb Sponsored by: Netflix Modified: head/sys/kern/uipc_ktls.c Modified: head/sys/kern/uipc_ktls.c == --- head/sys/kern/uipc_ktls.c Fri Sep 4 13:19:18 2020(r365330) +++ head/sys/kern/uipc_ktls.c Fri Sep 4 17:36:15 2020(r365331) @@ -680,7 +680,8 @@ ktls_cleanup(struct ktls_session *tls) counter_u64_add(ktls_ifnet_gcm, -1); break; } - m_snd_tag_rele(tls->snd_tag); + if (tls->snd_tag != NULL) + m_snd_tag_rele(tls->snd_tag); break; #ifdef TCP_OFFLOAD case TCP_TLS_MODE_TOE: ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r364986 - head/sys/kern
Author: gallatin Date: Mon Aug 31 13:53:14 2020 New Revision: 364986 URL: https://svnweb.freebsd.org/changeset/base/364986 Log: make m_getm2() resilient to zone_jumbop exhaustion When the zone_jumbop is exhausted, most things using sosend* (like sshd) will eventually fail or hang if allocations are limited to the depleted jumbop zone. This makes it impossible to communicate with a box which is under an attack which exhausts the jumbop zone. Rather than depending on the page size zone, also try cluster allocations to satisfy larger requests. This allows me to ssh to, and serve 100Gb/s of traffic from a server which is under attack and has had its page-sized zone exhausted. Reviewed by: glebius, markj, rmacklem Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D26150 Modified: head/sys/kern/kern_mbuf.c Modified: head/sys/kern/kern_mbuf.c == --- head/sys/kern/kern_mbuf.c Mon Aug 31 12:14:20 2020(r364985) +++ head/sys/kern/kern_mbuf.c Mon Aug 31 13:53:14 2020(r364986) @@ -1423,21 +1423,28 @@ m_getm2(struct mbuf *m, int len, int how, short type, /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { - if (len > MCLBYTES) - mb = m_getjcl(how, type, (flags & M_PKTHDR), + mb = NULL; + if (len > MCLBYTES) { + mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR), MJUMPAGESIZE); - else if (len >= MINCLSIZE) - mb = m_getcl(how, type, (flags & M_PKTHDR)); - else if (flags & M_PKTHDR) - mb = m_gethdr(how, type); - else - mb = m_get(how, type); - /* Fail the whole operation if one mbuf can't be allocated. */ + } if (mb == NULL) { - if (nm != NULL) + if (len >= MINCLSIZE) + mb = m_getcl(how, type, (flags & M_PKTHDR)); + else if (flags & M_PKTHDR) + mb = m_gethdr(how, type); + else + mb = m_get(how, type); + + /* +* Fail the whole operation if one mbuf can't be +* allocated. +*/ + if (mb == NULL) { m_freem(nm); - return (NULL); + return (NULL); + } } /* Book keeping. 
*/ ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r364460 - head/sys/vm
Author: gallatin Date: Fri Aug 21 18:31:57 2020 New Revision: 364460 URL: https://svnweb.freebsd.org/changeset/base/364460 Log: uma: record allocation failures due to zone limits The zone limit mechanism was recently reworked, and allocation failures due to limits being exceeded were inadvertently no longer being recorded. This would lead to, for example, mbuf allocation failures not being indicated in netstat -m or vmstat -z Reviewed by: markj Sponsored by: Netflix Modified: head/sys/vm/uma_core.c Modified: head/sys/vm/uma_core.c == --- head/sys/vm/uma_core.c Fri Aug 21 17:45:17 2020(r364459) +++ head/sys/vm/uma_core.c Fri Aug 21 18:31:57 2020(r364460) @@ -3952,8 +3952,10 @@ zone_alloc_item(uma_zone_t zone, void *udata, int doma { void *item; - if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) + if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) { + counter_u64_add(zone->uz_fails, 1); return (NULL); + } /* Avoid allocs targeting empty domains. */ if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r364405 - in head/sys/netinet: . tcp_stacks
Author: gallatin Date: Wed Aug 19 17:59:06 2020 New Revision: 364405 URL: https://svnweb.freebsd.org/changeset/base/364405 Log: TCP: remove special treatment for hardware (ifnet) TLS Remove most special treatment for ifnet TLS in the TCP stack, except for code to avoid mixing handshakes and bulk data. This code made heroic efforts to send down entire TLS records to NICs. It was added to improve the PCIe bus efficiency of older TLS offload NICs which did not keep state per-session, and so would need to re-DMA the first part(s) of a TLS record if a TLS record was sent in multiple TCP packets or TSOs. Newer TLS offload NICs do not need this feature. At Netflix, we've run extensive QoE tests which show that this feature reduces client quality metrics, presumably because the effort to send TLS records atomically causes the server to both wait too long to send data (leading to buffers running dry), and to send too much data at once (leading to packet loss). Reviewed by: hselasky, jhb, rrs Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D26103 Modified: head/sys/netinet/tcp_output.c head/sys/netinet/tcp_stacks/bbr.c head/sys/netinet/tcp_stacks/rack.c Modified: head/sys/netinet/tcp_output.c == --- head/sys/netinet/tcp_output.c Wed Aug 19 17:52:06 2020 (r364404) +++ head/sys/netinet/tcp_output.c Wed Aug 19 17:59:06 2020 (r364405) @@ -1957,17 +1957,6 @@ tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *ple *pkthdrlen = len_cp; break; } - - /* -* Don't end a send in the middle of a TLS -* record if it spans multiple TLS records. 
-*/ - if (tls != NULL && (m != start) && len < m->m_len) { - *plen = len_cp; - if (pkthdrlen != NULL) - *pkthdrlen = len_cp; - break; - } } #endif mlen = min(len, m->m_len - off); Modified: head/sys/netinet/tcp_stacks/bbr.c == --- head/sys/netinet/tcp_stacks/bbr.c Wed Aug 19 17:52:06 2020 (r364404) +++ head/sys/netinet/tcp_stacks/bbr.c Wed Aug 19 17:59:06 2020 (r364405) @@ -38,7 +38,6 @@ __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_ratelimit.h" -#include "opt_kern_tls.h" #include #include #include @@ -52,9 +51,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#ifdef KERN_TLS -#include -#endif #include #include #ifdef STATS @@ -4600,15 +4596,6 @@ bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, bbr_set_state(tp, bbr, 0); BBR_STAT_INC(bbr_tlp_tot); maxseg = tp->t_maxseg - bbr->rc_last_options; -#ifdef KERN_TLS - if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { - /* -* For hardware TLS we do *not* want to send -* new data. -*/ - goto need_retran; - } -#endif /* * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. @@ -5802,8 +5789,6 @@ tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t c * Note we do set anything TSO size until we are past the initial * window. Before that we gnerally use either a single MSS * or we use the full IW size (so we burst a IW at a time) -* Also note that Hardware-TLS is special and does alternate -* things to minimize PCI Bus Bandwidth use. 
*/ if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) { @@ -5811,19 +5796,12 @@ tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t c } else { maxseg = BBR_MIN_SEG - bbr->rc_last_options; } -#ifdef KERN_TLS - if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { - tls_seg = ctf_get_opt_tls_size(bbr->rc_inp->inp_socket, bbr->rc_tp->snd_wnd); - bbr->r_ctl.rc_pace_min_segs = (tls_seg + bbr->rc_last_options); - } -#endif old_tso = bbr->r_ctl.rc_pace_max_segs; if (bbr->rc_past_init_win == 0) { /* * Not enough data has been acknowledged to make a -* judgement unless we are hardware TLS. Set up -* the initial TSO based on if we are sending a -* full IW at once or not. +* judgement. Set up the initial TSO based on if we +* are sending a full IW at once or not. */
svn commit: r362789 - head/sys/kern
Author: gallatin Date: Mon Jun 29 21:35:50 2020 New Revision: 362789 URL: https://svnweb.freebsd.org/changeset/base/362789 Log: Fix a panic when unloading firmware LIST_FOREACH_SAFE() is not safe in the presence of other threads removing list entries when a mutex is released. This is not in the critical path, so just restart the scan each time we drop the lock, rather than using a marker. Reviewed by: jhb, markj Sponsored by: Netflix Modified: head/sys/kern/subr_firmware.c Modified: head/sys/kern/subr_firmware.c == --- head/sys/kern/subr_firmware.c Mon Jun 29 19:30:35 2020 (r362788) +++ head/sys/kern/subr_firmware.c Mon Jun 29 21:35:50 2020 (r362789) @@ -394,14 +394,12 @@ EVENTHANDLER_DEFINE(mountroot, firmware_mountroot, NUL static void unloadentry(void *unused1, int unused2) { - struct priv_fw *fp, *tmp; + struct priv_fw *fp; int err; - bool changed; mtx_lock(_mtx); - changed = false; restart: - LIST_FOREACH_SAFE(fp, _table, link, tmp) { + LIST_FOREACH(fp, _table, link) { if (fp->file == NULL || fp->refcnt != 0 || (fp->flags & FW_UNLOAD) == 0) continue; @@ -412,7 +410,6 @@ restart: * 2. clear FW_UNLOAD so we don't try this entry again. * 3. release the lock while trying to unload the module. */ - changed = true; fp->flags &= ~FW_UNLOAD;/* do not try again */ /* @@ -422,9 +419,11 @@ restart: mtx_unlock(_mtx); err = linker_release_module(NULL, NULL, fp->file); mtx_lock(_mtx); - } - if (changed) { - changed = false; + + /* +* When we dropped the lock, another thread could have +* removed an element, so we must restart the scan. +*/ goto restart; } mtx_unlock(_mtx); ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r362112 - head/sys/x86/x86
Author: gallatin Date: Fri Jun 12 18:41:12 2020 New Revision: 362112 URL: https://svnweb.freebsd.org/changeset/base/362112 Log: x86: Bump default msi/msix vector limit to 2048 Given that 64c/128t CPUs are currently available, and that many devices (nvme, many NICs) desire to map 1 MSI-X vector per core, or even 1 per-thread, it is becoming far easier to see MSI-X interrupt setup fail due to msi vector exhaustion, and devices fail to attach at boot on large systems. This bump costs 12KB on amd64 (and 6KB on i386), which seems worth the trade off for a better out of the box experience on high end hardware. Reviewed by: jhb MFC after:21 days Sponsored by: Netflix Modified: head/sys/x86/x86/msi.c Modified: head/sys/x86/x86/msi.c == --- head/sys/x86/x86/msi.c Fri Jun 12 18:13:32 2020(r362111) +++ head/sys/x86/x86/msi.c Fri Jun 12 18:41:12 2020(r362112) @@ -156,7 +156,7 @@ u_int first_msi_irq; SYSCTL_UINT(_machdep, OID_AUTO, first_msi_irq, CTLFLAG_RD, _msi_irq, 0, "Number of first IRQ reserved for MSI and MSI-X interrupts"); -u_int num_msi_irqs = 512; +u_int num_msi_irqs = 2048; SYSCTL_UINT(_machdep, OID_AUTO, num_msi_irqs, CTLFLAG_RDTUN, _msi_irqs, 0, "Number of IRQs reserved for MSI and MSI-X interrupts"); ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r347418 - head/sys/net
On 2020-05-19 04:21, Kristof Provost wrote: The if_bnxt driver initialises |.isc_nrxd_max = {INT32_MAX, INT32_MAX, INT32_MAX},|, so presumably that’s the cause. I don’t know what a sane value would be though. I’ve defaulted to 4096 (because that’s what some other iflib users seems to do) for now, and that seems to work. It doesn’t panic and I can get traffic through it at least: You seem to be setting the max, not the default, and 4K max descriptors on a 100g device is going to basically cripple it. How about setting to the next power of 2 below max int so as to keep with the author's intent? If we don't already have a macro, something like (INT32_MAX >> 1) + 1 Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r360982 - head/sys/netinet6
Author: gallatin Date: Tue May 12 17:18:44 2020 New Revision: 360982 URL: https://svnweb.freebsd.org/changeset/base/360982 Log: IPv6: Fix a panic in the nd6 code with unmapped mbufs. If the neighbor entry for an IPv6 TCP session using unmapped mbufs times out, IPv6 will send an icmp6 dest. unreachable message. In doing this, it will try to do a software checksum on the reflected packet. If this is a TCP session using unmapped mbufs, then there will be a kernel panic. To fix this, just free packets with unmapped mbufs, rather than sending the icmp. Reviewed by: np, rrs Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D24821 Modified: head/sys/netinet6/nd6.c Modified: head/sys/netinet6/nd6.c == --- head/sys/netinet6/nd6.c Tue May 12 17:07:28 2020(r360981) +++ head/sys/netinet6/nd6.c Tue May 12 17:18:44 2020(r360982) @@ -821,9 +821,27 @@ nd6_llinfo_timer(void *arg) clear_llinfo_pqueue(ln); } nd6_free(, 0); - if (m != NULL) - icmp6_error2(m, ICMP6_DST_UNREACH, - ICMP6_DST_UNREACH_ADDR, 0, ifp); + if (m != NULL) { + struct mbuf *n = m; + + /* +* if there are any ummapped mbufs, we +* must free them, rather than using +* them for an ICMP, as they cannot be +* checksummed. +*/ + while ((n = n->m_next) != NULL) { + if (n->m_flags & M_EXTPG) + break; + } + if (n != NULL) { + m_freem(m); + m = NULL; + } else { + icmp6_error2(m, ICMP6_DST_UNREACH, + ICMP6_DST_UNREACH_ADDR, 0, ifp); + } + } } break; case ND6_LLINFO_REACHABLE: ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r360961 - head/sys/netinet6
Author: gallatin Date: Tue May 12 14:01:12 2020 New Revision: 360961 URL: https://svnweb.freebsd.org/changeset/base/360961 Log: IPv6: sync IP_NO_SND_TAG_RL support from IPv4 The IP_NO_SND_TAG_RL flag to ip{,6}_output() means that the packets being sent should bypass hardware rate limiting. This is typically used by modern TCP stacks for rexmits. This support was added to IPv4 in r352657, but never added to IPv6, even though rack and bbr call ip6_output() with this flag. Reviewed by: rrs Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D24822 Modified: head/sys/netinet6/ip6_output.c Modified: head/sys/netinet6/ip6_output.c == --- head/sys/netinet6/ip6_output.c Tue May 12 13:23:25 2020 (r360960) +++ head/sys/netinet6/ip6_output.c Tue May 12 14:01:12 2020 (r360961) @@ -322,7 +322,8 @@ ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int h static int ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp, -struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro) +struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro, +bool stamp_tag) { #ifdef KERN_TLS struct ktls_session *tls = NULL; @@ -353,6 +354,10 @@ ip6_output_send(struct inpcb *inp, struct ifnet *ifp, error = EAGAIN; goto done; } + /* +* Always stamp tags that include NIC ktls. +*/ + stamp_tag = true; } #endif #ifdef RATELIMIT @@ -366,7 +371,7 @@ ip6_output_send(struct inpcb *inp, struct ifnet *ifp, mst = inp->inp_snd_tag; } #endif - if (mst != NULL) { + if (stamp_tag && mst != NULL) { KASSERT(m->m_pkthdr.rcvif == NULL, ("trying to add a send tag to a forwarded packet")); if (mst->ifp != ifp) { @@ -1165,7 +1170,8 @@ passout: m->m_pkthdr.len); ifa_free(>ia_ifa); } - error = ip6_output_send(inp, ifp, origifp, m, dst, ro); + error = ip6_output_send(inp, ifp, origifp, m, dst, ro, + (flags & IP_NO_SND_TAG_RL) ? 
false : true); goto done; } @@ -1256,7 +1262,8 @@ sendorfree: counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } - error = ip6_output_send(inp, ifp, origifp, m, dst, ro); + error = ip6_output_send(inp, ifp, origifp, m, dst, ro, + true); } else m_freem(m); } ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r360930 - head/sys/netinet6
Author: gallatin Date: Mon May 11 21:23:22 2020 New Revision: 360930 URL: https://svnweb.freebsd.org/changeset/base/360930 Log: Fix the build Back out the IPv6 portion of r360903, as the stamp_tag param is apparently not supported in upstream FreeBSD. Sponsored by: Netflix Pointy hat to: gallatin Modified: head/sys/netinet6/ip6_output.c Modified: head/sys/netinet6/ip6_output.c == --- head/sys/netinet6/ip6_output.c Mon May 11 21:22:16 2020 (r360929) +++ head/sys/netinet6/ip6_output.c Mon May 11 21:23:22 2020 (r360930) @@ -353,10 +353,6 @@ ip6_output_send(struct inpcb *inp, struct ifnet *ifp, error = EAGAIN; goto done; } - /* -* Always stamp tags that include NIC ktls. -*/ - stamp_tag = true; } #endif #ifdef RATELIMIT ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r360914 - in head/sys: netinet netinet6
Author: gallatin Date: Mon May 11 19:17:33 2020 New Revision: 360914 URL: https://svnweb.freebsd.org/changeset/base/360914 Log: Ktls: never skip stamping tags for NIC TLS The newer RACK and BBR TCP stacks have added a mechanism to disable hardware packet pacing for TCP retransmits. This mechanism works by skipping the send-tag stamp on rate-limited connections when the TCP stack calls ip_output() with the IP_NO_SND_TAG_RL flag set. When doing NIC TLS, we must ignore this flag, as NIC TLS packets must always be stamped. Failure to stamp a NIC TLS packet will result in crypto issues. Reviewed by: hselasky, rrs Sponsored by: Netflix, Mellanox Modified: head/sys/netinet/ip_output.c head/sys/netinet6/ip6_output.c Modified: head/sys/netinet/ip_output.c == --- head/sys/netinet/ip_output.cMon May 11 19:16:49 2020 (r360913) +++ head/sys/netinet/ip_output.cMon May 11 19:17:33 2020 (r360914) @@ -242,6 +242,10 @@ ip_output_send(struct inpcb *inp, struct ifnet *ifp, s error = EAGAIN; goto done; } + /* +* Always stamp tags that include NIC ktls. +*/ + stamp_tag = true; } #endif #ifdef RATELIMIT Modified: head/sys/netinet6/ip6_output.c == --- head/sys/netinet6/ip6_output.c Mon May 11 19:16:49 2020 (r360913) +++ head/sys/netinet6/ip6_output.c Mon May 11 19:17:33 2020 (r360914) @@ -353,6 +353,10 @@ ip6_output_send(struct inpcb *inp, struct ifnet *ifp, error = EAGAIN; goto done; } + /* +* Always stamp tags that include NIC ktls. +*/ + stamp_tag = true; } #endif #ifdef RATELIMIT ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r359920 - head/sys/sys
Author: gallatin Date: Tue Apr 14 14:48:00 2020 New Revision: 359920 URL: https://svnweb.freebsd.org/changeset/base/359920 Log: Bump FreeBSD version after r359919 (KTLS / unmapped mbuf changes) The above changes mbufs, and any module using unmapped mbufs would need to be re-compiled. Sponsored by: Netflix Modified: head/sys/sys/param.h Modified: head/sys/sys/param.h == --- head/sys/sys/param.hTue Apr 14 14:46:06 2020(r359919) +++ head/sys/sys/param.hTue Apr 14 14:48:00 2020(r359920) @@ -60,7 +60,7 @@ * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1300091 /* Master, propagated to newvers */ +#define __FreeBSD_version 1300092 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r359919 - in head/sys: dev/cxgbe dev/cxgbe/crypto dev/cxgbe/tom dev/mlx5/mlx5_en kern netinet netinet6 sys
Author: gallatin Date: Tue Apr 14 14:46:06 2020 New Revision: 359919 URL: https://svnweb.freebsd.org/changeset/base/359919 Log: KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself. While the original implementation of unmapped mbufs was a large step forward in terms of reducing cache misses by enabling mbufs to carry more than a single page for sendfile, they are rather cache unfriendly when accessing the ext_pgs metadata and data. This is because the ext_pgs part of the mbuf is allocated separately, and almost guaranteed to be cold in cache. This change takes advantage of the fact that unmapped mbufs are never used at the same time as pkthdr mbufs. Given this fact, we can overlap the ext_pgs metadata with the mbuf pkthdr, and carry the ext_pgs meta directly in the mbuf itself. Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array of pages) directly after the existing m_ext. In order to be able to carry 5 pages (which is the minimum required for a 16K TLS record which is not perfectly aligned) on LP64, I've had to steal ext_arg2. The only user of this in the xmit path is sendfile, and I've adjusted it to use arg1 when using unmapped mbufs. This change is almost entirely mechanical, except that we change mb_alloc_ext_pgs() to no longer allow allocating pkthdrs, the change to avoid ext_arg2 as mentioned above, and the removal of the ext_pgs zone. This change saves roughly 2% "raw" CPU (~59% -> 57%), or over 3% "scaled" CPU on a Netflix 100% software kTLS workload at 90+ Gb/s on Broadwell Xeons. In a follow-on commit, I plan to remove some hacks to avoid accessing ext_pgs fields of mbufs, since they will now be in cache. Many thanks to glebius for helping to make this better in the Netflix tree.
Reviewed by: hselasky, jhb, rrs, glebius (early version) Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D24213 Modified: head/sys/dev/cxgbe/crypto/t4_kern_tls.c head/sys/dev/cxgbe/t4_sge.c head/sys/dev/cxgbe/tom/t4_cpl_io.c head/sys/dev/cxgbe/tom/t4_tls.c head/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c head/sys/kern/kern_mbuf.c head/sys/kern/kern_sendfile.c head/sys/kern/subr_bus_dma.c head/sys/kern/subr_sglist.c head/sys/kern/uipc_ktls.c head/sys/kern/uipc_mbuf.c head/sys/kern/uipc_sockbuf.c head/sys/netinet/ip_output.c head/sys/netinet/tcp_output.c head/sys/netinet6/ip6_output.c head/sys/sys/mbuf.h Modified: head/sys/dev/cxgbe/crypto/t4_kern_tls.c == --- head/sys/dev/cxgbe/crypto/t4_kern_tls.c Tue Apr 14 13:32:03 2020 (r359918) +++ head/sys/dev/cxgbe/crypto/t4_kern_tls.c Tue Apr 14 14:46:06 2020 (r359919) @@ -905,8 +905,8 @@ ktls_tcp_payload_length(struct tlspcb *tlsp, struct mb u_int plen, mlen; MBUF_EXT_PGS_ASSERT(m_tls); - ext_pgs = m_tls->m_ext.ext_pgs; - hdr = (void *)ext_pgs->hdr; + ext_pgs = _tls->m_ext_pgs; + hdr = (void *)ext_pgs->m_epg_hdr; plen = ntohs(hdr->tls_length); /* @@ -961,8 +961,8 @@ ktls_payload_offset(struct tlspcb *tlsp, struct mbuf * #endif MBUF_EXT_PGS_ASSERT(m_tls); - ext_pgs = m_tls->m_ext.ext_pgs; - hdr = (void *)ext_pgs->hdr; + ext_pgs = _tls->m_ext_pgs; + hdr = (void *)ext_pgs->m_epg_hdr; plen = ntohs(hdr->tls_length); #ifdef INVARIANTS mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len; @@ -1008,7 +1008,7 @@ ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struc u_int imm_len, offset, plen, wr_len, tlen; MBUF_EXT_PGS_ASSERT(m_tls); - ext_pgs = m_tls->m_ext.ext_pgs; + ext_pgs = _tls->m_ext_pgs; /* * Determine the size of the TLS record payload to send @@ -1040,7 +1040,7 @@ ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struc return (wr_len); } - hdr = (void *)ext_pgs->hdr; + hdr = (void *)ext_pgs->m_epg_hdr; plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - ext_pgs->trail_len; if (tlen < plen) { plen = tlen; @@ 
-1474,7 +1474,7 @@ ktls_write_tunnel_packet(struct sge_txq *txq, void *ds /* Locate the template TLS header. */ MBUF_EXT_PGS_ASSERT(m_tls); - ext_pgs = m_tls->m_ext.ext_pgs; + ext_pgs = _tls->m_ext_pgs; /* This should always be the last TLS record in a chain. */ MPASS(m_tls->m_next == NULL); @@ -1543,8 +1543,8 @@ ktls_write_tunnel_packet(struct sge_txq *txq, void *ds (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp))); /* Copy the subset of the TLS header requested. */ - copy_to_txd(>eq, (char *)ext_pgs->hdr + mtod(m_tls, vm_offset_t), - , m_tls->m_len); + copy_to_txd(>eq, (char *)ext_pgs->m_epg_hdr + + mtod(m_tls, vm_offset_t), , m_tls->m_len); txq->imm_wrs++;
svn commit: r359908 - head/sys/net
Author: gallatin Date: Mon Apr 13 23:06:56 2020 New Revision: 359908 URL: https://svnweb.freebsd.org/changeset/base/359908 Log: lagg: stop double-counting output errors and counting drops as errors Before this change, lagg double-counted errors from lagg members, and counted every drop by a lagg member as an error. Eg, if lagg sent a packet, and the underlying hardware driver dropped it, a counter would be incremented by both lagg and the underlying driver. This change attempts to fix that by incrementing lagg's counters only for errors that do not come from underlying drivers. Reviewed by: hselasky, jhb Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D24331 Modified: head/sys/net/if_lagg.c Modified: head/sys/net/if_lagg.c == --- head/sys/net/if_lagg.c Mon Apr 13 22:21:01 2020(r359907) +++ head/sys/net/if_lagg.c Mon Apr 13 23:06:56 2020(r359908) @@ -1874,10 +1874,6 @@ lagg_transmit(struct ifnet *ifp, struct mbuf *m) error = lagg_proto_start(sc, m); LAGG_RUNLOCK(); - - if (error != 0) - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - return (error); } @@ -2100,6 +2096,7 @@ lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) * port if the link is down or the port is NULL. 
*/ if ((lp = lagg_link_active(sc, lp)) == NULL) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } @@ -2145,31 +2142,28 @@ lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m errors++; break; } - - ret = lagg_enqueue(last->lp_ifp, m0); - if (ret != 0) - errors++; + lagg_enqueue(last->lp_ifp, m0); } last = lp; } if (last == NULL) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENOENT); } if ((last = lagg_link_active(sc, last)) == NULL) { + errors++; + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors); m_freem(m); return (ENETDOWN); } ret = lagg_enqueue(last->lp_ifp, m); - if (ret != 0) - errors++; + if (errors != 0) + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors); - if (errors == 0) - return (ret); - - return (0); + return (ret); } static struct mbuf* @@ -2192,6 +2186,7 @@ lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) /* Use the master port if active or the next available port */ if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } @@ -2315,6 +2310,7 @@ lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } @@ -2386,6 +2382,7 @@ lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) lp = lacp_select_tx_port(sc, m); if (lp == NULL) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r359474 - head/sys/kern
Author: gallatin Date: Mon Mar 30 23:29:53 2020 New Revision: 359474 URL: https://svnweb.freebsd.org/changeset/base/359474 Log: KTLS: Coalesce adjacent TLS trailers & headers to improve PCIe bus efficiency KTLS uses the embedded header and trailer fields of unmapped mbufs. This can lead to "silly" buffer lengths, where we have an mbuf chain that will create scatter/gather lists with a regular pattern of 13 bytes followed by 16 bytes between each adjacent TLS record. For software ktls we typically wind up with a pattern where we have several TLS records encrypted, and made ready at once. When these records are made ready, we can coalesce these silly buffers in sbready_compress by copying the 13b TLS header of the next record into the 16b TLS trailer of the current record. After doing so, we now have a small 29 byte chunk between each TLS record. This marginally increases PCIe bus efficiency. We've seen an almost 1Gb/s increase in peak throughput on Broadwell based Xeons running a 100% software TLS workload with Mellanox ConnectX-4 NICs. Note that this change is ifdef'ed for KTLS, as KTLS is currently the only user of the hdr/trailer feature of unmapped mbufs, and peeking into them is expensive, since the ext_pgs struct lives in separately allocated memory, and may be cold in cache. This optimization is not applicable to HW ("NIC") TLS, as that depends on having the entire TLS record described by a single unmapped mbuf, so we cannot shift parts of the record between mbufs for HW TLS.
Reviewed by: jhb, hselasky, scottl Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D24204 Modified: head/sys/kern/uipc_sockbuf.c Modified: head/sys/kern/uipc_sockbuf.c == --- head/sys/kern/uipc_sockbuf.cMon Mar 30 22:13:32 2020 (r359473) +++ head/sys/kern/uipc_sockbuf.cMon Mar 30 23:29:53 2020 (r359474) @@ -112,7 +112,42 @@ sbready_compress(struct sockbuf *sb, struct mbuf *m0, for (m = m0; m != end; m = m->m_next) { MPASS((m->m_flags & M_NOTREADY) == 0); + /* +* NB: In sbcompress(), 'n' is the last mbuf in the +* socket buffer and 'm' is the new mbuf being copied +* into the trailing space of 'n'. Here, the roles +* are reversed and 'n' is the next mbuf after 'm' +* that is being copied into the trailing space of +* 'm'. +*/ + n = m->m_next; +#ifdef KERN_TLS + /* Try to coalesce adjacent ktls mbuf hdr/trailers. */ + if ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 && + (m->m_flags & M_NOMAP) && + (n->m_flags & M_NOMAP) && + !mbuf_has_tls_session(m) && + !mbuf_has_tls_session(n)) { + struct mbuf_ext_pgs *mpgs, *npgs; + int hdr_len, trail_len; + mpgs = m->m_ext.ext_pgs; + npgs = n->m_ext.ext_pgs; + hdr_len = npgs->hdr_len; + trail_len = mpgs->trail_len; + if (trail_len != 0 && hdr_len != 0 && + trail_len + hdr_len <= MBUF_PEXT_TRAIL_LEN) { + /* copy n's header to m's trailer */ + memcpy(>trail[trail_len], npgs->hdr, + hdr_len); + mpgs->trail_len += hdr_len; + m->m_len += hdr_len; + npgs->hdr_len = 0; + n->m_len -= hdr_len; + } + } +#endif + /* Compress small unmapped mbufs into plain mbufs. */ if ((m->m_flags & M_NOMAP) && m->m_len <= MLEN && !mbuf_has_tls_session(m)) { @@ -124,15 +159,6 @@ sbready_compress(struct sockbuf *sb, struct mbuf *m0, } } - /* -* NB: In sbcompress(), 'n' is the last mbuf in the -* socket buffer and 'm' is the new mbuf being copied -* into the trailing space of 'n'. Here, the roles -* are reversed and 'n' is the next mbuf after 'm' -* that is being copied into the trailing space of -* 'm'. 
-*/ - n = m->m_next; while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 && M_WRITABLE(m) && (m->m_flags & M_NOMAP) == 0 && ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to
svn commit: r359016 - head/sys/netinet
Author: gallatin Date: Mon Mar 16 14:03:27 2020 New Revision: 359016 URL: https://svnweb.freebsd.org/changeset/base/359016 Log: Avoid a cache miss accessing an mbuf ext_pgs pointer when doing SW kTLS. For a Netflix 90Gb/s 100% TLS software kTLS workload, this reduces the CPI of tcp_m_copym() from ~3.5 to ~2.5 as reported by vtune. Reviewed by: jtl, rrs Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D23998 Modified: head/sys/netinet/tcp_output.c Modified: head/sys/netinet/tcp_output.c == --- head/sys/netinet/tcp_output.c Mon Mar 16 13:53:29 2020 (r359015) +++ head/sys/netinet/tcp_output.c Mon Mar 16 14:03:27 2020 (r359016) @@ -1907,7 +1907,7 @@ tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *ple top = NULL; pkthdrlen = NULL; #ifdef KERN_TLS - if (m->m_flags & M_NOMAP) + if (hw_tls && (m->m_flags & M_NOMAP)) tls = m->m_ext.ext_pgs->tls; else tls = NULL; ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r358808 - in head/sys: kern net netinet
On 2020-03-09 09:44, Andrew Gallatin wrote: Author: gallatin Date: Mon Mar 9 13:44:51 2020 New Revision: 358808 URL: https://urldefense.com/v3/__https://svnweb.freebsd.org/changeset/base/358808__;!!OToaGQ!5mmRl2ROq7G4c4x2Xe2uHppYyETGlCRsREj-jHw0ZWcNqt3GhQju3BHBkM_vsrVvkQ$ Log: make lacp's use_numa hashing aware of send tags When I did the use_numa support, I missed the fact that there is a separate hash function for send tag nic selection. So when use_numa is enabled, ktls offload does not work properly, as it does not reliably allocate a send tag on the proper egress nic since different egress nics are selected for send-tag allocation and packet transmit. To fix this, this change: - refactors lacp_select_tx_port_by_hash() and lacp_select_tx_port() to make lacp_select_tx_port_by_hash() always called by lacp_select_tx_port() - pre-shifts flowids to convert them to hashes when calling lacp_select_tx_port_by_hash() - adds a numa_domain field to if_snd_tag_alloc_params - plumbs the numa domain into places where we allocate send tags In testing with NIC TLS setup on a NUMA machine, I see thousands of output errors before the change when enabling kern.ipc.tls.ifnet.permitted=1. After the change, I see no errors, and I see the NIC sysctl counters showing active TLS offload sessions. Reviewed by: rrs, hselasky, jhb Sponsored by: Netflix Forgot: Differential: https://reviews.freebsd.org/D23811 ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r358808 - in head/sys: kern net netinet
Author: gallatin Date: Mon Mar 9 13:44:51 2020 New Revision: 358808 URL: https://svnweb.freebsd.org/changeset/base/358808 Log: make lacp's use_numa hashing aware of send tags When I did the use_numa support, I missed the fact that there is a separate hash function for send tag nic selection. So when use_numa is enabled, ktls offload does not work properly, as it does not reliably allocate a send tag on the proper egress nic since different egress nics are selected for send-tag allocation and packet transmit. To fix this, this change: - refactors lacp_select_tx_port_by_hash() and lacp_select_tx_port() to make lacp_select_tx_port_by_hash() always called by lacp_select_tx_port() - pre-shifts flowids to convert them to hashes when calling lacp_select_tx_port_by_hash() - adds a numa_domain field to if_snd_tag_alloc_params - plumbs the numa domain into places where we allocate send tags In testing with NIC TLS setup on a NUMA machine, I see thousands of output errors before the change when enabling kern.ipc.tls.ifnet.permitted=1. After the change, I see no errors, and I see the NIC sysctl counters showing active TLS offload sessions.
Reviewed by: rrs, hselasky, jhb Sponsored by: Netflix Modified: head/sys/kern/uipc_ktls.c head/sys/net/ieee8023ad_lacp.c head/sys/net/ieee8023ad_lacp.h head/sys/net/if_lagg.c head/sys/net/if_var.h head/sys/netinet/in_pcb.c head/sys/netinet/tcp_ratelimit.c Modified: head/sys/kern/uipc_ktls.c == --- head/sys/kern/uipc_ktls.c Mon Mar 9 13:36:45 2020(r358807) +++ head/sys/kern/uipc_ktls.c Mon Mar 9 13:44:51 2020(r358808) @@ -800,6 +800,7 @@ ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_sess params.hdr.type = IF_SND_TAG_TYPE_TLS; params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; + params.hdr.numa_domain = inp->inp_numa_domain; params.tls.inp = inp; params.tls.tls = tls; INP_RUNLOCK(inp); Modified: head/sys/net/ieee8023ad_lacp.c == --- head/sys/net/ieee8023ad_lacp.c Mon Mar 9 13:36:45 2020 (r358807) +++ head/sys/net/ieee8023ad_lacp.c Mon Mar 9 13:44:51 2020 (r358808) @@ -832,13 +832,12 @@ lacp_stop(struct lagg_softc *sc) } struct lagg_port * -lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) +lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t hash, uint8_t numa_domain) { struct lacp_softc *lsc = LACP_SOFTC(sc); struct lacp_portmap *pm; struct lacp_port *lp; struct lacp_port **map; - uint32_t hash; int count; if (__predict_false(lsc->lsc_suppress_distributing)) { @@ -854,10 +853,10 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf #ifdef NUMA if ((sc->sc_opts & LAGG_OPT_USE_NUMA) && - pm->pm_num_dom > 1 && m->m_pkthdr.numa_domain < MAXMEMDOM) { - count = pm->pm_numa[m->m_pkthdr.numa_domain].count; + pm->pm_num_dom > 1 && numa_domain < MAXMEMDOM) { + count = pm->pm_numa[numa_domain].count; if (count > 0) { - map = pm->pm_numa[m->m_pkthdr.numa_domain].map; + map = pm->pm_numa[numa_domain].map; } else { /* No ports on this domain; use global hash. 
*/ map = pm->pm_map; @@ -869,11 +868,6 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf map = pm->pm_map; count = pm->pm_count; } - if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && - M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) - hash = m->m_pkthdr.flowid >> sc->flowid_shift; - else - hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey); hash %= count; lp = map[hash]; @@ -884,33 +878,22 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf return (lp->lp_lagg); } -#if defined(RATELIMIT) || defined(KERN_TLS) struct lagg_port * -lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid) +lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) { struct lacp_softc *lsc = LACP_SOFTC(sc); - struct lacp_portmap *pm; - struct lacp_port *lp; uint32_t hash; + uint8_t numa_domain; - if (__predict_false(lsc->lsc_suppress_distributing)) { - LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); - return (NULL); - } + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && + M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + hash = m->m_pkthdr.flowid >> sc->flowid_shift; + else + hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey); - pm =
svn commit: r356866 - head/sys/vm
Author: gallatin Date: Sat Jan 18 18:25:37 2020 New Revision: 356866 URL: https://svnweb.freebsd.org/changeset/base/356866 Log: pcpu_page_alloc: guard against empty NUMA domains Some systems, such as higher end Threadripper, may have NUMA domains with no physical memory. Don't allocate from these domains. This fixes a "panic: vm_wait in early boot" on my 2990WX desktop Reviewed by: jeff Sponsored by: Netflix Modified: head/sys/vm/uma_core.c Modified: head/sys/vm/uma_core.c == --- head/sys/vm/uma_core.c Sat Jan 18 10:55:38 2020(r356865) +++ head/sys/vm/uma_core.c Sat Jan 18 18:25:37 2020(r356866) @@ -1521,7 +1521,11 @@ pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int p = vm_page_alloc(NULL, 0, flags); #else pc = pcpu_find(cpu); - p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags); + if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain))) + p = NULL; + else + p = vm_page_alloc_domain(NULL, 0, + pc->pc_domain, flags); if (__predict_false(p == NULL)) p = vm_page_alloc(NULL, 0, flags); #endif ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r354470 - head/sys/dev/hwpmc
Author: gallatin Date: Thu Nov 7 19:54:24 2019 New Revision: 354470 URL: https://svnweb.freebsd.org/changeset/base/354470 Log: hwpmc : fix AMD perf counter MSR access - amd_intr() does not account for the offset (0x200) in the counter MSR address and ends up accessing invalid regions while reading counter value after the 4th counter (0xC001000[8,9,..]) and erroneously updates the counter values for counters [1-4]. - amd_intr() should only check core pmcs for interrupts since other types of pmcs (L3,DF) cannot generate interrupts. - fix pmc NMI's being ignored due to NMI latency on newer AMD processors Note that this fixes a kernel panic due to GPFs accessing MSRs on higher core count AMD cpus (seen on both Rome 7502P, and Threadripper 2990WX 32-core CPUs) Discussed with: markj Submitted by: Shreyank Amartya Differential Revision:https://reviews.freebsd.org/D21553 Modified: head/sys/dev/hwpmc/hwpmc_amd.c head/sys/dev/hwpmc/hwpmc_amd.h Modified: head/sys/dev/hwpmc/hwpmc_amd.c == --- head/sys/dev/hwpmc/hwpmc_amd.c Thu Nov 7 19:54:08 2019 (r354469) +++ head/sys/dev/hwpmc/hwpmc_amd.c Thu Nov 7 19:54:24 2019 (r354470) @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -53,6 +54,10 @@ __FBSDID("$FreeBSD$"); enum pmc_class amd_pmc_class; #endif +#defineOVERFLOW_WAIT_COUNT 50 + +DPCPU_DEFINE_STATIC(uint32_t, nmi_counter); + /* AMD K7 & K8 PMCs */ struct amd_descr { struct pmc_descr pm_descr; /* "base class" */ @@ -739,6 +744,7 @@ amd_stop_pmc(int cpu, int ri) struct pmc_hw *phw; const struct amd_descr *pd; uint64_t config; + int i; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); @@ -761,6 +767,21 @@ amd_stop_pmc(int cpu, int ri) /* turn off the PMC ENABLE bit */ config = pm->pm_md.pm_amd.pm_amd_evsel & ~AMD_PMC_ENABLE; wrmsr(pd->pm_evsel, config); + + /* +* Due to NMI latency on newer AMD processors +* NMI interrupts are ignored, which leads to +* panic or messages based on 
kernel configuraiton +*/ + + /* Wait for the count to be reset */ + for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { + if (rdmsr(pd->pm_perfctr) & (1 << (pd->pm_descr.pd_width - 1))) + break; + + DELAY(1); + } + return 0; } @@ -779,6 +800,7 @@ amd_intr(struct trapframe *tf) struct pmc *pm; struct amd_cpu *pac; pmc_value_t v; + uint32_t active = 0, count = 0; cpu = curcpu; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), @@ -798,19 +820,21 @@ amd_intr(struct trapframe *tf) * * If found, we call a helper to process the interrupt. * -* If multiple PMCs interrupt at the same time, the AMD64 -* processor appears to deliver as many NMIs as there are -* outstanding PMC interrupts. So we process only one NMI -* interrupt at a time. +* PMCs interrupting at the same time are collapsed into +* a single interrupt. Check all the valid pmcs for +* overflow. */ - for (i = 0; retval == 0 && i < AMD_NPMCS; i++) { + for (i = 0; i < AMD_CORE_NPMCS; i++) { if ((pm = pac->pc_amdpmcs[i].phw_pmc) == NULL || !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) { continue; } + /* Consider pmc with valid handle as active */ + active++; + if (!AMD_PMC_HAS_OVERFLOWED(i)) continue; @@ -820,8 +844,8 @@ amd_intr(struct trapframe *tf) continue; /* Stop the PMC, reload count. */ - evsel = AMD_PMC_EVSEL_0 + i; - perfctr = AMD_PMC_PERFCTR_0 + i; + evsel = amd_pmcdesc[i].pm_evsel; + perfctr = amd_pmcdesc[i].pm_perfctr; v = pm->pm_sc.pm_reloadcount; config = rdmsr(evsel); @@ -837,6 +861,26 @@ amd_intr(struct trapframe *tf) error = pmc_process_interrupt(PMC_HR, pm, tf); if (error == 0) wrmsr(evsel, config); + } + + /* +* Due to NMI latency, there can be a scenario in which +* multiple pmcs gets serviced in an earlier NMI and we +* do not find an overflow in the subsequent NMI. +* +* For such cases we keep a per-cpu count of active NMIs +* and compare it with min(active pmcs, 2) to determine +* if this NMI was for a pmc overflow which was serviced +* in an earlier request or should be ignored. +*/ + + if (retval) { +
svn commit: r354338 - head/sys/x86/x86
Author: gallatin Date: Mon Nov 4 19:30:19 2019 New Revision: 354338 URL: https://svnweb.freebsd.org/changeset/base/354338 Log: Add tunable to allow interrupts on hyperthreaded cores Enabling interrupts on htt cores has benefits to workloads which are primarily interrupt driven by increasing the logical cores available for interrupt handling. The tunable is named machdep.hyperthreading_intr_allowed Reviewed by: kib, jhb Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D22233 Modified: head/sys/x86/x86/mp_x86.c Modified: head/sys/x86/x86/mp_x86.c == --- head/sys/x86/x86/mp_x86.c Mon Nov 4 18:34:29 2019(r354337) +++ head/sys/x86/x86/mp_x86.c Mon Nov 4 19:30:19 2019(r354338) @@ -144,6 +144,11 @@ static int hyperthreading_allowed = 1; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); +static int hyperthreading_intr_allowed = 0; +SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN, + &hyperthreading_intr_allowed, 0, + "Allow interrupts on HTT logical CPUs"); + static struct topo_node topo_root; static int pkg_id_shift; @@ -1121,7 +1126,8 @@ set_interrupt_apic_ids(void) continue; /* Don't let hyperthreads service interrupts. */ - if (cpu_info[apic_id].cpu_hyperthread) + if (cpu_info[apic_id].cpu_hyperthread && + !hyperthreading_intr_allowed) continue; intr_add_cpu(i); ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r354029 - head/sys/vm
Author: gallatin Date: Thu Oct 24 18:39:05 2019 New Revision: 354029 URL: https://svnweb.freebsd.org/changeset/base/354029 Log: Add a tunable to set the pgcache zone's maxcache When it is set to 0 (the default), a heavy Netflix-style web workload suffers from heavy lock contention on the vm page free queue called from vm_page_zone_{import,release}() as the buckets are frequently drained. When setting the maxcache, this contention goes away. We should eventually try to autotune this, as well as make this zone eligible for uma_reclaim(). Reviewed by: alc, markj Not Objected to by: jeff Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D22112 Modified: head/sys/vm/vm_page.c Modified: head/sys/vm/vm_page.c == --- head/sys/vm/vm_page.c Thu Oct 24 18:13:26 2019(r354028) +++ head/sys/vm/vm_page.c Thu Oct 24 18:39:05 2019(r354029) @@ -216,8 +216,10 @@ vm_page_init_cache_zones(void *dummy __unused) { struct vm_domain *vmd; struct vm_pgcache *pgcache; - int domain, pool; + int domain, maxcache, pool; + maxcache = 0; + TUNABLE_INT_FETCH("vm.pgcache_zone_max", &maxcache); for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); @@ -237,7 +239,7 @@ vm_page_init_cache_zones(void *dummy __unused) sizeof(struct vm_page), NULL, NULL, NULL, NULL, vm_page_zone_import, vm_page_zone_release, pgcache, UMA_ZONE_MAXBUCKET | UMA_ZONE_VM); - (void)uma_zone_set_maxcache(pgcache->zone, 0); + (void)uma_zone_set_maxcache(pgcache->zone, maxcache); } } } ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r352816 - in head/sys: kern sys
Author: gallatin Date: Fri Sep 27 20:08:19 2019 New Revision: 352816 URL: https://svnweb.freebsd.org/changeset/base/352816 Log: kTLS: Fix a bug where we would not encrypt anon data inplace. Software Kernel TLS needs to allocate a new destination crypto buffer when encrypting data from the page cache, so as to avoid overwriting shared clear-text file data with encrypted data specific to a single socket. When the data is anonymous, eg, not tied to a file, then we can encrypt in place and avoid allocating a new page. This fixes a bug where the existing code always assumes the data is private, and never encrypts in place. This results in unneeded page allocations and potentially more memory bandwidth consumption when doing socket writes. When the code was written at Netflix, ktls_encrypt() looked at private sendfile flags to determine if the pages being encrypted were part of the page cache (coming from sendfile) or anonymous (coming from sosend). This was broken internally at Netflix when the sendfile flags were made private, and the M_WRITABLE() check was added. Unfortunately, M_WRITABLE() will always be false for M_NOMAP mbufs, since one cannot just mtod() them. This change introduces a new flags field to the mbuf_ext_pgs struct by stealing a byte from the tls hdr. Note that the current header is still 2 bytes larger than the largest header we support: AES-CBC with explicit IV. We set MBUF_PEXT_FLAG_ANON when creating an unmapped mbuf in m_uiotombuf_nomap() (which is the path that socket writes take), and we check for that flag in ktls_encrypt() when looking for anon pages.
Reviewed by: jhb Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D21796 Modified: head/sys/kern/kern_mbuf.c head/sys/kern/uipc_ktls.c head/sys/kern/uipc_mbuf.c head/sys/sys/mbuf.h Modified: head/sys/kern/kern_mbuf.c == --- head/sys/kern/kern_mbuf.c Fri Sep 27 19:26:52 2019(r352815) +++ head/sys/kern/kern_mbuf.c Fri Sep 27 20:08:19 2019(r352816) @@ -1171,6 +1171,7 @@ mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ex ext_pgs->nrdy = 0; ext_pgs->first_pg_off = 0; ext_pgs->last_pg_len = 0; + ext_pgs->flags = 0; ext_pgs->hdr_len = 0; ext_pgs->trail_len = 0; ext_pgs->tls = NULL; Modified: head/sys/kern/uipc_ktls.c == --- head/sys/kern/uipc_ktls.c Fri Sep 27 19:26:52 2019(r352815) +++ head/sys/kern/uipc_ktls.c Fri Sep 27 20:08:19 2019(r352816) @@ -1363,7 +1363,7 @@ ktls_encrypt(struct mbuf_ext_pgs *pgs) * (from sendfile), anonymous wired pages are * allocated and assigned to the destination iovec. */ - is_anon = M_WRITABLE(m); + is_anon = (pgs->flags & MBUF_PEXT_FLAG_ANON) != 0; off = pgs->first_pg_off; for (i = 0; i < pgs->npgs; i++, off = 0) { @@ -1416,6 +1416,9 @@ retry_page: /* Use the basic free routine. */ m->m_ext.ext_free = mb_free_mext_pgs; + + /* Pages are now writable. 
*/ + pgs->flags |= MBUF_PEXT_FLAG_ANON; } /* Modified: head/sys/kern/uipc_mbuf.c == --- head/sys/kern/uipc_mbuf.c Fri Sep 27 19:26:52 2019(r352815) +++ head/sys/kern/uipc_mbuf.c Fri Sep 27 20:08:19 2019(r352816) @@ -1664,6 +1664,7 @@ m_uiotombuf_nomap(struct uio *uio, int how, int len, i prev->m_next = mb; prev = mb; pgs = mb->m_ext.ext_pgs; + pgs->flags = MBUF_PEXT_FLAG_ANON; needed = length = MIN(maxseg, total); for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) { retry_page: Modified: head/sys/sys/mbuf.h == --- head/sys/sys/mbuf.h Fri Sep 27 19:26:52 2019(r352815) +++ head/sys/sys/mbuf.h Fri Sep 27 20:08:19 2019(r352816) @@ -312,7 +312,7 @@ struct socket; * - 21 (AES-CBC with explicit IV) * - 13 (AES-GCM with 8 byte explicit IV) */ -#defineMBUF_PEXT_HDR_LEN 24 +#defineMBUF_PEXT_HDR_LEN 23 /* * TLS records for TLS 1.0-1.2 can have the following maximum trailer @@ -333,6 +333,8 @@ struct socket; #defineMBUF_PEXT_MAX_BYTES \ (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN) +#define MBUF_PEXT_FLAG_ANON1 /* Data can be encrypted in place. */ + /* * This struct is 256 bytes in size and is arranged so that the most * common case (accessing the first 4
svn commit: r352814 - in head/sys: kern net opencrypto sys
Author: gallatin Date: Fri Sep 27 19:17:40 2019 New Revision: 352814 URL: https://svnweb.freebsd.org/changeset/base/352814 Log: kTLS support for TLS 1.3 TLS 1.3 requires a few changes because 1.3 pretends to be 1.2 with a record type of application data. The "real" record type is then included at the end of the user-supplied plaintext data. This required adding a field to the mbuf_ext_pgs struct to save the record type, and passing the real record type to the sw_encrypt() ktls backend functions. Reviewed by: jhb, hselasky Sponsored by: Netflix Differential Revision:D21801 Modified: head/sys/kern/uipc_ktls.c head/sys/net/iflib.c head/sys/opencrypto/ktls_ocf.c head/sys/sys/ktls.h head/sys/sys/mbuf.h Modified: head/sys/kern/uipc_ktls.c == --- head/sys/kern/uipc_ktls.c Fri Sep 27 19:14:03 2019(r352813) +++ head/sys/kern/uipc_ktls.c Fri Sep 27 19:17:40 2019(r352814) @@ -389,14 +389,14 @@ ktls_create_session(struct socket *so, struct tls_enab if (en->tls_vmajor != TLS_MAJOR_VER_ONE) return (EINVAL); if (en->tls_vminor < TLS_MINOR_VER_ZERO || - en->tls_vminor > TLS_MINOR_VER_TWO) + en->tls_vminor > TLS_MINOR_VER_THREE) return (EINVAL); if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); - if (en->iv_len < 0 || en->iv_len > TLS_MAX_PARAM_SIZE) + if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv)) return (EINVAL); /* All supported algorithms require a cipher key. 
*/ @@ -425,7 +425,10 @@ ktls_create_session(struct socket *so, struct tls_enab } if (en->auth_key_len != 0) return (EINVAL); - if (en->iv_len != TLS_AEAD_GCM_LEN) + if ((en->tls_vminor == TLS_MINOR_VER_TWO && + en->iv_len != TLS_AEAD_GCM_LEN) || + (en->tls_vminor == TLS_MINOR_VER_THREE && + en->iv_len != TLS_1_3_GCM_IV_LEN)) return (EINVAL); break; case CRYPTO_AES_CBC: @@ -477,8 +480,22 @@ ktls_create_session(struct socket *so, struct tls_enab tls->params.tls_hlen = sizeof(struct tls_record_layer); switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: - tls->params.tls_hlen += 8; + /* +* TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte +* nonce. TLS 1.3 uses a 12 byte implicit IV. +*/ + if (en->tls_vminor < TLS_MINOR_VER_THREE) + tls->params.tls_hlen += sizeof(uint64_t); tls->params.tls_tlen = AES_GMAC_HASH_LEN; + + /* +* TLS 1.3 includes optional padding which we +* do not support, and also puts the "real" record +* type at the end of the encrypted data. +*/ + if (en->tls_vminor == TLS_MINOR_VER_THREE) + tls->params.tls_tlen += sizeof(uint8_t); + tls->params.tls_bs = 1; break; case CRYPTO_AES_CBC: @@ -539,7 +556,6 @@ ktls_create_session(struct socket *so, struct tls_enab * of the IV are generated in ktls_frame() and ktls_seq(). */ if (en->iv_len != 0) { - MPASS(en->iv_len <= sizeof(tls->params.iv)); tls->params.iv_len = en->iv_len; error = copyin(en->iv, tls->params.iv, en->iv_len); if (error) @@ -1188,8 +1204,21 @@ ktls_frame(struct mbuf *top, struct ktls_session *tls, /* Populate the TLS header. */ tlshdr = (void *)pgs->hdr; tlshdr->tls_vmajor = tls->params.tls_vmajor; - tlshdr->tls_vminor = tls->params.tls_vminor; - tlshdr->tls_type = record_type; + + /* +* TLS 1.3 masquarades as TLS 1.2 with a record type +* of TLS_RLTYPE_APP. 
+*/ + if (tls->params.tls_vminor == TLS_MINOR_VER_THREE && + tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) { + tlshdr->tls_vminor = TLS_MINOR_VER_TWO; + tlshdr->tls_type = TLS_RLTYPE_APP; + /* save the real record type for later */ + pgs->record_type = record_type; + } else { + tlshdr->tls_vminor = tls->params.tls_vminor; + tlshdr->tls_type = record_type; + } tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr)); /* @@ -1365,7 +1394,8 @@ retry_page:
svn commit: r352552 - head/sys/kern
Author: gallatin Date: Fri Sep 20 09:36:07 2019 New Revision: 352552 URL: https://svnweb.freebsd.org/changeset/base/352552 Log: remove redundant "ktls" in KTLS thr name This reduces the string width of the ktls thread name and improves "ps" output. Glanced at by: jhb Event: EuroBSDCon hackathon Sponsored by: Netflix Modified: head/sys/kern/uipc_ktls.c Modified: head/sys/kern/uipc_ktls.c == --- head/sys/kern/uipc_ktls.c Fri Sep 20 09:04:52 2019(r352551) +++ head/sys/kern/uipc_ktls.c Fri Sep 20 09:36:07 2019(r352552) @@ -349,7 +349,7 @@ ktls_init(void *dummy __unused) STAILQ_INIT(_wq[i].head); mtx_init(_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); error = kproc_kthread_add(ktls_work_thread, _wq[i], - _proc, , 0, 0, "KTLS", "ktls_thr_%d", i); + _proc, , 0, 0, "KTLS", "thr_%d", i); if (error) panic("Can't add KTLS thread %d error %d", i, error); ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r352228 - head/sys/netinet
On 2019-09-11 17:16, Conrad Meyer wrote: Small nitpick: On Wed, Sep 11, 2019 at 11:48 AM Andrew Gallatin wrote: Note that on a system under a syn flood attack, arc4random() becomes quite expensive, and the chacha_poly crypto that it calls arc4random uses chacha20 — there is no "poly" involved. Best, Conrad Sorry for the mis-statement. poly is associated with chacha in my mind. In any case, calling arc4random() and the chacha it uses millions of times per second is expensive, and avoiding it provides some headroom. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r352228 - head/sys/netinet
Author: gallatin Date: Wed Sep 11 18:48:26 2019 New Revision: 352228 URL: https://svnweb.freebsd.org/changeset/base/352228 Log: Avoid unneeded call to arc4random() in syncache_add() Don't call arc4random() unconditionally to initialize sc_iss, and then when syncookies are enabled, just overwrite it with the return value from syncookie_generate(). Instead, only call arc4random() to initialize sc_iss when syncookies are not enabled. Note that on a system under a syn flood attack, arc4random() becomes quite expensive, and the chacha_poly crypto that it calls is one of the more expensive things happening on the system. Removing this unneeded arc4random() call reduces CPU from about 40% to about 35% in my test scenario (Broadwell Xeon, 6Mpps syn flood attack). Reviewed by: rrs, tuxen, bz Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D21591 Modified: head/sys/netinet/tcp_syncache.c Modified: head/sys/netinet/tcp_syncache.c == --- head/sys/netinet/tcp_syncache.c Wed Sep 11 18:40:05 2019 (r352227) +++ head/sys/netinet/tcp_syncache.c Wed Sep 11 18:48:26 2019 (r352228) @@ -1543,7 +1543,6 @@ skip_alloc: sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; - sc->sc_iss = arc4random(); sc->sc_flags = 0; sc->sc_flowlabel = 0; @@ -1617,6 +1616,8 @@ skip_alloc: if (V_tcp_syncookies) sc->sc_iss = syncookie_generate(sch, sc); + else + sc->sc_iss = arc4random(); #ifdef INET6 if (autoflowlabel) { if (V_tcp_syncookies) ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r346632 - head/sys/net
Author: gallatin Date: Wed Apr 24 13:32:04 2019 New Revision: 346632 URL: https://svnweb.freebsd.org/changeset/base/346632 Log: iflib: Add pfil hooks As with mlx5en, the idea is to drop unwanted traffic as early in receive as possible, before mbufs are allocated and anything is passed up the stack. This can save considerable CPU time when a machine is under a flooding style DOS attack. The major change here is to remove the unneeded abstraction where callers of rxd_frag_to_sd() get back a pointer to the mbuf ring, and are responsible for NULL'ing that mbuf themselves. Now this happens directly in rxd_frag_to_sd(), and it returns an mbuf. This allows us to use the decision (and potentially mbuf) returned by the pfil hooks. The driver can now recycle mbufs to avoid re-allocation when packets are dropped. Reviewed by: marius (shurd and erj also provided feedback) Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D19645 Modified: head/sys/net/iflib.c Modified: head/sys/net/iflib.c == --- head/sys/net/iflib.cWed Apr 24 13:15:56 2019(r346631) +++ head/sys/net/iflib.cWed Apr 24 13:32:04 2019(r346632) @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -432,6 +433,7 @@ struct iflib_rxq { if_ctx_tifr_ctx; iflib_fl_t ifr_fl; uint64_tifr_rx_irq; + struct pfil_head*pfil; uint16_tifr_id; uint8_t ifr_lro_enabled; uint8_t ifr_nfl; @@ -451,7 +453,6 @@ struct iflib_rxq { typedef struct if_rxsd { caddr_t *ifsd_cl; - struct mbuf **ifsd_m; iflib_fl_t ifsd_fl; qidx_t ifsd_cidx; } *if_rxsd_t; @@ -652,7 +653,6 @@ static int iflib_fast_intrs; static int iflib_rx_unavail; static int iflib_rx_ctx_inactive; static int iflib_rx_if_input; -static int iflib_rx_mbuf_null; static int iflib_rxd_flush; static int iflib_verbose_debug; @@ -669,8 +669,6 @@ SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLF _rx_ctx_inactive, 0, "# times rxeof called with inactive context"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, 
_rx_if_input, 0, "# times rxeof called if_input"); -SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD, - _rx_mbuf_null, 0, "# times rxeof got null mbuf"); SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD, _rxd_flush, 0, "# times rxd_flush called"); SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW, @@ -689,7 +687,7 @@ iflib_debug_reset(void) iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs = iflib_rx_unavail = iflib_rx_ctx_inactive = iflib_rx_if_input = - iflib_rx_mbuf_null = iflib_rxd_flush = 0; + iflib_rxd_flush = 0; } #else @@ -2002,11 +2000,12 @@ _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int coun bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx], BUS_DMASYNC_PREREAD); - MPASS(sd_m[frag_idx] == NULL); - if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { - break; + if (sd_m[frag_idx] == NULL) { + if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { + break; + } + sd_m[frag_idx] = m; } - sd_m[frag_idx] = m; bit_set(fl->ifl_rx_bitmap, frag_idx); #if MEMORY_LOGGING fl->ifl_m_enqueued++; @@ -2483,13 +2482,15 @@ prefetch_pkts(iflib_fl_t fl, int cidx) prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]); } -static void -rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd) +static struct mbuf * +rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd, +int *pf_rv, if_rxd_info_t ri) { - int flid, cidx; bus_dmamap_t map; iflib_fl_t fl; - int next; + caddr_t payload; + struct mbuf *m; + int flid, cidx, len, next; map = NULL; flid = irf->irf_flid; @@ -2497,7 +2498,7 @@ rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int fl = >ifr_fl[flid]; sd->ifsd_fl = fl; sd->ifsd_cidx = cidx; - sd->ifsd_m = >ifl_sds.ifsd_m[cidx]; + m = fl->ifl_sds.ifsd_m[cidx]; sd->ifsd_cl = >ifl_sds.ifsd_cl[cidx]; fl->ifl_credits--; #if MEMORY_LOGGING @@ -2513,39 +2514,89 @@ rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int /* not valid assert if bxe really does SGE from non-contiguous elements */ 
MPASS(fl->ifl_cidx == cidx); bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD); + + if (rxq->pfil !=
svn commit: r346579 - in head: share/man/man9 sys/dev/cxgbe sys/dev/mlx5/mlx5_en sys/net
Author: gallatin Date: Mon Apr 22 19:24:21 2019 New Revision: 346579 URL: https://svnweb.freebsd.org/changeset/base/346579 Log: Track device's NUMA domain in ifnet & alloc ifnet from NUMA local memory This commit adds new if_alloc_domain() and if_alloc_dev() methods to allocate ifnets. When called with a domain on a NUMA machine, ifalloc_domain() will record the NUMA domain in the ifnet, and it will allocate the ifnet struct from memory which is local to that NUMA node. Similarly, if_alloc_dev() is a wrapper for if_alloc_domain which uses a driver supplied device_t to call ifalloc_domain() with the appropriate domain. Note that the new if_numa_domain field fits in an alignment pad in struct ifnet, and so does not alter the size of the structure. Reviewed by: glebius, kib, markj Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D19930 Modified: head/share/man/man9/Makefile head/share/man/man9/ifnet.9 head/sys/dev/cxgbe/t4_main.c head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c head/sys/net/if.c head/sys/net/if_var.h Modified: head/share/man/man9/Makefile == --- head/share/man/man9/MakefileMon Apr 22 19:21:35 2019 (r346578) +++ head/share/man/man9/MakefileMon Apr 22 19:24:21 2019 (r346579) @@ -1175,6 +1175,8 @@ MLINKS+=iflibtxrx.9 isc_rxd_available.9 \ iflibtxrx.9 isc_txd_flush.9 MLINKS+=ifnet.9 if_addmulti.9 \ ifnet.9 if_alloc.9 \ + ifnet.9 if_alloc_dev.9 \ + ifnet.9 if_alloc_domain.9 \ ifnet.9 if_allmulti.9 \ ifnet.9 if_attach.9 \ ifnet.9 if_data.9 \ Modified: head/share/man/man9/ifnet.9 == --- head/share/man/man9/ifnet.9 Mon Apr 22 19:21:35 2019(r346578) +++ head/share/man/man9/ifnet.9 Mon Apr 22 19:24:21 2019(r346579) @@ -48,6 +48,10 @@ .Ss "Interface Manipulation Functions" .Ft "struct ifnet *" .Fn if_alloc "u_char type" +.Ft "struct ifnet *" +.Fn if_alloc_dev "u_char type" "device_t dev" +.Ft "struct ifnet *" +.Fn if_alloc_domain "u_char type" "int numa_domain" .Ft void .Fn if_attach "struct ifnet *ifp" .Ft void @@ -440,6 +444,15 @@ It is used to 
cache the type passed to but unlike .Va if_type , it would not be changed by drivers. +.It Va if_numa_domain +.Pq Vt uint8_t +The NUMA domain of the hardware device associated with the interface. +This is filled in with a wildcard value unless the kernel is NUMA +aware, the system is a NUMA system, and the ifnet is allocated +using +.Fn if_alloc_dev +or +.Fn if_alloc_domain . .El .Pp References to @@ -1151,6 +1164,24 @@ include the allocation of a .Fa type specific structure in .Va if_l2com . +.It Fn if_alloc_dev +Allocate and initialize +.Vt "struct ifnet" +as +.Fn if_alloc +does, with the addition that the ifnet can be tagged with the +appropriate NUMA domain derived from the +.Fa dev +argument passed by the caller. +.It Fn if_alloc_domain +Allocate and initialize +.Vt "struct ifnet" +as +.Fn if_alloc +does, with the addition that the ifnet will be tagged with the NUMA +domain via the +.Fa numa_domain +argument passed by the caller. .It Fn if_attach Link the specified interface .Fa ifp @@ -1168,7 +1199,10 @@ function.) The .Fa ifp must have been allocated by -.Fn if_alloc . +.Fn if_alloc , +.Fn if_alloc_dev +or +.Fn if_alloc_domain . 
.It Fn if_detach Shut down and unlink the specified .Fa ifp Modified: head/sys/dev/cxgbe/t4_main.c == --- head/sys/dev/cxgbe/t4_main.cMon Apr 22 19:21:35 2019 (r346578) +++ head/sys/dev/cxgbe/t4_main.cMon Apr 22 19:24:21 2019 (r346579) @@ -1636,7 +1636,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi) callout_init(>tick, 1); /* Allocate an ifnet and set it up */ - ifp = if_alloc(IFT_ETHER); + ifp = if_alloc_dev(IFT_ETHER, dev); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c == --- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 22 19:21:35 2019 (r346578) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 22 19:24:21 2019 (r346579) @@ -3682,7 +3682,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev) M_MLX5EN, M_WAITOK | M_ZERO); mlx5e_priv_mtx_init(priv); - ifp = priv->ifp = if_alloc(IFT_ETHER); + ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev); if (ifp == NULL) { mlx5_core_err(mdev, "if_alloc() failed\n"); goto err_free_priv; Modified: head/sys/net/if.c
svn commit: r346281 - head/sys/sys
Author: gallatin Date: Tue Apr 16 16:49:34 2019 New Revision: 346281 URL: https://svnweb.freebsd.org/changeset/base/346281 Log: Replace cosqos with numa_domain in mbuf pkthdr The cosqos field was added nearly 6 years ago in r254804, and it is still unused by any in-tree consumers. I have a patchset that I'm working on which aligns many network resources by NUMA domain, including inps, inpcb lb group, tcp pacing, lagg output link selection, backing pages for sendfile, and more. It reduces cross-domain traffic by roughly 50% for a real web workload. This patchset relies on being able to store the numa domain in the mbuf, and grabbing the unused cosqos field for this purpose is the first step in starting to upstream it. Reviewed by: kib, markj Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D19862 Modified: head/sys/sys/mbuf.h Modified: head/sys/sys/mbuf.h == --- head/sys/sys/mbuf.h Tue Apr 16 15:52:04 2019(r346280) +++ head/sys/sys/mbuf.h Tue Apr 16 16:49:34 2019(r346281) @@ -98,6 +98,7 @@ struct mbuf; #defineMLEN((int)(MSIZE - MHSIZE)) #defineMHLEN ((int)(MSIZE - MPKTHSIZE)) #defineMINCLSIZE (MHLEN + 1) +#defineM_NODOM 255 #ifdef _KERNEL /*- @@ -158,7 +159,7 @@ struct pkthdr { uint32_t flowid;/* packet's 4-tuple system */ uint32_t csum_flags;/* checksum and offload features */ uint16_t fibnum;/* this packet should use this fib */ - uint8_t cosqos;/* class/quality of service */ + uint8_t numa_domain; /* NUMA domain of recvd pkt */ uint8_t rsstype; /* hash type */ union { uint64_trcv_tstmp; /* timestamp in ns */ @@ -405,33 +406,6 @@ struct mbuf { #defineM_HASHTYPE_SET(m, v)((m)->m_pkthdr.rsstype = (v)) #defineM_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) #defineM_HASHTYPE_ISHASH(m)(M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP) - -/* - * COS/QOS class and quality of service tags. - * It uses DSCP code points as base. 
- */ -#defineQOS_DSCP_CS00x00 -#defineQOS_DSCP_DEFQOS_DSCP_CS0 -#defineQOS_DSCP_CS10x20 -#defineQOS_DSCP_AF11 0x28 -#defineQOS_DSCP_AF12 0x30 -#defineQOS_DSCP_AF13 0x38 -#defineQOS_DSCP_CS20x40 -#defineQOS_DSCP_AF21 0x48 -#defineQOS_DSCP_AF22 0x50 -#defineQOS_DSCP_AF23 0x58 -#defineQOS_DSCP_CS30x60 -#defineQOS_DSCP_AF31 0x68 -#defineQOS_DSCP_AF32 0x70 -#defineQOS_DSCP_AF33 0x78 -#defineQOS_DSCP_CS40x80 -#defineQOS_DSCP_AF41 0x88 -#defineQOS_DSCP_AF42 0x90 -#defineQOS_DSCP_AF43 0x98 -#defineQOS_DSCP_CS50xa0 -#defineQOS_DSCP_EF 0xb8 -#defineQOS_DSCP_CS60xc0 -#defineQOS_DSCP_CS70xe0 /* * External mbuf storage buffer types. ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r346247 - head/sys/dev/mlx5/mlx5_en
Author: gallatin Date: Mon Apr 15 17:14:50 2019 New Revision: 346247 URL: https://svnweb.freebsd.org/changeset/base/346247 Log: mlx5en: Enable new pfil(9) KPI ethernet filtering hooks This allows efficient filtering at packet ingress on mlx5en. Note that the packets are filtered (and potentially dropped) *before* the driver has committed to (re)allocating an mbuf for the packet. Dropped packets are treated essentially the same as an error. Nothing is allocated, and the existing buffer is recycled. This allows us to drop malicious packets at close to line rate with very little CPU use. Reviewed by: hselasky, slavash, kib Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D19063 Modified: head/sys/dev/mlx5/mlx5_en/en.h head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c Modified: head/sys/dev/mlx5/mlx5_en/en.h == --- head/sys/dev/mlx5/mlx5_en/en.h Mon Apr 15 16:57:27 2019 (r346246) +++ head/sys/dev/mlx5/mlx5_en/en.h Mon Apr 15 17:14:50 2019 (r346247) @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -838,6 +839,7 @@ struct mlx5e_priv { struct mlx5e_clbr_point clbr_points[2]; u_int clbr_gen; + struct pfil_head *pfil; struct mlx5e_channel channel[]; }; Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c == --- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 15 16:57:27 2019 (r346246) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 15 17:14:50 2019 (r346247) @@ -3664,6 +3664,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev) struct sysctl_oid_list *child; int ncv = mdev->priv.eq_table.num_comp_vectors; char unit[16]; + struct pfil_head_args pa; int err; int i; u32 eth_proto_cap; @@ -3898,6 +3899,12 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev) callout_init(>tstmp_clbr, CALLOUT_DIRECT); mlx5e_reset_calibration_callout(priv); + pa.pa_version = PFIL_VERSION; + pa.pa_flags = PFIL_IN; + pa.pa_type = PFIL_TYPE_ETHERNET; + pa.pa_headname = ifp->if_xname; + priv->pfil = pfil_head_register(); + return 
(priv); #ifdef RATELIMIT @@ -3972,6 +3979,12 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vp if_printf(priv->ifp, "Waiting for all unlimited connections " "to terminate\n"); pause("W", hz); + } + + /* deregister pfil */ + if (priv->pfil != NULL) { + pfil_head_unregister(priv->pfil); + priv->pfil = NULL; } /* unregister device */ Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c == --- head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c Mon Apr 15 16:57:27 2019 (r346246) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c Mon Apr 15 17:14:50 2019 (r346247) @@ -430,15 +430,18 @@ mlx5e_decompress_cqes(struct mlx5e_cq *cq) static int mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget) { - int i; + struct pfil_head *pfil; + int i, rv; + CURVNET_SET_QUIET(rq->ifp->if_vnet); + pfil = rq->channel->priv->pfil; for (i = 0; i < budget; i++) { struct mlx5e_rx_wqe *wqe; struct mlx5_cqe64 *cqe; struct mbuf *mb; __be16 wqe_counter_be; u16 wqe_counter; - u32 byte_cnt; + u32 byte_cnt, seglen; cqe = mlx5e_get_cqe(>cq); if (!cqe) @@ -462,6 +465,39 @@ mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget) rq->stats.wqe_err++; goto wq_ll_pop; } + if (pfil != NULL && PFIL_HOOKED_IN(pfil)) { + seglen = MIN(byte_cnt, MLX5E_MAX_RX_BYTES); + rv = pfil_run_hooks(rq->channel->priv->pfil, + rq->mbuf[wqe_counter].data, rq->ifp, + seglen | PFIL_MEMPTR | PFIL_IN, NULL); + + switch (rv) { + case PFIL_DROPPED: + case PFIL_CONSUMED: + /* +* Filter dropped or consumed it. In +* either case, we can just recycle +* buffer; there is no more work to do. +*/ + rq->stats.packets++; + goto wq_ll_pop; + case PFIL_REALLOCED: + /* +* Filter copied it; recycle buffer +
Re: svn commit: r351200 - in head/sys: amd64/amd64 dev/acpica
On 2019-08-18 19:44, Jeff Roberson wrote: Author: jeff Date: Sun Aug 18 23:44:23 2019 New Revision: 351200 <..> Log: Allocate all per-cpu datastructures in domain correct memory. Reviewed by: kib, gallatin (some objections) No objection to what you actually committed. The only objection was the issue I found on non-NUMA, which you fixed in the committed code. Thanks! Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r350245 - head/usr.sbin/pciconf
Author: gallatin Date: Tue Jul 23 16:28:17 2019 New Revision: 350245 URL: https://svnweb.freebsd.org/changeset/base/350245 Log: pciconf: report PCI Gen4 speeds PCIe gen4 runs at 16GT/s. Report this as the speed of Gen4 links. Reviewed by: imp MFC after:7 days Sponsored by: Netflix Modified: head/usr.sbin/pciconf/cap.c Modified: head/usr.sbin/pciconf/cap.c == --- head/usr.sbin/pciconf/cap.c Tue Jul 23 16:27:36 2019(r350244) +++ head/usr.sbin/pciconf/cap.c Tue Jul 23 16:28:17 2019(r350245) @@ -389,6 +389,8 @@ link_speed_string(uint8_t speed) return ("5.0"); case 3: return ("8.0"); + case 4: + return ("16.0"); default: return ("undef"); } ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r349055 - head/sys/net
On 2019-06-15 11:59, Marius Strobl wrote: On Sat, Jun 15, 2019 at 09:08:05AM -0400, Andrew Gallatin wrote: On 2019-06-15 07:07, Marius Strobl wrote: Author: marius Date: Sat Jun 15 11:07:41 2019 New Revision: 349055 Log: - Replace unused and only ever written to members of public iflib(9) structs with placeholders (in the latter case, IFLIB_MAX_TX_BYTES etc. are also only ever used for these write-only members if at all, so both these macros and members can just go). Using these spares may render it possible to merge certain iflib(9) fixes to stable/12. Otherwise, changes extending struct if_irq or struct if_shared_ctx in any way would break KBI as instances of these are allocated by the driver front-ends (by contrast, struct if_pkt_info as well as struct if_softc_ctx instances are provided by iflib(9) and, thus, may grow at least at the end without breaking KBI). Given the above, why replace ipi_tcp_sum in if_pkt_info with a spare? Given that if_pkt_info can grow, I would also expect it to be able to shrink. So I don't quite see why the spare is needed here. I also worry about carrying the other spares around forever. Yes, KBI-wise it should be also safe for instances of structures allocated by iflib(9) to shrink at the end (though shrinking structures usually isn't a concern when MFCing as such parts may just be omitted); changes altering the offsets of members would be a problem regarding KBI. Still, I don't like changing the size of publicly visible structures in stable branches without a real good reason even if such a change doesn't strictly break the KBI. So the plan is to MFC the spares but then to get rid of the ones whose removal doesn't break KBI in head. Marius Thanks, that makes sense Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r349055 - head/sys/net
On 2019-06-15 07:07, Marius Strobl wrote: Author: marius Date: Sat Jun 15 11:07:41 2019 New Revision: 349055 Log: - Replace unused and only ever written to members of public iflib(9) structs with placeholders (in the latter case, IFLIB_MAX_TX_BYTES etc. are also only ever used for these write-only members if at all, so both these macros and members can just go). Using these spares may render it possible to merge certain iflib(9) fixes to stable/12. Otherwise, changes extending struct if_irq or struct if_shared_ctx in any way would break KBI as instances of these are allocated by the driver front-ends (by contrast, struct if_pkt_info as well as struct if_softc_ctx instances are provided by iflib(9) and, thus, may grow at least at the end without breaking KBI). Given the above, why replace ipi_tcp_sum in if_pkt_info with a spare? Given that if_pkt_info can grow, I would also expect it to be able to shrink. So I don't quite see why the spare is needed here. I also worry about carrying the other spares around forever. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r348241 - head
On 2019-05-24 11:45, Mark Johnston wrote: Modernize the MAKE_JUST_KERNELS hint in the top-level makefile. It doesn't make sense to limit to -j12 anymore, build scalability is better than it used to be. Fold the hint into the description of the universe target. Reviewed by: imp Dumb question about this: Will it update toolchains, or just use what it can find? Thanks, Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r348109 - in head/sys/x86: include x86
On 2019-05-22 13:09, Andriy Gapon wrote: On 22/05/2019 16:44, Andrew Gallatin wrote: This is needed for AMD SMCA processors, as SMCA uses different MSR address for access MCA banks. Just curious, what is SMCA? " Scalable Machine Check Architecture " See https://www.nextplatform.com/2017/07/12/heart-amds-epyc-comeback-infinity-fabric/ Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r348109 - in head/sys/x86: include x86
Author: gallatin Date: Wed May 22 13:44:15 2019 New Revision: 348109 URL: https://svnweb.freebsd.org/changeset/base/348109 Log: x86 MCA: introduce MCA hooks for different vendor implementations This is needed for AMD SMCA processors, as SMCA uses different MSR address for access MCA banks. Use IA32 specific msr_ops as defualt, and use SMCA-specific msr_ops when on an SMCA-enabled processor Submitted by: chandu from amd dot com Reviewed by: cem Differential Revision:https://reviews.freebsd.org/D18055 Modified: head/sys/x86/include/specialreg.h head/sys/x86/x86/mca.c Modified: head/sys/x86/include/specialreg.h == --- head/sys/x86/include/specialreg.h Wed May 22 08:30:03 2019 (r348108) +++ head/sys/x86/include/specialreg.h Wed May 22 13:44:15 2019 (r348109) @@ -944,6 +944,16 @@ #defineMC_MISC_AMD_PTR_MASK0xff00 /* Pointer to additional registers */ #defineMC_MISC_AMD_PTR_SHIFT 24 +/* AMD Scalable MCA */ +#define MSR_SMCA_MC0_CTL 0xc0002000 +#define MSR_SMCA_MC0_STATUS 0xc0002001 +#define MSR_SMCA_MC0_ADDR 0xc0002002 +#define MSR_SMCA_MC0_MISC00xc0002003 +#define MSR_SMCA_MC_CTL(x) (MSR_SMCA_MC0_CTL + 0x10 * (x)) +#define MSR_SMCA_MC_STATUS(x)(MSR_SMCA_MC0_STATUS + 0x10 * (x)) +#define MSR_SMCA_MC_ADDR(x) (MSR_SMCA_MC0_ADDR + 0x10 * (x)) +#define MSR_SMCA_MC_MISC(x) (MSR_SMCA_MC0_MISC0 + 0x10 * (x)) + /* * The following four 3-byte registers control the non-cacheable regions. * These registers must be written as three separate bytes. Modified: head/sys/x86/x86/mca.c == --- head/sys/x86/x86/mca.c Wed May 22 08:30:03 2019(r348108) +++ head/sys/x86/x86/mca.c Wed May 22 13:44:15 2019(r348109) @@ -90,6 +90,13 @@ struct mca_internal { STAILQ_ENTRY(mca_internal) link; }; +struct mca_enumerator_ops { +unsigned int (*ctl)(int); +unsigned int (*status)(int); +unsigned int (*addr)(int); +unsigned int (*misc)(int); +}; + static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture"); static volatile int mca_count; /* Number of records stored. 
*/ @@ -124,6 +131,61 @@ static struct taskqueue *mca_tq; static struct task mca_refill_task, mca_scan_task; static struct mtx mca_lock; +static unsigned int +mca_ia32_ctl_reg(int bank) +{ + return (MSR_MC_CTL(bank)); +} + +static unsigned int +mca_ia32_status_reg(int bank) +{ + return (MSR_MC_STATUS(bank)); +} + +static unsigned int +mca_ia32_addr_reg(int bank) +{ + return (MSR_MC_ADDR(bank)); +} + +static unsigned int +mca_ia32_misc_reg(int bank) +{ + return (MSR_MC_MISC(bank)); +} + +static unsigned int +mca_smca_ctl_reg(int bank) +{ +return (MSR_SMCA_MC_CTL(bank)); +} + +static unsigned int +mca_smca_status_reg(int bank) +{ +return (MSR_SMCA_MC_STATUS(bank)); +} + +static unsigned int +mca_smca_addr_reg(int bank) +{ +return (MSR_SMCA_MC_ADDR(bank)); +} + +static unsigned int +mca_smca_misc_reg(int bank) +{ +return (MSR_SMCA_MC_MISC(bank)); +} + +static struct mca_enumerator_ops mca_msr_ops = { +.ctl= mca_ia32_ctl_reg, +.status = mca_ia32_status_reg, +.addr = mca_ia32_addr_reg, +.misc = mca_ia32_misc_reg +}; + #ifdef DEV_APIC static struct cmc_state **cmc_state; /* Indexed by cpuid, bank. */ static struct amd_et_state **amd_et_state; /* Indexed by cpuid, bank. */ @@ -462,7 +524,7 @@ mca_check_status(int bank, struct mca_record *rec) uint64_t status; u_int p[4]; - status = rdmsr(MSR_MC_STATUS(bank)); + status = rdmsr(mca_msr_ops.status(bank)); if (!(status & MC_STATUS_VAL)) return (0); @@ -471,10 +533,10 @@ mca_check_status(int bank, struct mca_record *rec) rec->mr_bank = bank; rec->mr_addr = 0; if (status & MC_STATUS_ADDRV) - rec->mr_addr = rdmsr(MSR_MC_ADDR(bank)); + rec->mr_addr = rdmsr(mca_msr_ops.addr(bank)); rec->mr_misc = 0; if (status & MC_STATUS_MISCV) - rec->mr_misc = rdmsr(MSR_MC_MISC(bank)); + rec->mr_misc = rdmsr(mca_msr_ops.misc(bank)); rec->mr_tsc = rdtsc(); rec->mr_apic_id = PCPU_GET(apic_id); rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP); @@ -488,7 +550,7 @@ mca_check_status(int bank, struct mca_record *rec) * errors so that the BIOS can see them. 
*/ if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) { - wrmsr(MSR_MC_STATUS(bank), 0); + wrmsr(mca_msr_ops.status(bank), 0); do_cpuid(0, p); } return (1); @@ -648,7 +710,7 @@ amd_thresholding_update(enum scan_mode mode, int bank, int count; cc =
Re: svn commit: r347410 - in head: . sys/amd64/conf sys/arm/conf sys/arm64/conf sys/i386/conf sys/powerpc/conf sys/riscv/conf sys/sparc64/conf
On 2019-05-10 11:50, Kristof Provost wrote: On 10 May 2019, at 8:31, Andrew Gallatin wrote: On 2019-05-10 08:44, Slawa Olhovchenkov wrote: pf have ifdef for IPSEC, but don't have support IPSEC_SUPPORT (netpfil/pf/if_pfsync.c). Thanks for pointing this out. It seems like IPSEC_SUPPORT would work for this. I've made a patch, and it compiles and the pf module loads. However, I have no knowledge of how to test it. Is this something that you use, and which you can test? I suspect this code has not actually been enabled for a long time. gettdb() doesn’t actually appear to be defined anywhere, so I wouldn’t expect it to ever compile. gettdb() does exist in OpenBSD, so my current guess is that this is just an import artefact, and we should |#ifdef OPENBSD| it or something, or just remove it completely. For completeness, and because I never shut up about this: to test pf |kldload pfsync|, |cd /usr/tests/sys/netpfil/pf| and |sudo kyua test| There’s more information in the current edition of the FreeBSD journal. Regards, Kristof Thanks, you are correct. Including options_ipsec.h reveals that the code does not even compile (cannot find gettdb(), which does not appear to be defined anywhere in our tree). Given that it is dead code, I'd rather just not touch it. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r347410 - in head: . sys/amd64/conf sys/arm/conf sys/arm64/conf sys/i386/conf sys/powerpc/conf sys/riscv/conf sys/sparc64/conf
On 2019-05-10 08:44, Slawa Olhovchenkov wrote: pf have ifdef for IPSEC, but don't have support IPSEC_SUPPORT (netpfil/pf/if_pfsync.c). Thanks for pointing this out. It seems like IPSEC_SUPPORT would work for this. I've made a patch, and it compiles and the pf module loads. However, I have no knowledge of how to test it. Is this something that you use, and which you can test? Thanks, Drew diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c index 45b1e090f95c..cc06637b862e 100644 --- a/sys/netpfil/pf/if_pfsync.c +++ b/sys/netpfil/pf/if_pfsync.c @@ -308,7 +308,7 @@ static void pfsync_bulk_update(void *); static void pfsync_bulk_fail(void *); static void pfsync_detach_ifnet(struct ifnet *); -#ifdef IPSEC +#ifdef IPSEC_SUPPORT static void pfsync_update_net_tdb(struct pfsync_tdb *); #endif static struct pfsync_bucket *pfsync_get_bucket(struct pfsync_softc *, @@ -1228,7 +1228,7 @@ pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { int len = count * sizeof(struct pfsync_tdb); -#if defined(IPSEC) +#if defined(IPSEC_SUPPORT) struct pfsync_tdb *tp; struct mbuf *mp; int offp; @@ -1249,7 +1249,7 @@ pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) return (len); } -#if defined(IPSEC) +#if defined(IPSEC_SUPPORT) /* Update an in-kernel tdb. Silently fail if no tdb is found. */ static void pfsync_update_net_tdb(struct pfsync_tdb *pt) ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r347430 - in head/sys: kern netinet sys
Author: gallatin Date: Fri May 10 13:41:19 2019 New Revision: 347430 URL: https://svnweb.freebsd.org/changeset/base/347430 Log: Bind TCP HPTS (pacer) threads to NUMA domains Bind the TCP pacer threads to NUMA domains and build per-domain pacer-thread lookup tables. These tables allow us to use the inpcb's NUMA domain information to match an inpcb with a pacer thread on the same domain. The motivation for this is to keep the TCP connection local to a NUMA domain as much as possible. Thanks to jhb for pre-reviewing an earlier version of the patch. Reviewed by: rrs Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D20134 Modified: head/sys/kern/kern_intr.c head/sys/netinet/tcp_hpts.c head/sys/sys/interrupt.h Modified: head/sys/kern/kern_intr.c == --- head/sys/kern/kern_intr.c Fri May 10 13:18:22 2019(r347429) +++ head/sys/kern/kern_intr.c Fri May 10 13:41:19 2019(r347430) @@ -380,6 +380,25 @@ intr_event_bind_ithread(struct intr_event *ie, int cpu return (_intr_event_bind(ie, cpu, false, true)); } +/* + * Bind an interrupt event's ithread to the specified cpuset. 
+ */ +int +intr_event_bind_ithread_cpuset(struct intr_event *ie, cpuset_t *cs) +{ + lwpid_t id; + + mtx_lock(>ie_lock); + if (ie->ie_thread != NULL) { + id = ie->ie_thread->it_thread->td_tid; + mtx_unlock(>ie_lock); + return (cpuset_setthread(id, cs)); + } else { + mtx_unlock(>ie_lock); + } + return (ENODEV); +} + static struct intr_event * intr_lookup(int irq) { Modified: head/sys/netinet/tcp_hpts.c == --- head/sys/netinet/tcp_hpts.c Fri May 10 13:18:22 2019(r347429) +++ head/sys/netinet/tcp_hpts.c Fri May 10 13:41:19 2019(r347430) @@ -131,6 +131,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -171,7 +172,7 @@ MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #include static int tcp_bind_threads = 1; #else -static int tcp_bind_threads = 0; +static int tcp_bind_threads = 2; #endif TUNABLE_INT("net.inet.tcp.bind_hptss", _bind_threads); @@ -207,6 +208,13 @@ static int32_t logging_on = 0; static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2); static int32_t tcp_hpts_precision = 120; +struct hpts_domain_info { + int count; + int cpu[MAXCPU]; +}; + +struct hpts_domain_info hpts_domains[MAXMEMDOM]; + SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, _hpts_precision, 120, "Value for PRE() precision of callout"); @@ -1079,8 +1087,10 @@ hpts_random_cpu(struct inpcb *inp){ static uint16_t hpts_cpuid(struct inpcb *inp){ u_int cpuid; +#ifdef NUMA + struct hpts_domain_info *di; +#endif - /* * If one has been set use it i.e. we want both in and out on the * same hpts. @@ -1103,11 +1113,21 @@ hpts_cpuid(struct inpcb *inp){ * unknown cpuids to curcpu. Not the best, but apparently better * than defaulting to swi 0. */ - if (inp->inp_flowtype != M_HASHTYPE_NONE) { + + if (inp->inp_flowtype == M_HASHTYPE_NONE) + return (hpts_random_cpu(inp)); + /* +* Hash to a thread based on the flowid. If we are using numa, +* then restrict the hash to the numa domain where the inp lives. 
+*/ +#ifdef NUMA + if (tcp_bind_threads == 2 && inp->inp_numa_domain != M_NODOM) { + di = _domains[inp->inp_numa_domain]; + cpuid = di->cpu[inp->inp_flowid % di->count]; + } else +#endif cpuid = inp->inp_flowid % mp_ncpus; - return (cpuid); - } - cpuid = hpts_random_cpu(inp); + return (cpuid); #endif } @@ -1781,8 +1801,11 @@ tcp_init_hptsi(void *st) struct timeval tv; sbintime_t sb; struct tcp_hpts_entry *hpts; + struct pcpu *pc; + cpuset_t cs; char unit[16]; uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU; + int count, domain; tcp_pace.rp_proc = NULL; tcp_pace.rp_num_hptss = ncpus; @@ -1861,6 +1884,11 @@ tcp_init_hptsi(void *st) } callout_init(>co, 1); } + + /* Don't try to bind to NUMA domains if we don't have any */ + if (vm_ndomains == 1 && tcp_bind_threads == 2) + tcp_bind_threads = 0; + /* * Now lets start ithreads to handle the hptss. */ @@ -1875,9 +1903,20 @@ tcp_init_hptsi(void *st) hpts, i, error); } created++; - if (tcp_bind_threads) { + if (tcp_bind_threads == 1) { if (intr_event_bind(hpts->ie, i) == 0)
svn commit: r347410 - in head: . sys/amd64/conf sys/arm/conf sys/arm64/conf sys/i386/conf sys/powerpc/conf sys/riscv/conf sys/sparc64/conf
Author: gallatin Date: Thu May 9 22:38:15 2019 New Revision: 347410 URL: https://svnweb.freebsd.org/changeset/base/347410 Log: Remove IPSEC from GENERIC due to performance issues Having IPSEC compiled into the kernel imposes a non-trivial performance penalty on multi-threaded workloads due to IPSEC refcounting. In my benchmarks of multi-threaded UDP transmit (connected sockets), I've seen a roughly 20% performance penalty when the IPSEC option is included in the kernel (16.8Mpps vs 13.8Mpps with 32 senders on a 14 core / 28 HTT Xeon 2697v3). This is largely due to key_addref() incrementing and decrementing an atomic reference count on the default policy. This causes all CPUs to stall on the same cacheline, as it bounces between different CPUs. Given that relatively few users use ipsec, and that it can be loaded as a module, it seems reasonable to ask those users to load the ipsec module so as to avoid imposing this penalty on the GENERIC kernel. It's my hope that this will make FreeBSD look better in "out of the box" benchmark comparisons with other operating systems. Many thanks to ae for fixing auto-loading of ipsec.ko when ifconfig tries to configure ipsec, and to cy for volunteering to ensure the racoon ports will load the ipsec.ko module. Reviewed by: cem, cy, delphij, gnn, jhb, jpaetzel Differential Revision:https://reviews.freebsd.org/D20163 Modified: head/UPDATING head/sys/amd64/conf/GENERIC head/sys/arm/conf/std.armv6 head/sys/arm/conf/std.armv7 head/sys/arm64/conf/GENERIC head/sys/i386/conf/GENERIC head/sys/powerpc/conf/GENERIC head/sys/powerpc/conf/GENERIC64 head/sys/riscv/conf/GENERIC head/sys/sparc64/conf/GENERIC Modified: head/UPDATING == --- head/UPDATING Thu May 9 22:31:47 2019(r347409) +++ head/UPDATING Thu May 9 22:38:15 2019(r347410) @@ -32,6 +32,10 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 13.x IS SLOW: "ln -s 'abort:false,junk:false' /etc/malloc.conf".) 20190507: + The IPSEC option has been removed from GENERIC. 
Users requiring + ipsec(4) must now load the ipsec(4) kernel module. + +20190507: The tap(4) driver has been folded into tun(4), and the module has been renamed to tuntap. You should update any kld_load="if_tap" or kld_load="if_tun" entries in /etc/rc.conf, if_tap_load="YES" or Modified: head/sys/amd64/conf/GENERIC == --- head/sys/amd64/conf/GENERIC Thu May 9 22:31:47 2019(r347409) +++ head/sys/amd64/conf/GENERIC Thu May 9 22:38:15 2019(r347410) @@ -30,7 +30,6 @@ options PREEMPTION # Enable kernel thread preemption optionsVIMAGE # Subsystem virtualization, e.g. VNET optionsINET# InterNETworking optionsINET6 # IPv6 communications protocols -optionsIPSEC # IP (v4/v6) security optionsIPSEC_SUPPORT # Allow kldload of ipsec and tcpmd5 optionsTCP_OFFLOAD # TCP offload optionsTCP_BLACKBOX# Enhanced TCP event logging Modified: head/sys/arm/conf/std.armv6 == --- head/sys/arm/conf/std.armv6 Thu May 9 22:31:47 2019(r347409) +++ head/sys/arm/conf/std.armv6 Thu May 9 22:38:15 2019(r347410) @@ -11,7 +11,7 @@ options INET# InterNETworking optionsINET6 # IPv6 communications protocols optionsTCP_HHOOK # hhook(9) framework for TCP device crypto # core crypto support -optionsIPSEC # IP (v4/v6) security +optionsIPSEC_SUPPORT # Allow kldload of ipsec and tcpmd5 optionsSCTP# Stream Control Transmission Protocol optionsFFS # Berkeley Fast Filesystem optionsSOFTUPDATES # Enable FFS soft updates support Modified: head/sys/arm/conf/std.armv7 == --- head/sys/arm/conf/std.armv7 Thu May 9 22:31:47 2019(r347409) +++ head/sys/arm/conf/std.armv7 Thu May 9 22:38:15 2019(r347410) @@ -11,7 +11,7 @@ options INET# InterNETworking optionsINET6 # IPv6 communications protocols optionsTCP_HHOOK # hhook(9) framework for TCP device crypto # core crypto support -optionsIPSEC # IP (v4/v6) security +optionsIPSEC_SUPPORT # Allow kldload of ipsec and tcpmd5 optionsSCTP# Stream Control Transmission Protocol optionsFFS
svn commit: r347055 - in head: sbin/ifconfig sys/net
Author: gallatin Date: Fri May 3 14:43:21 2019 New Revision: 347055 URL: https://svnweb.freebsd.org/changeset/base/347055 Log: Select lacp egress ports based on NUMA domain This change creates an array of port maps indexed by numa domain for lacp port selection. If we have lacp interfaces in more than one domain, then we select the egress port by indexing into the numa port maps and picking a port on the appropriate numa domain. This behavior is controlled by the new ifconfig use_numa flag and net.link.lagg.use_numa sysctl/tunable (both modeled after the existing use_flowid), which default to enabled. Reviewed by: bz, hselasky, markj (and scottl, earlier version) Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D20060 Modified: head/sbin/ifconfig/ifconfig.8 head/sbin/ifconfig/iflagg.c head/sys/net/ieee8023ad_lacp.c head/sys/net/ieee8023ad_lacp.h head/sys/net/if_lagg.c head/sys/net/if_lagg.h Modified: head/sbin/ifconfig/ifconfig.8 == --- head/sbin/ifconfig/ifconfig.8 Fri May 3 13:06:46 2019 (r347054) +++ head/sbin/ifconfig/ifconfig.8 Fri May 3 14:43:21 2019 (r347055) @@ -28,7 +28,7 @@ .\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94 .\" $FreeBSD$ .\" -.Dd June 27, 2018 +.Dd May 3, 2019 .Dt IFCONFIG 8 .Os .Sh NAME @@ -2497,6 +2497,22 @@ Use the RSS hash from the network card if available. Set a shift parameter for RSS local hash computation. Hash is calculated by using flowid bits in a packet header mbuf which are shifted by the number of this parameter. +.It Cm use_numa +Enable selection of egress ports based on the native +.Xr NUMA 4 +domain for the packets being transmitted. +This is currently only implemented for lacp mode. +This works only on +.Xr NUMA 4 +hardware, running a kernel compiled with the +.Xr NUMA 4 +option, and when interfaces from multiple +.Xr NUMA 4 +domains are ports of the aggregation interface. +.It Cm -use_numa +Disable selection of egress ports based on the native +.Xr NUMA 4 +domain for the packets being transmitted. 
.It Cm lacp_fast_timeout Enable lacp fast-timeout on the interface. .It Cm -lacp_fast_timeout Modified: head/sbin/ifconfig/iflagg.c == --- head/sbin/ifconfig/iflagg.c Fri May 3 13:06:46 2019(r347054) +++ head/sbin/ifconfig/iflagg.c Fri May 3 14:43:21 2019(r347055) @@ -130,6 +130,8 @@ setlaggsetopt(const char *val, int d, int s, const str switch (ro.ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: + case LAGG_OPT_USE_NUMA: + case -LAGG_OPT_USE_NUMA: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: case LAGG_OPT_LACP_TXTEST: @@ -303,6 +305,8 @@ static struct cmd lagg_cmds[] = { DEF_CMD_ARG("lagghash", setlagghash), DEF_CMD("use_flowid", LAGG_OPT_USE_FLOWID,setlaggsetopt), DEF_CMD("-use_flowid", -LAGG_OPT_USE_FLOWID, setlaggsetopt), + DEF_CMD("use_numa", LAGG_OPT_USE_NUMA, setlaggsetopt), + DEF_CMD("-use_numa",-LAGG_OPT_USE_NUMA, setlaggsetopt), DEF_CMD("lacp_strict", LAGG_OPT_LACP_STRICT, setlaggsetopt), DEF_CMD("-lacp_strict", -LAGG_OPT_LACP_STRICT, setlaggsetopt), DEF_CMD("lacp_txtest", LAGG_OPT_LACP_TXTEST, setlaggsetopt), Modified: head/sys/net/ieee8023ad_lacp.c == --- head/sys/net/ieee8023ad_lacp.c Fri May 3 13:06:46 2019 (r347054) +++ head/sys/net/ieee8023ad_lacp.c Fri May 3 14:43:21 2019 (r347055) @@ -835,7 +835,9 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf struct lacp_softc *lsc = LACP_SOFTC(sc); struct lacp_portmap *pm; struct lacp_port *lp; + struct lacp_port **map; uint32_t hash; + int count; if (__predict_false(lsc->lsc_suppress_distributing)) { LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); @@ -848,14 +850,32 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf return (NULL); } +#ifdef NUMA + if ((sc->sc_opts & LAGG_OPT_USE_NUMA) && + pm->pm_num_dom > 1 && m->m_pkthdr.numa_domain < MAXMEMDOM) { + count = pm->pm_numa[m->m_pkthdr.numa_domain].count; + if (count > 0) { + map = pm->pm_numa[m->m_pkthdr.numa_domain].map; + } else { + /* No ports on this domain; use global hash. 
*/ + map = pm->pm_map; + count = pm->pm_count; + } + } else +#endif + { + map = pm->pm_map; + count = pm->pm_count; + } if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
Re: svn commit: r346598 - head/sys/modules
On 2019-04-29 10:54, Emmanuel Vadot wrote: On Mon, 29 Apr 2019 10:49:01 -0400 Andrew Gallatin wrote: On 2019-04-29 10:21, Rodney W. Grimes wrote: On Tue, 23 Apr 2019 at 13:26, Rodney W. Grimes wrote: Very cool, now how do I get a PCIe slot into a RPI3!!! lol :-) I know you're joking but the comment does highlight an issue in the AArch64 world - there's a lack of good mid-range developer platforms. I may have been joking with respect to the RPI3, but at the same time I do know that the RockPro64 exists and does have that PCIe slot I want, I also know that Michael Dexter has one he would loan me should I wish to investigate our state of support. Does anybody know what PCIe Generation / speed that slot runs at? All I can find them saying is "PCIe x4", which implies Gen 1, 2.5GT/s speeds, which is not terribly useful. Gen2 or better would be enough to run 10GbE, which would be fun :) Drew It's PCIe 2.1 compatible. See http://rockchip.fr/Rockchip%20RK3399%20TRM%20V1.3%20Part2.pdf Everything I'm seeing there says Gen1 vs Gen2 depends on "PCIE_GENERATION_SEL", and that if it is set to 0, you get Gen1 2.5GT/s and if it is set to 1, you get Gen2, 5.0GT/s. But I don't see anything specifying this value for the RockPro64 board. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r346598 - head/sys/modules
On 2019-04-29 10:21, Rodney W. Grimes wrote: On Tue, 23 Apr 2019 at 13:26, Rodney W. Grimes wrote: Very cool, now how do I get a PCIe slot into a RPI3!!! lol :-) I know you're joking but the comment does highlight an issue in the AArch64 world - there's a lack of good mid-range developer platforms. I may have been joking with respect to the RPI3, but at the same time I do know that the RockPro64 exists and does have that PCIe slot I want, I also know that Michael Dexter has one he would loan me should I wish to investigate our state of support. Does anybody know what PCIe Generation / speed that slot runs at? All I can find them saying is "PCIe x4", which implies Gen 1, 2.5GT/s speeds, which is not terribly useful. Gen2 or better would be enough to run 10GbE, which would be fun :) Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r346677 - in head/sys: dev/cxgbe dev/mlx5/mlx5_en kern netinet netinet6
Author: gallatin Date: Thu Apr 25 15:37:28 2019 New Revision: 346677 URL: https://svnweb.freebsd.org/changeset/base/346677 Log: Track TCP connection's NUMA domain in the inpcb Drivers can now pass up numa domain information via the mbuf numa domain field. This information is then used by TCP syncache_socket() to associate that information with the inpcb. The domain information is then fed back into transmitted mbufs in ip{6}_output(). This mechanism is nearly identical to what is done to track RSS hash values in the inp_flowid. Follow on changes will use this information for lacp egress port selection, binding TCP pacers to the appropriate NUMA domain, etc. Reviewed by: markj, kib, slavash, bz, scottl, jtl, tuexen Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D20028 Modified: head/sys/dev/cxgbe/t4_sge.c head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c head/sys/kern/uipc_mbuf.c head/sys/netinet/in_pcb.c head/sys/netinet/in_pcb.h head/sys/netinet/ip_output.c head/sys/netinet/tcp_syncache.c head/sys/netinet6/ip6_output.c Modified: head/sys/dev/cxgbe/t4_sge.c == --- head/sys/dev/cxgbe/t4_sge.c Thu Apr 25 15:31:35 2019(r346676) +++ head/sys/dev/cxgbe/t4_sge.c Thu Apr 25 15:37:28 2019(r346677) @@ -2046,6 +2046,9 @@ t4_eth_rx(struct sge_iq *iq, const struct rss_header * rxq->vlan_extraction++; } +#ifdef NUMA + m0->m_pkthdr.numa_domain = ifp->if_numa_domain; +#endif #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED) { if (sort_before_lro(lro)) { Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c == --- head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c Thu Apr 25 15:31:35 2019 (r346676) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c Thu Apr 25 15:37:28 2019 (r346677) @@ -520,6 +520,9 @@ rx_common: mlx5e_build_rx_mbuf(cqe, rq, mb, byte_cnt); rq->stats.bytes += byte_cnt; rq->stats.packets++; +#ifdef NUMA + mb->m_pkthdr.numa_domain = rq->ifp->if_numa_domain; +#endif #if !defined(HAVE_TCP_LRO_RX) tcp_lro_queue_mbuf(>lro, mb); Modified: head/sys/kern/uipc_mbuf.c 
== --- head/sys/kern/uipc_mbuf.c Thu Apr 25 15:31:35 2019(r346676) +++ head/sys/kern/uipc_mbuf.c Thu Apr 25 15:37:28 2019(r346677) @@ -341,6 +341,9 @@ m_pkthdr_init(struct mbuf *m, int how) #endif m->m_data = m->m_pktdat; bzero(>m_pkthdr, sizeof(m->m_pkthdr)); +#ifdef NUMA + m->m_pkthdr.numa_domain = M_NODOM; +#endif #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); Modified: head/sys/netinet/in_pcb.c == --- head/sys/netinet/in_pcb.c Thu Apr 25 15:31:35 2019(r346676) +++ head/sys/netinet/in_pcb.c Thu Apr 25 15:37:28 2019(r346677) @@ -510,6 +510,9 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbin if (inp == NULL) return (ENOBUFS); bzero(>inp_start_zero, inp_zero_size); +#ifdef NUMA + inp->inp_numa_domain = M_NODOM; +#endif inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); Modified: head/sys/netinet/in_pcb.h == --- head/sys/netinet/in_pcb.h Thu Apr 25 15:31:35 2019(r346676) +++ head/sys/netinet/in_pcb.h Thu Apr 25 15:37:28 2019(r346677) @@ -272,7 +272,7 @@ struct inpcb { inp_hpts_calls :1, /* (i) from output hpts */ inp_input_calls :1,/* (i) from input hpts */ inp_spare_bits2 : 4; - uint8_t inp_spare_byte; /* Compiler hole */ + uint8_t inp_numa_domain;/* numa domain */ void*inp_ppcb; /* (i) pointer to per-protocol pcb */ struct socket *inp_socket; /* (i) back pointer to socket */ uint32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i) */ Modified: head/sys/netinet/ip_output.c == --- head/sys/netinet/ip_output.cThu Apr 25 15:31:35 2019 (r346676) +++ head/sys/netinet/ip_output.cThu Apr 25 15:37:28 2019 (r346677) @@ -247,6 +247,9 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct rou m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } +#ifdef NUMA + m->m_pkthdr.numa_domain = inp->inp_numa_domain; +#endif
svn commit: r346632 - head/sys/net
Author: gallatin Date: Wed Apr 24 13:32:04 2019 New Revision: 346632 URL: https://svnweb.freebsd.org/changeset/base/346632 Log: iflib: Add pfil hooks As with mlx5en, the idea is to drop unwanted traffic as early in receive as possible, before mbufs are allocated and anything is passed up the stack. This can save considerable CPU time when a machine is under a flooding style DOS attack. The major change here is to remove the unneeded abstraction where callers of rxd_frag_to_sd() get back a pointer to the mbuf ring, and are responsible for NULL'ing that mbuf themselves. Now this happens directly in rxd_frag_to_sd(), and it returns an mbuf. This allows us to use the decision (and potentially mbuf) returned by the pfil hooks. The driver can now recycle mbufs to avoid re-allocation when packets are dropped. Reviewed by: marius (shurd and erj also provided feedback) Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D19645 Modified: head/sys/net/iflib.c Modified: head/sys/net/iflib.c == --- head/sys/net/iflib.cWed Apr 24 13:15:56 2019(r346631) +++ head/sys/net/iflib.cWed Apr 24 13:32:04 2019(r346632) @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -432,6 +433,7 @@ struct iflib_rxq { if_ctx_tifr_ctx; iflib_fl_t ifr_fl; uint64_tifr_rx_irq; + struct pfil_head*pfil; uint16_tifr_id; uint8_t ifr_lro_enabled; uint8_t ifr_nfl; @@ -451,7 +453,6 @@ struct iflib_rxq { typedef struct if_rxsd { caddr_t *ifsd_cl; - struct mbuf **ifsd_m; iflib_fl_t ifsd_fl; qidx_t ifsd_cidx; } *if_rxsd_t; @@ -652,7 +653,6 @@ static int iflib_fast_intrs; static int iflib_rx_unavail; static int iflib_rx_ctx_inactive; static int iflib_rx_if_input; -static int iflib_rx_mbuf_null; static int iflib_rxd_flush; static int iflib_verbose_debug; @@ -669,8 +669,6 @@ SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLF _rx_ctx_inactive, 0, "# times rxeof called with inactive context"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, 
_rx_if_input, 0, "# times rxeof called if_input"); -SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD, - _rx_mbuf_null, 0, "# times rxeof got null mbuf"); SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD, _rxd_flush, 0, "# times rxd_flush called"); SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW, @@ -689,7 +687,7 @@ iflib_debug_reset(void) iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs = iflib_rx_unavail = iflib_rx_ctx_inactive = iflib_rx_if_input = - iflib_rx_mbuf_null = iflib_rxd_flush = 0; + iflib_rxd_flush = 0; } #else @@ -2002,11 +2000,12 @@ _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int coun bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx], BUS_DMASYNC_PREREAD); - MPASS(sd_m[frag_idx] == NULL); - if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { - break; + if (sd_m[frag_idx] == NULL) { + if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { + break; + } + sd_m[frag_idx] = m; } - sd_m[frag_idx] = m; bit_set(fl->ifl_rx_bitmap, frag_idx); #if MEMORY_LOGGING fl->ifl_m_enqueued++; @@ -2483,13 +2482,15 @@ prefetch_pkts(iflib_fl_t fl, int cidx) prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]); } -static void -rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd) +static struct mbuf * +rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd, +int *pf_rv, if_rxd_info_t ri) { - int flid, cidx; bus_dmamap_t map; iflib_fl_t fl; - int next; + caddr_t payload; + struct mbuf *m; + int flid, cidx, len, next; map = NULL; flid = irf->irf_flid; @@ -2497,7 +2498,7 @@ rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int fl = >ifr_fl[flid]; sd->ifsd_fl = fl; sd->ifsd_cidx = cidx; - sd->ifsd_m = >ifl_sds.ifsd_m[cidx]; + m = fl->ifl_sds.ifsd_m[cidx]; sd->ifsd_cl = >ifl_sds.ifsd_cl[cidx]; fl->ifl_credits--; #if MEMORY_LOGGING @@ -2513,39 +2514,89 @@ rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int /* not valid assert if bxe really does SGE from non-contiguous elements */ 
MPASS(fl->ifl_cidx == cidx); bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD); + + if (rxq->pfil !=
svn commit: r346579 - in head: share/man/man9 sys/dev/cxgbe sys/dev/mlx5/mlx5_en sys/net
Author: gallatin Date: Mon Apr 22 19:24:21 2019 New Revision: 346579 URL: https://svnweb.freebsd.org/changeset/base/346579 Log: Track device's NUMA domain in ifnet & alloc ifnet from NUMA local memory This commit adds new if_alloc_domain() and if_alloc_dev() methods to allocate ifnets. When called with a domain on a NUMA machine, if_alloc_domain() will record the NUMA domain in the ifnet, and it will allocate the ifnet struct from memory which is local to that NUMA node. Similarly, if_alloc_dev() is a wrapper for if_alloc_domain() which uses a driver supplied device_t to call if_alloc_domain() with the appropriate domain. Note that the new if_numa_domain field fits in an alignment pad in struct ifnet, and so does not alter the size of the structure. Reviewed by: glebius, kib, markj Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D19930 Modified: head/share/man/man9/Makefile head/share/man/man9/ifnet.9 head/sys/dev/cxgbe/t4_main.c head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c head/sys/net/if.c head/sys/net/if_var.h Modified: head/share/man/man9/Makefile == --- head/share/man/man9/MakefileMon Apr 22 19:21:35 2019 (r346578) +++ head/share/man/man9/MakefileMon Apr 22 19:24:21 2019 (r346579) @@ -1175,6 +1175,8 @@ MLINKS+=iflibtxrx.9 isc_rxd_available.9 \ iflibtxrx.9 isc_txd_flush.9 MLINKS+=ifnet.9 if_addmulti.9 \ ifnet.9 if_alloc.9 \ + ifnet.9 if_alloc_dev.9 \ + ifnet.9 if_alloc_domain.9 \ ifnet.9 if_allmulti.9 \ ifnet.9 if_attach.9 \ ifnet.9 if_data.9 \ Modified: head/share/man/man9/ifnet.9 == --- head/share/man/man9/ifnet.9 Mon Apr 22 19:21:35 2019(r346578) +++ head/share/man/man9/ifnet.9 Mon Apr 22 19:24:21 2019(r346579) @@ -48,6 +48,10 @@ .Ss "Interface Manipulation Functions" .Ft "struct ifnet *" .Fn if_alloc "u_char type" +.Ft "struct ifnet *" +.Fn if_alloc_dev "u_char type" "device_t dev" +.Ft "struct ifnet *" +.Fn if_alloc_domain "u_char type" "int numa_domain" .Ft void .Fn if_attach "struct ifnet *ifp" .Ft void @@ -440,6 +444,15 @@ It is used to 
cache the type passed to but unlike .Va if_type , it would not be changed by drivers. +.It Va if_numa_domain +.Pq Vt uint8_t +The NUMA domain of the hardware device associated with the interface. +This is filled in with a wildcard value unless the kernel is NUMA +aware, the system is a NUMA system, and the ifnet is allocated +using +.Fn if_alloc_dev +or +.Fn if_alloc_domain . .El .Pp References to @@ -1151,6 +1164,24 @@ include the allocation of a .Fa type specific structure in .Va if_l2com . +.It Fn if_alloc_dev +Allocate and initialize +.Vt "struct ifnet" +as +.Fn if_alloc +does, with the addition that the ifnet can be tagged with the +appropriate NUMA domain derived from the +.Fa dev +argument passed by the caller. +.It Fn if_alloc_domain +Allocate and initialize +.Vt "struct ifnet" +as +.Fn if_alloc +does, with the addition that the ifnet will be tagged with the NUMA +domain via the +.Fa numa_domain +argument passed by the caller. .It Fn if_attach Link the specified interface .Fa ifp @@ -1168,7 +1199,10 @@ function.) The .Fa ifp must have been allocated by -.Fn if_alloc . +.Fn if_alloc , +.Fn if_alloc_dev +or +.Fn if_alloc_domain . 
.It Fn if_detach Shut down and unlink the specified .Fa ifp Modified: head/sys/dev/cxgbe/t4_main.c == --- head/sys/dev/cxgbe/t4_main.cMon Apr 22 19:21:35 2019 (r346578) +++ head/sys/dev/cxgbe/t4_main.cMon Apr 22 19:24:21 2019 (r346579) @@ -1636,7 +1636,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi) callout_init(>tick, 1); /* Allocate an ifnet and set it up */ - ifp = if_alloc(IFT_ETHER); + ifp = if_alloc_dev(IFT_ETHER, dev); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c == --- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 22 19:21:35 2019 (r346578) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 22 19:24:21 2019 (r346579) @@ -3682,7 +3682,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev) M_MLX5EN, M_WAITOK | M_ZERO); mlx5e_priv_mtx_init(priv); - ifp = priv->ifp = if_alloc(IFT_ETHER); + ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev); if (ifp == NULL) { mlx5_core_err(mdev, "if_alloc() failed\n"); goto err_free_priv; Modified: head/sys/net/if.c
svn commit: r346281 - head/sys/sys
Author: gallatin Date: Tue Apr 16 16:49:34 2019 New Revision: 346281 URL: https://svnweb.freebsd.org/changeset/base/346281 Log: Replace cosqos with numa_domain in mbuf pkthdr The cosqos field was added nearly 6 years ago in r254804, and it is still unused by any in-tree consumers. I have a patchset that I'm working on which aligns many network resources by NUMA domain, including inps, inpcb lb group, tcp pacing, lagg output link selection, backing pages for sendfile, and more. It reduces cross-domain traffic by roughly 50% for a real web workload. This patchset relies on being able to store the numa domain in the mbuf, and grabbing the unused cosqos field for this purpose is the first step in starting to upstream it. Reviewed by: kib, markj Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D19862 Modified: head/sys/sys/mbuf.h Modified: head/sys/sys/mbuf.h == --- head/sys/sys/mbuf.h Tue Apr 16 15:52:04 2019(r346280) +++ head/sys/sys/mbuf.h Tue Apr 16 16:49:34 2019(r346281) @@ -98,6 +98,7 @@ struct mbuf; #defineMLEN((int)(MSIZE - MHSIZE)) #defineMHLEN ((int)(MSIZE - MPKTHSIZE)) #defineMINCLSIZE (MHLEN + 1) +#defineM_NODOM 255 #ifdef _KERNEL /*- @@ -158,7 +159,7 @@ struct pkthdr { uint32_t flowid;/* packet's 4-tuple system */ uint32_t csum_flags;/* checksum and offload features */ uint16_t fibnum;/* this packet should use this fib */ - uint8_t cosqos;/* class/quality of service */ + uint8_t numa_domain; /* NUMA domain of recvd pkt */ uint8_t rsstype; /* hash type */ union { uint64_trcv_tstmp; /* timestamp in ns */ @@ -405,33 +406,6 @@ struct mbuf { #defineM_HASHTYPE_SET(m, v)((m)->m_pkthdr.rsstype = (v)) #defineM_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) #defineM_HASHTYPE_ISHASH(m)(M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP) - -/* - * COS/QOS class and quality of service tags. - * It uses DSCP code points as base. 
- */ -#defineQOS_DSCP_CS00x00 -#defineQOS_DSCP_DEFQOS_DSCP_CS0 -#defineQOS_DSCP_CS10x20 -#defineQOS_DSCP_AF11 0x28 -#defineQOS_DSCP_AF12 0x30 -#defineQOS_DSCP_AF13 0x38 -#defineQOS_DSCP_CS20x40 -#defineQOS_DSCP_AF21 0x48 -#defineQOS_DSCP_AF22 0x50 -#defineQOS_DSCP_AF23 0x58 -#defineQOS_DSCP_CS30x60 -#defineQOS_DSCP_AF31 0x68 -#defineQOS_DSCP_AF32 0x70 -#defineQOS_DSCP_AF33 0x78 -#defineQOS_DSCP_CS40x80 -#defineQOS_DSCP_AF41 0x88 -#defineQOS_DSCP_AF42 0x90 -#defineQOS_DSCP_AF43 0x98 -#defineQOS_DSCP_CS50xa0 -#defineQOS_DSCP_EF 0xb8 -#defineQOS_DSCP_CS60xc0 -#defineQOS_DSCP_CS70xe0 /* * External mbuf storage buffer types. ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r346247 - head/sys/dev/mlx5/mlx5_en
Author: gallatin Date: Mon Apr 15 17:14:50 2019 New Revision: 346247 URL: https://svnweb.freebsd.org/changeset/base/346247 Log: mlx5en: Enable new pfil(9) KPI ethernet filtering hooks This allows efficient filtering at packet ingress on mlx5en. Note that the packets are filtered (and potentially dropped) *before* the driver has committed to (re)allocating an mbuf for the packet. Dropped packets are treated essentially the same as an error. Nothing is allocated, and the existing buffer is recycled. This allows us to drop malicious packets at close to line rate with very little CPU use. Reviewed by: hselasky, slavash, kib Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D19063 Modified: head/sys/dev/mlx5/mlx5_en/en.h head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c Modified: head/sys/dev/mlx5/mlx5_en/en.h == --- head/sys/dev/mlx5/mlx5_en/en.h Mon Apr 15 16:57:27 2019 (r346246) +++ head/sys/dev/mlx5/mlx5_en/en.h Mon Apr 15 17:14:50 2019 (r346247) @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -838,6 +839,7 @@ struct mlx5e_priv { struct mlx5e_clbr_point clbr_points[2]; u_int clbr_gen; + struct pfil_head *pfil; struct mlx5e_channel channel[]; }; Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c == --- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 15 16:57:27 2019 (r346246) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 15 17:14:50 2019 (r346247) @@ -3664,6 +3664,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev) struct sysctl_oid_list *child; int ncv = mdev->priv.eq_table.num_comp_vectors; char unit[16]; + struct pfil_head_args pa; int err; int i; u32 eth_proto_cap; @@ -3898,6 +3899,12 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev) callout_init(>tstmp_clbr, CALLOUT_DIRECT); mlx5e_reset_calibration_callout(priv); + pa.pa_version = PFIL_VERSION; + pa.pa_flags = PFIL_IN; + pa.pa_type = PFIL_TYPE_ETHERNET; + pa.pa_headname = ifp->if_xname; + priv->pfil = pfil_head_register(); + return 
(priv); #ifdef RATELIMIT @@ -3972,6 +3979,12 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vp if_printf(priv->ifp, "Waiting for all unlimited connections " "to terminate\n"); pause("W", hz); + } + + /* deregister pfil */ + if (priv->pfil != NULL) { + pfil_head_unregister(priv->pfil); + priv->pfil = NULL; } /* unregister device */ Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c == --- head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c Mon Apr 15 16:57:27 2019 (r346246) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c Mon Apr 15 17:14:50 2019 (r346247) @@ -430,15 +430,18 @@ mlx5e_decompress_cqes(struct mlx5e_cq *cq) static int mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget) { - int i; + struct pfil_head *pfil; + int i, rv; + CURVNET_SET_QUIET(rq->ifp->if_vnet); + pfil = rq->channel->priv->pfil; for (i = 0; i < budget; i++) { struct mlx5e_rx_wqe *wqe; struct mlx5_cqe64 *cqe; struct mbuf *mb; __be16 wqe_counter_be; u16 wqe_counter; - u32 byte_cnt; + u32 byte_cnt, seglen; cqe = mlx5e_get_cqe(>cq); if (!cqe) @@ -462,6 +465,39 @@ mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget) rq->stats.wqe_err++; goto wq_ll_pop; } + if (pfil != NULL && PFIL_HOOKED_IN(pfil)) { + seglen = MIN(byte_cnt, MLX5E_MAX_RX_BYTES); + rv = pfil_run_hooks(rq->channel->priv->pfil, + rq->mbuf[wqe_counter].data, rq->ifp, + seglen | PFIL_MEMPTR | PFIL_IN, NULL); + + switch (rv) { + case PFIL_DROPPED: + case PFIL_CONSUMED: + /* +* Filter dropped or consumed it. In +* either case, we can just recycle +* buffer; there is no more work to do. +*/ + rq->stats.packets++; + goto wq_ll_pop; + case PFIL_REALLOCED: + /* +* Filter copied it; recycle buffer +
svn commit: r345273 - head/sys/kern
Author: gallatin Date: Mon Mar 18 12:41:42 2019 New Revision: 345273 URL: https://svnweb.freebsd.org/changeset/base/345273 Log: Fix a typo introduced in r344133 The line was misedited to change tt to st instead of changing ut to st. The use of st as the denominator in mul64_by_fraction() will lead to an integer divide fault in the intr proc (the process holding ithreads) where st will be 0. This divide by 0 happens after the total runtime for all ithreads exceeds 76 hours. Submitted by: bde Modified: head/sys/kern/kern_resource.c Modified: head/sys/kern/kern_resource.c == --- head/sys/kern/kern_resource.c Mon Mar 18 12:34:13 2019 (r345272) +++ head/sys/kern/kern_resource.c Mon Mar 18 12:41:42 2019 (r345273) @@ -978,7 +978,7 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struc su = (tu * st) / tt; } else { uu = mul64_by_fraction(tu, ut, tt); - su = mul64_by_fraction(tu, ut, st); + su = mul64_by_fraction(tu, st, tt); } if (tu >= ruxp->rux_tu) { ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r345138 - head/share/man/man9
On 3/14/19 11:36 PM, Rodney W. Grimes wrote: [ Charset UTF-8 unsupported, converting... ] On Thu, 14 Mar 2019 at 22:39, Rodney W. Grimes wrote: 4. There is no easy way to show "changed byte at offset 0x432 from 0xef to 0xfe" How do we represent Copyright and License in such objects? This is an issue that is totally left out of even .uu version. This is an excellent point. What I used to do for mxge firmware when I worked at Myricom was to have a shell script that created a source file with the uuencoded bits as a static array. That way, it had copyright info in the file itself. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r344817 - in head/sys: dev/e1000 net
On 3/5/19 4:06 PM, Matthew Macy wrote: This represents a misunderstanding of how defines are used. This left the option open to the user to enable the use of larger than page size buffers as it does enable better performance. Over the course of a long uptime memory can get too fragmented. However, this left it open to the end consumer. I'd like to see this reverted with perhaps a better name for the define and the addition of an explanatory comment. I'd strongly prefer that it stay removed. Since it is not hooked to an option, no user is ever going to find it. This really should have been a tunable (since it is done at ring init time, rather than rx buffer alloc time), but nobody cared enough to make it actually usable. From brief memories of performance tuning 10G adapters 14 years ago, the differences between page-sized and 9k jumbos were minimal even back then (1/3 as many mbuf alloc/free, smaller chains). So I'm not convinced that it is worth bringing back in any form. My general feeling is that the more of this code that we can remove, the better. Iflib is tricky enough that it is already challenging to reason about and maintain. Removing code which is for all intents and purposes unreachable and never tested is a Good Thing. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r344099 - head/sys/net
I think the misunderstanding here is that I think he's not getting the ifp from the route. My recollection is that he is holding the ifps when he enables HW pacing in BBR. Due to limitations in different NIC hardware, you can only have N different rates, etc. So he goes ahead and allocates those N rates up front so that he knows he can reserve them & know that he can always get them. Then when the system reboots, BBR has an eventhandler that goes ahead and frees those reservations. I think that he's using the ifp that he's holding here. In the case that tripped him up, that ifp was lagg. Your workaround would also work, but Randall does have a point about symmetric alloc/free especially when viewed from his perspective. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r343430 - head/sys/net
Author: gallatin Date: Fri Jan 25 15:02:18 2019 New Revision: 343430 URL: https://svnweb.freebsd.org/changeset/base/343430 Log: Fix an iflib driver unload panic introduced in r343085 The new loop to sync and unload descriptors was indexed by "i", rather than "j". The panic was caused by "i" being advanced rather than "j", and eventually becoming out of bounds. Reviewed by: kib MFC after:3 days Sponsored by: Netflix Modified: head/sys/net/iflib.c Modified: head/sys/net/iflib.c == --- head/sys/net/iflib.cFri Jan 25 14:46:13 2019(r343429) +++ head/sys/net/iflib.cFri Jan 25 15:02:18 2019(r343430) @@ -2197,17 +2197,17 @@ iflib_rx_sds_free(iflib_rxq_t rxq) fl = >ifr_fl[i]; if (fl->ifl_desc_tag != NULL) { if (fl->ifl_sds.ifsd_map != NULL) { - for (j = 0; j < fl->ifl_size; i++) { - if (fl->ifl_sds.ifsd_map[i] == + for (j = 0; j < fl->ifl_size; j++) { + if (fl->ifl_sds.ifsd_map[j] == NULL) - continue; + continue; bus_dmamap_sync( fl->ifl_desc_tag, - fl->ifl_sds.ifsd_map[i], + fl->ifl_sds.ifsd_map[j], BUS_DMASYNC_POSTREAD); bus_dmamap_unload( fl->ifl_desc_tag, - fl->ifl_sds.ifsd_map[i]); + fl->ifl_sds.ifsd_map[j]); } } bus_dma_tag_destroy(fl->ifl_desc_tag); ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r343269 - head/sys/dev/cxgbe
On 1/21/19 1:42 PM, Navdeep Parhar wrote: Log: cxgbe(4): Allow negative values in hw.cxgbe.fw_install and take them to Thank you! Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r343030 - in head/sys: cam conf dev/md dev/nvme fs/fuse fs/nfsclient fs/smbfs kern sys ufs/ffs vm
On 1/14/19 8:02 PM, Gleb Smirnoff wrote: Log: Allocate pager bufs from UMA instead of 80-ish mutex protected linked list. <...> Together with: gallatin Thank you so much for carrying this over the finish line! Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r342774 - head/sys/conf
Author: gallatin Date: Fri Jan 4 18:38:27 2019 New Revision: 342774 URL: https://svnweb.freebsd.org/changeset/base/342774 Log: Limit git history searches in newvers.sh newvers.sh takes upwards of 4-5 seconds to complete on trees checked out from github, due to searching the entire history for non-existent git-svn metadata. Similarly, if one does not check out notes, we again search the entire history for notes. That makes newvers.sh very slow for many github users. To fix this in a fair way, limit the history search to the last 10K commits: if you're more than 10K commits out of sync, then you've forked the project, and our SVN rev is no longer very important to you. Due to how git implements --grep in conjunction with -n, --grep has been removed for performance reasons (git does not seem to limit its search to the -n limit in this case, and takes just as long as it did with no limit). Reviewed by: emaste, imp Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D18745 Modified: head/sys/conf/newvers.sh Modified: head/sys/conf/newvers.sh == --- head/sys/conf/newvers.shFri Jan 4 18:35:25 2019(r342773) +++ head/sys/conf/newvers.shFri Jan 4 18:38:27 2019(r342774) @@ -243,11 +243,15 @@ if [ -n "$git_cmd" ] ; then svn=" r${gitsvn}" git="=${git}" else - gitsvn=`$git_cmd log --grep '^git-svn-id:' | \ +# Log searches are limited to 10k commits to speed up failures. +# We assume that if a tree is more than 10k commits out-of-sync +# with FreeBSD, it has forked the the OS and the SVN rev no +# longer matters. 
+ gitsvn=`$git_cmd log -n 1 | grep '^git-svn-id:' | head -1 | \ sed -n 's/^.*@\([0-9][0-9]*\).*$/\1/p'` if [ -z "$gitsvn" ] ; then - gitsvn=`$git_cmd log --format='format:%N' | \ + gitsvn=`$git_cmd log -n 1 --format='format:%N' | \ grep '^svn ' | head -1 | \ sed -n 's/^.*revision=\([0-9][0-9]*\).*$/\1/p'` fi ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r341578 - head/sys/dev/mlx5/mlx5_en
On 12/17/18 2:08 PM, Bruce Evans wrote: On Mon, 17 Dec 2018, Andrew Gallatin wrote: On 12/5/18 9:20 AM, Slava Shwartsman wrote: Author: slavash Date: Wed Dec 5 14:20:57 2018 New Revision: 341578 URL: https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_341578=DwIDaQ=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=BFp2c_-S0jnzRZJF2APwvTwmnmVFcyjcnBvHRZ3Locc=b7fvhOzf_b5bMVGquu4SaBhMNql5N8dVPAvpfKtz53Q= Log: mlx5en: Remove the DRBR and associated logic in the transmit path. The hardware queues are deep enough currently and using the DRBR and associated callbacks only leads to more task switching in the TX path. The is also a race setting the queue_state which can lead to hung TX rings. The point of DRBR in the tx path is not simply to provide a software ring for queuing excess packets. Rather it provides a mechanism to avoid lock contention by shoving a packet into the software ring, where it will later be found & processed, rather than blocking the caller on a mtx lock. I'm concerned you may have introduced a performance regression for use cases where you have N:1 or N:M lock contention where many threads on different cores are contending for the same tx queue. The state of the art for this is no longer DRBR, but mp_ring, as used by both cxgbe and iflib. iflib uses queuing techniques to significantly pessimize em NICs with 1 hardware queue. On fast machines, it attempts to do 1 context switch per This can happen even w/o contention when "abdicate" is enabled in mp ring. I complained about this as well, and the default was changed in mp ring to not always "abdicate" (eg, switch to the tq to handle the packet). Abdication substantially pessimizes Netflix style web uncontended workloads, but it generally helps small packet forwarding. It is interesting that you see the opposite. I should try benchmarking with just a single ring. (small) tx packet and can't keep up. 
On slow machines it has a chance of handling multiple packets per context switch, but since the machine is too slow it can't keep up and saturates at a slightly different point. Results for netblast $lanhost 5001 5 10 (5-byte payload for 10 seconds) on an I218V on Haswell 4 cores x 2 threads @4.08GHz running i386: Old results with no iflib and no EM_MULTIQUEUE except as indicated: FBSD-10 UP 1377+0 FBSD-11 UP 1326+0 FBSD-11 SMP-1 1484+0 FBSD-11 SMP-8 1395+0 FBSD-12mod SMP-1 1386+0 FBSD-12mod SMP-8 1422+0 FBSD-12mod SMP-1 1270+0 # use iflib (lose 8% performance) FBSD-12mod SMP-8 1279+0 # use iflib (lose 10% performance using more CPU) 1377+0 means 1377 kpps sent and 0 kpps errors, etc. SMP-8 means use all 8 CPUs. SMP-1 means restrict netblast to 1 CPU different from the taskqueue CPUs using cpuset. New results: FBSD-11 SMP-8 1440+0 # no iflib, no EM_MULTIQUEUE FBSD-11 SMP-8 1486+241 # no iflib, use EM_MULTIQUEUE (now saturate 1Gbps) FBSD-cur SMP-8 533+0 # use iflib, use i386 with 4G KVA iflib only decimates performance relative to the FreeBSD-11 version with no EM_MULTIQUEUE, but EM_MULTIQUEUE gives better queueing using more CPUs. This gives the extra 10-20% of performance needed to saturate the NIC and 1Gbps ethernet. The FreeBSD-current version is not directly comparable since using 4G KVA on i386 reduces performance by about a factor of 2.5 for all loads with mostly small i/o's (for 128K disk i/o's the reduction is only 10-20%). i386 ran at about the same speed as amd64 when it had 1GB KVA, but I don't have any savd results for amd64 to compare with precisely). This is all with security-related things like ibrs unavailable or turned off. All versions use normal Intel interrupt moderation which gives an interrupt rate of 8k/sec. Old versions of em use a "fast" interrupt handler and a slow switch to a taskqueue. This gives a contex switch rate of about 16k/ sec. 
In the SMP case, netblast normally runs on another CPU and I think it fills h/w tx queue(s) synchronously, and the taskqueue only does minor cleanups. Old em also has a ping latency of about 10% smaller than with iflib (73 usec instead of 80 usec after setting em.x.itr to 0 and other tuning to kill interrupt moderation, and similar for a bge NIC on the other end). The synchronous queue filling probably improves latency, but it is hard to see how it makes a difference of more than 1 usec. 73 is already too high. An old PRO1000 Intel NIC has a latency of only 50 usec on the same network. The switch costs about 20 usec of this. iflib uses taskqueue more. netblast normally runs on another CPU and I think it only fills s/w tx queue(s) synchronously, and wakes up the taskqueues for every packet. The CPUs are almost fast enough to keep up, and the system does about 1M context switches for this (in versions other than i386 with 4G KVA). That is slightly mor
Re: svn commit: r341578 - head/sys/dev/mlx5/mlx5_en
On 12/5/18 9:20 AM, Slava Shwartsman wrote: Author: slavash Date: Wed Dec 5 14:20:57 2018 New Revision: 341578 URL: https://svnweb.freebsd.org/changeset/base/341578 Log: mlx5en: Remove the DRBR and associated logic in the transmit path. The hardware queues are deep enough currently and using the DRBR and associated callbacks only leads to more task switching in the TX path. There is also a race setting the queue_state which can lead to hung TX rings. The point of DRBR in the tx path is not simply to provide a software ring for queuing excess packets. Rather it provides a mechanism to avoid lock contention by shoving a packet into the software ring, where it will later be found & processed, rather than blocking the caller on a mtx lock. I'm concerned you may have introduced a performance regression for use cases where you have N:1 or N:M lock contention where many threads on different cores are contending for the same tx queue. The state of the art for this is no longer DRBR, but mp_ring, as used by both cxgbe and iflib. For well behaved workloads (like Netflix's), I don't anticipate this being a performance issue. However, I worry that this will impact other workloads and that you should consider running some testing of N:1 contention. Eg, 128 netperfs running in parallel with only a few nic tx rings. Sorry for the late reply.. I'm behind on my -committers email. If you have not already MFC'ed this, you may want to reconsider. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r341095 - head/sys/net
Author: gallatin Date: Tue Nov 27 20:01:05 2018 New Revision: 341095 URL: https://svnweb.freebsd.org/changeset/base/341095 Log: Use busdma unconditionally in iflib - Remove the complex mechanism to choose between using busdma and raw pmap_kextract at runtime. The reduced complexity makes the code easier to read and maintain. - Fix a bug in the small packet receive path where clusters were repeatedly mapped but never unmapped. We now store the cluster's bus address and avoid re-mapping the cluster each time a small packet is received. This patch fixes bugs I've seen where ixl(4) will not even respond to ping without seeing DMAR faults. I see a small improvement (14%) on packet forwarding tests using a Haswell based Xeon E5-2697 v3. Olivier sees a small regression (-3% to -6%) with lower end hardware. Reviewed by: mmacy Not objected to by: sbruno MFC after:8 weeks Sponsored by: Netflix, Inc Differential Revision:https://reviews.freebsd.org/D17901 Modified: head/sys/net/iflib.c head/sys/net/iflib_private.h Modified: head/sys/net/iflib.c == --- head/sys/net/iflib.cTue Nov 27 19:50:58 2018(r341094) +++ head/sys/net/iflib.cTue Nov 27 20:01:05 2018(r341095) @@ -92,15 +92,6 @@ __FBSDID("$FreeBSD$"); #include "ifdi_if.h" -#if defined(__i386__) || defined(__amd64__) -#include -#include -#include -#include -#include -#include -#endif - #ifdef PCI_IOV #include #endif @@ -282,24 +273,16 @@ iflib_get_sctx(if_ctx_t ctx) #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP) #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF) -#define RX_SW_DESC_MAP_CREATED (1 << 0) -#define TX_SW_DESC_MAP_CREATED (1 << 1) -#define RX_SW_DESC_INUSE(1 << 3) -#define TX_SW_DESC_MAPPED (1 << 4) - -#defineM_TOOBIGM_PROTO1 - typedef struct iflib_sw_rx_desc_array { bus_dmamap_t*ifsd_map; /* bus_dma maps for packet */ struct mbuf **ifsd_m; /* pkthdr mbufs */ caddr_t *ifsd_cl; /* direct cluster pointer for rx */ - uint8_t *ifsd_flags; + bus_addr_t *ifsd_ba; /* bus addr of cluster 
for rx */ } iflib_rxsd_array_t; typedef struct iflib_sw_tx_desc_array { bus_dmamap_t*ifsd_map; /* bus_dma maps for packet */ struct mbuf**ifsd_m; /* pkthdr mbufs */ - uint8_t *ifsd_flags; } if_txsd_vec_t; @@ -940,9 +923,8 @@ iflib_netmap_txsync(struct netmap_kring *kring, int fl if_ctx_t ctx = ifp->if_softc; iflib_txq_t txq = >ifc_txqs[kring->ring_id]; - if (txq->ift_sds.ifsd_map) - bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* @@ -1024,9 +1006,8 @@ iflib_netmap_txsync(struct netmap_kring *kring, int fl kring->nr_hwcur = nm_i; /* synchronize the NIC ring */ - if (txq->ift_sds.ifsd_map) - bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* (re)start the tx unit up to slot nic_i (excluded) */ ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i); @@ -1129,9 +1110,8 @@ iflib_netmap_rxsync(struct netmap_kring *kring, int fl error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, ); ring->slot[nm_i].len = error ? 0 : ri.iri_len - crclen; ring->slot[nm_i].flags = 0; - if (fl->ifl_sds.ifsd_map) - bus_dmamap_sync(fl->ifl_ifdi->idi_tag, - fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD); + bus_dmamap_sync(fl->ifl_ifdi->idi_tag, + fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } @@ -1210,9 +1190,6 @@ iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq) slot = netmap_reset(na, NR_TX, txq->ift_id, 0); if (slot == NULL) return; - if (txq->ift_sds.ifsd_map == NULL) - return; - for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) { /* @@
Re: svn commit: r340097 - in head/sys: kern sys
On 11/2/18 11:43 PM, Matt Macy wrote: Author: mmacy Date: Sat Nov 3 03:43:32 2018 New Revision: 340097 URL: https://svnweb.freebsd.org/changeset/base/340097 Log: Convert epoch to read / write records per cpu In discussing D17503 "Run epoch calls sooner and more reliably" with sbahra@ we came to the conclusion that epoch is currently misusing the ck_epoch API. It isn't safe to do a "write side" operation (ck_epoch_call or ck_epoch_poll) in the middle of a "read side" section. Since, by definition, it's possible to be preempted during the middle of an EPOCH_PREEMPT epoch the GC task might call ck_epoch_poll or another thread might call ck_epoch_call on the same section. The right solution is ultimately to change the way that ck_epoch works for this use case. However, as a stopgap for 12 we agreed to simply have separate records for each use case. Tested by: pho@ MFC after: 3 days Hi Matt, Can you elaborate why this is needed? I seem to recall that Samy Al Bahra made some upstream changes to CK that modified the CK API to legitimize our use of the API, and these were brought into FreeBSD in r339375. Were these insufficient? Also, it would be great if you could get review on epoch changes. Epoch is totally awesome, and I'm thrilled that you brought it in. However, it is very tricky, and it seems like changes here could benefit from review. Thanks, Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r339043 - in head/sys: kern vm x86/acpica
On 10/1/18 10:14 AM, Andrew Gallatin wrote: Author: gallatin Date: Mon Oct 1 14:14:21 2018 New Revision: 339043 URL: https://svnweb.freebsd.org/changeset/base/339043 Log: Allow empty NUMA memory domains to support Threadripper2 The AMD Threadripper 2990WX is basically a slightly crippled Epyc. Rather than having 4 memory controllers, one per NUMA domain, it has only 2 memory controllers enabled. This means that only 2 of the 4 NUMA domains can be populated with physical memory, and the others are empty. Add support to FreeBSD for empty NUMA domains by: - creating empty memory domains when parsing the SRAT table, rather than failing to parse the table - not running the pageout daemon threads in empty domains - adding defensive code to UMA to avoid allocating from empty domains - adding defensive code to cpuset to avoid binding to an empty domain Thanks to Jeff for suggesting this strategy. Reviewed by: alc, markj Approved by: re (gjb@) Differential Revision: https://reviews.freebsd.org/D1683 Whoops, cut-and-paste error. The Differential Revision should have been: https://reviews.freebsd.org/D16836 Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r339043 - in head/sys: kern vm x86/acpica
Author: gallatin Date: Mon Oct 1 14:14:21 2018 New Revision: 339043 URL: https://svnweb.freebsd.org/changeset/base/339043 Log: Allow empty NUMA memory domains to support Threadripper2 The AMD Threadripper 2990WX is basically a slightly crippled Epyc. Rather than having 4 memory controllers, one per NUMA domain, it has only 2 memory controllers enabled. This means that only 2 of the 4 NUMA domains can be populated with physical memory, and the others are empty. Add support to FreeBSD for empty NUMA domains by: - creating empty memory domains when parsing the SRAT table, rather than failing to parse the table - not running the pageout deamon threads in empty domains - adding defensive code to UMA to avoid allocating from empty domains - adding defensive code to cpuset to avoid binding to an empty domain Thanks to Jeff for suggesting this strategy. Reviewed by: alc, markj Approved by: re (gjb@) Differential Revision:https://reviews.freebsd.org/D1683 Modified: head/sys/kern/kern_cpuset.c head/sys/vm/uma_core.c head/sys/vm/vm_kern.c head/sys/vm/vm_pageout.c head/sys/vm/vm_pagequeue.h head/sys/x86/acpica/srat.c Modified: head/sys/kern/kern_cpuset.c == --- head/sys/kern/kern_cpuset.c Mon Oct 1 14:05:31 2018(r339042) +++ head/sys/kern/kern_cpuset.c Mon Oct 1 14:14:21 2018(r339043) @@ -65,7 +65,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include +#include +#include +#include #ifdef DDB #include @@ -479,6 +484,26 @@ _domainset_create(struct domainset *domain, struct dom } /* + * Are any of the domains in the mask empty? If so, silently + * remove them. If only empty domains are present, we must + * return failure. 
+ */ +static bool +domainset_empty_vm(struct domainset *domain) +{ + int i, max; + + max = DOMAINSET_FLS(>ds_mask) + 1; + for (i = 0; i < max; i++) { + if (DOMAINSET_ISSET(i, >ds_mask) && + VM_DOMAIN_EMPTY(i)) + DOMAINSET_CLR(i, >ds_mask); + } + + return (DOMAINSET_EMPTY(>ds_mask)); +} + +/* * Create or lookup a domainset based on the key held in 'domain'. */ struct domainset * @@ -1360,6 +1385,7 @@ domainset_zero(void) DOMAINSET_SET(i, >ds_mask); dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH; dset->ds_prefer = -1; + (void)domainset_empty_vm(dset); curthread->td_domain.dr_policy = _domainset_create(dset, NULL); domainset_copy(dset, ); @@ -2086,6 +2112,13 @@ kern_cpuset_setdomain(struct thread *td, cpulevel_t le /* This will be constrained by domainset_shadow(). */ DOMAINSET_FILL(_mask); } + + /* +* When given an impossible policy, fall back to interleaving +* across all domains +*/ + if (domainset_empty_vm()) + domainset_copy(, ); switch (level) { case CPU_LEVEL_ROOT: Modified: head/sys/vm/uma_core.c == --- head/sys/vm/uma_core.c Mon Oct 1 14:05:31 2018(r339042) +++ head/sys/vm/uma_core.c Mon Oct 1 14:14:21 2018(r339043) @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -2469,9 +2470,11 @@ zalloc_start: if (bucket != NULL) bucket_free(zone, bucket, udata); - if (zone->uz_flags & UMA_ZONE_NUMA) + if (zone->uz_flags & UMA_ZONE_NUMA) { domain = PCPU_GET(domain); - else + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } else domain = UMA_ANYDOMAIN; /* Short-circuit for zones without buckets and low memory. 
*/ @@ -2647,7 +2650,11 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdo rdomain = 0; rr = rdomain == UMA_ANYDOMAIN; if (rr) { - keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + start = keg->uk_cursor; + do { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = keg->uk_cursor; + } while (VM_DOMAIN_EMPTY(domain) && domain != start); domain = start = keg->uk_cursor; /* Only block on the second pass. */ if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK) @@ -2698,8 +2705,11 @@ again: LIST_INSERT_HEAD(>ud_part_slab, slab, us_link); return (slab); } - if (rr) - domain = (domain + 1) % vm_ndomains; + if (rr) { + do { + domain = (domain + 1) % vm_ndomains; +
svn commit: r338341 - head/sys/netinet6
Author: gallatin Date: Mon Aug 27 18:13:20 2018 New Revision: 338341 URL: https://svnweb.freebsd.org/changeset/base/338341 Log: Reject IPv4 SO_REUSEPORT_LB groups when looking up an IPv6 listening socket Similar to how the IPv4 code will reject an IPv6 LB group, we must ignore IPv4 LB groups when looking up an IPv6 listening socket. If this is not done, a port only match may return an IPv4 socket, which causes problems (like sending IPv6 packets with a hopcount of 0, making them unrouteable). Thanks to rrs for all the work to diagnose this. Approved by: re (rgrimes) Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D16899 Modified: head/sys/netinet6/in6_pcb.c Modified: head/sys/netinet6/in6_pcb.c == --- head/sys/netinet6/in6_pcb.c Mon Aug 27 15:20:42 2018(r338340) +++ head/sys/netinet6/in6_pcb.c Mon Aug 27 18:13:20 2018(r338341) @@ -901,6 +901,10 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, * - Load balanced does not contain IPv4 mapped INET6 wild sockets. */ LIST_FOREACH(grp, hdr, il_list) { +#ifdef INET + if (!(grp->il_vflag & INP_IPV6)) + continue; +#endif if (grp->il_lport == lport) { idx = 0; int pkt_hash = INP_PCBLBGROUP_PKTHASH( ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r337709 - head/sys/net
It could probably be MFCed if somebody could verify that it causes no harm in 11. I have no way to test lagg/lacp on 11, so I did not mark it for MFC. Drew On 8/13/18 9:58 PM, Kubilay Kocak wrote: On 14/08/2018 12:13 am, Andrew Gallatin wrote: Author: gallatin Date: Mon Aug 13 14:13:25 2018 New Revision: 337709 URL: https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_337709=DwICaQ=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=1cWDZxkgrPXh1V368in81GXeCx7nVoSbXY9khM0W2r0=psymcTlMReu-E0h72SEySpgbsxzW7KxTwtQDKS8ocBY= Log: lagg: allow lacp to manage the link state Lacp needs to manage the link state itself. Unlike other lagg protocols, the ability of lacp to pass traffic depends not only on the lagg members having link, but also on the lacp protocol converging to a distributing state with the link partner. If we prematurely mark the link as up, then we will send a gratuitous arp (via arp_handle_ifllchange()) before the lacp interface is capable of passing traffic. When this happens, the gratuitous arp is lost, and our link partner may cache a stale mac address (eg, when the base mac address for the lagg bundle changes, due to a BIOS change re-ordering NIC unit numbers) Hi Andrew Can this be MFC'd? Reviewed by: jtl, hselasky Sponsored by: Netflix Modified: head/sys/net/ieee8023ad_lacp.c head/sys/net/if_lagg.c Modified: head/sys/net/ieee8023ad_lacp.c == --- head/sys/net/ieee8023ad_lacp.c Mon Aug 13 13:58:45 2018 (r337708) +++ head/sys/net/ieee8023ad_lacp.c Mon Aug 13 14:13:25 2018 (r337709) @@ -711,6 +711,8 @@ lacp_disable_distributing(struct lacp_port *lp) } lp->lp_state &= ~LACP_STATE_DISTRIBUTING; + if_link_state_change(sc->sc_ifp, + sc->sc_active ? 
LINK_STATE_UP : LINK_STATE_DOWN); } static void @@ -745,6 +747,9 @@ lacp_enable_distributing(struct lacp_port *lp) } else /* try to become the active aggregator */ lacp_select_active_aggregator(lsc); + + if_link_state_change(sc->sc_ifp, + sc->sc_active ? LINK_STATE_UP : LINK_STATE_DOWN); } static void Modified: head/sys/net/if_lagg.c == --- head/sys/net/if_lagg.c Mon Aug 13 13:58:45 2018(r337708) +++ head/sys/net/if_lagg.c Mon Aug 13 14:13:25 2018(r337709) @@ -1737,6 +1737,10 @@ lagg_linkstate(struct lagg_softc *sc) LAGG_XLOCK_ASSERT(sc); + /* LACP handles link state itself */ + if (sc->sc_proto == LAGG_PROTO_LACP) + return; + /* Our link is considered up if at least one of our ports is active */ LAGG_RLOCK(); CK_SLIST_FOREACH(lp, >sc_ports, lp_entries) { ___ svn-src-head@freebsd.org mailing list https://urldefense.proofpoint.com/v2/url?u=https-3A__lists.freebsd.org_mailman_listinfo_svn-2Dsrc-2Dhead=DwICaQ=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=1cWDZxkgrPXh1V368in81GXeCx7nVoSbXY9khM0W2r0=SLnmQNpAX0j6HgJ5_yIcrQJAf9xCWtNqoEJ2qbOy7_E= To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org" ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r337709 - head/sys/net
Author: gallatin Date: Mon Aug 13 14:13:25 2018 New Revision: 337709 URL: https://svnweb.freebsd.org/changeset/base/337709 Log: lagg: allow lacp to manage the link state Lacp needs to manage the link state itself. Unlike other lagg protocols, the ability of lacp to pass traffic depends not only on the lagg members having link, but also on the lacp protocol converging to a distributing state with the link partner. If we prematurely mark the link as up, then we will send a gratuitous arp (via arp_handle_ifllchange()) before the lacp interface is capable of passing traffic. When this happens, the gratuitous arp is lost, and our link partner may cache a stale mac address (eg, when the base mac address for the lagg bundle changes, due to a BIOS change re-ordering NIC unit numbers) Reviewed by: jtl, hselasky Sponsored by: Netflix Modified: head/sys/net/ieee8023ad_lacp.c head/sys/net/if_lagg.c Modified: head/sys/net/ieee8023ad_lacp.c == --- head/sys/net/ieee8023ad_lacp.c Mon Aug 13 13:58:45 2018 (r337708) +++ head/sys/net/ieee8023ad_lacp.c Mon Aug 13 14:13:25 2018 (r337709) @@ -711,6 +711,8 @@ lacp_disable_distributing(struct lacp_port *lp) } lp->lp_state &= ~LACP_STATE_DISTRIBUTING; + if_link_state_change(sc->sc_ifp, + sc->sc_active ? LINK_STATE_UP : LINK_STATE_DOWN); } static void @@ -745,6 +747,9 @@ lacp_enable_distributing(struct lacp_port *lp) } else /* try to become the active aggregator */ lacp_select_active_aggregator(lsc); + + if_link_state_change(sc->sc_ifp, + sc->sc_active ? 
LINK_STATE_UP : LINK_STATE_DOWN); } static void Modified: head/sys/net/if_lagg.c == --- head/sys/net/if_lagg.c Mon Aug 13 13:58:45 2018(r337708) +++ head/sys/net/if_lagg.c Mon Aug 13 14:13:25 2018(r337709) @@ -1737,6 +1737,10 @@ lagg_linkstate(struct lagg_softc *sc) LAGG_XLOCK_ASSERT(sc); + /* LACP handles link state itself */ + if (sc->sc_proto == LAGG_PROTO_LACP) + return; + /* Our link is considered up if at least one of our ports is active */ LAGG_RLOCK(); CK_SLIST_FOREACH(lp, >sc_ports, lp_entries) { ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r335916 - head/sys/conf
On 07/05/18 19:59, John Baldwin wrote: You misunderstand. /usr/local/sys/modules would hold module sources so that they can be recompiled when building a kernel without having to rebuild the package or reinstall the package. Binary modules would continue to be installed in /boot/modules. This is very similar to the approach that many Linux distributions take with DKMS. The kernel sources for out-of-tree modules are kept around, and every time a kernel is installed, its new header files are used to re-compile the out-of-tree module. Similarly, when you install a package containing a kernel module, it is re-compiled and installed for every installed kernel. One thing that was tangentially brought up is that the ability to compile out-of-tree modules requires keeping the kernel-headers around. So we may need to identify all the headers that a module might need, and install them in /boot/$KERNEL/sys or some-such. This would be needed if, for example, we wanted to install a new Nvidia or Virtual Box module and have it work for older installed kernel versions too (eg, across ABI breaking changes in -current). This would certainly make life easier for people running -current. This system works quite well on Linux. For comparison, I used an Ubuntu based desktop with Nvidia graphics at a previous employer, and a FreeBSD-current desktop w/Nvidia graphics now. I've been left w/o graphics accidentally much more often on FreeBSD than I ever had been on Ubuntu, even when compiling my own kernels from git. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r336042 - head/sys/dev/cxgbe/common
On 07/06/18 15:33, Navdeep Parhar wrote: Log: cxgbe(4): Assume that any unknown flash on the card is 4MB and has 64KB sectors, instead of refusing to attach to the card. Thank you! Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r335967 - head/sys/dev/mxge
On 07/05/18 17:14, Rick Macklem wrote: Andrew Gallatin wrote: On 7/4/18 9:20 PM, Rodney W. Grimes wrote: [stuff snipped] It is using a magic constant twice, where one has a derived value that is dependent on the value of the other. That is bad and error prone and does not document that one depends on the other. Please fix this. Or at least make 65536 a #define so that it only needs changed one place and clearly shows the interdependence of these values. To me, 65536 is one of the few cases where the magic number is more meaningful than a name. But fine, if you feel that strongly about it, I'll change it for you. Btw, in general, if_hw_tsomax and if_hw_tsomaxsegsize are not related or the same value. It just happens that they both appear to be related to 64K in this case. (I believe this is fairly common, since the original Microsoft "standard" used 64K as a limit, since it was stored in 16bits.) Yes; exactly. if_hw_tsomax is the maximum size of the entire TSO segment, including MAC level headers (commonly 64K, due to Microsoft... but could be larger if the hardware guys chose to do so). Given that we do TSO like Linux, and not like MS (meaning we express the size of the pre-segmented packet using a 16-bit value in the IPv4/IPv6 header), supporting more than 64K is not possible in FreeBSD, so I'm basically saying "nerf this constraint". MS windows does it better / different; they express the size of the pre-segmented packet in packet metadata, leaving ip->ip_len = 0. This is better, since then the pseudo hdr checksum in the template header can be re-used (with the len added) for every segment by the NIC. If you've ever seen a driver set ip->ip_len = 0, and re-calc the pseudo-hdr checksum, that's why. This is also why MS LSOv2 can support TSO of packets larger than 64K, since they're not constrained by the 16-bit value in the IP{4,6} header. The value of TSO larger than 64K is questionable at best though. 
Without pacing, you'd just get more packets dropped when talking across the internet.. if_hw_tsomaxsegsize is the maximum size of contiguous memory that a "chunk" of the TSO segment can be stored in for handling by the driver's transmit side. Since higher And this is what I object to. TCP should not care about this. Drivers should use busdma, or otherwise be capable of chopping large contig regions down to chunks that they can handle. If a driver can really only handle 2K, then it should be having busdma give it an s/g list that is 2x as long, not having TCP call m_dupcl() 2x as often on page-sized data generated by sendfile (or more on non-x86 with larger pages). level code such as NFS (and iSCSI, I think?) uses MCLBYTE clusters, anything 2K or higher normally works the same. Not sure about sosend(), but I think it also copies the data into MCLBYTE clusters? This would change if someday jumbo mbuf clusters become the norm. (I tried changing the NFS code to use jumbo clusters, but it would result in fragmentation of the memory used for mbuf cluster allocation, so I never committed it.) At least for sendfile(), vm pages are wrapped up and attached to mbufs, so you have 4K (and potentially much more on non-x86). Doesn't NFS do something similar when sending data, or do you copy into clusters? I have changes which I have not upstreamed yet which enhance mbufs to carry TLS metadata & vector of physical addresses (which I call unmapped mbufs) for sendfile and kernel TLS. As part of that, sosend (for kTLS) can allocate many pages and attach them to one mbuf. The idea (for kTLS) is that you can keep an entire TLS record (with framing information) in a single unmapped mbuf, which saves a huge amount of CPU which would be lost to cache misses doing pointer-chasing of really long mbuf chains (TLS hdrs and trailers are generally 13 and 16 bytes). The goal was to regain CPU during Netflix's transition to https streaming. 
However, it is unintentionally quite helpful on i386, since it reduces overhead from having to map/unmap sf_bufs. FWIW, these mbufs have been in production at Netflix for over a year, and carry a large fraction of the world's internet traffic :) rick ps: And I'll admit I don't find 65536 very magic;-) :) Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r335973 - head/sys/dev/mxge
Author: gallatin Date: Thu Jul 5 02:43:10 2018 New Revision: 335973 URL: https://svnweb.freebsd.org/changeset/base/335973 Log: mxge: replace 65536 with IP_MAXPACKET in tso settings. Modified: head/sys/dev/mxge/if_mxge.c Modified: head/sys/dev/mxge/if_mxge.c == --- head/sys/dev/mxge/if_mxge.c Thu Jul 5 02:08:57 2018(r335972) +++ head/sys/dev/mxge/if_mxge.c Thu Jul 5 02:43:10 2018(r335973) @@ -4984,9 +4984,9 @@ mxge_attach(device_t dev) ifp->if_ioctl = mxge_ioctl; ifp->if_start = mxge_start; ifp->if_get_counter = mxge_get_counter; - ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc; - ifp->if_hw_tsomaxsegsize = 65536; + ifp->if_hw_tsomaxsegsize = IP_MAXPACKET; /* Initialise the ifmedia structure */ ifmedia_init(>media, 0, mxge_media_change, mxge_media_status); ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r335967 - head/sys/dev/mxge
On 7/4/18 9:20 PM, Rodney W. Grimes wrote: On 07/04/18 15:46, Rodney W. Grimes wrote: Author: gallatin Date: Wed Jul 4 19:29:06 2018 New Revision: 335967 URL: https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_335967=DwICAg=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=2rIiw5AUJ2ishkBkygGMa_9kr0LJOaonX8ni3BF2BHk=MwCt6_IgNah0XklsYThsXFcwZD54Xl78TRlnFXJ4zWs= Log: mxge: choose appropriate values for hw tso Modified: head/sys/dev/mxge/if_mxge.c Modified: head/sys/dev/mxge/if_mxge.c == --- head/sys/dev/mxge/if_mxge.c Wed Jul 4 18:54:44 2018(r335966) +++ head/sys/dev/mxge/if_mxge.c Wed Jul 4 19:29:06 2018(r335967) @@ -4984,6 +4984,9 @@ mxge_attach(device_t dev) ifp->if_ioctl = mxge_ioctl; ifp->if_start = mxge_start; ifp->if_get_counter = mxge_get_counter; + ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); Would not this be more accurate (need to reorder assigns): ifp->if_hw_tsomax = ifp->if_hw_tsomaxsegsize - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc; + ifp->if_hw_tsomaxsegsize = 65536; It seems simpler as-is to me. It is using a magic constant twice, where one has a derived value that is dependent on the value of the other. That is bad and error prone and does not document that one depends on the other. Please fix this. Or at least make 65536 a #define so that it only needs changed one place and clearly shows the interdependence of these values. To me, 65536 is one of the few cases where the magic number is more meaningful than a name. But fine, if you feel that strongly about it, I'll change it for you. ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r335967 - head/sys/dev/mxge
On 07/04/18 15:46, Rodney W. Grimes wrote: Author: gallatin Date: Wed Jul 4 19:29:06 2018 New Revision: 335967 URL: https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_335967=DwICAg=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=2rIiw5AUJ2ishkBkygGMa_9kr0LJOaonX8ni3BF2BHk=MwCt6_IgNah0XklsYThsXFcwZD54Xl78TRlnFXJ4zWs= Log: mxge: choose appropriate values for hw tso Modified: head/sys/dev/mxge/if_mxge.c Modified: head/sys/dev/mxge/if_mxge.c == --- head/sys/dev/mxge/if_mxge.c Wed Jul 4 18:54:44 2018(r335966) +++ head/sys/dev/mxge/if_mxge.c Wed Jul 4 19:29:06 2018(r335967) @@ -4984,6 +4984,9 @@ mxge_attach(device_t dev) ifp->if_ioctl = mxge_ioctl; ifp->if_start = mxge_start; ifp->if_get_counter = mxge_get_counter; + ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); Would not this be more accurate (need to reorder assigns): ifp->if_hw_tsomax = ifp->if_hw_tsomaxsegsize - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc; + ifp->if_hw_tsomaxsegsize = 65536; It seems simpler as-is to me. Looking around at other drivers, I see at least one (cxgbe) which does the same thing. After doing the grep, I'm more concerned with drivers which may be setting their tsomaxsegsize incorrectly to be too small and hurting their performance by causing TCP to chop needlessly at smaller boundaries which are already enforced by their busdma tags. PAGE_SIZE, which seems to be the common mistaken size, won't hurt too much I suppose. But the default of 2K is probably not very good. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r335967 - head/sys/dev/mxge
Author: gallatin Date: Wed Jul 4 19:29:06 2018 New Revision: 335967 URL: https://svnweb.freebsd.org/changeset/base/335967 Log: mxge: choose appropriate values for hw tso Modified: head/sys/dev/mxge/if_mxge.c Modified: head/sys/dev/mxge/if_mxge.c == --- head/sys/dev/mxge/if_mxge.c Wed Jul 4 18:54:44 2018(r335966) +++ head/sys/dev/mxge/if_mxge.c Wed Jul 4 19:29:06 2018(r335967) @@ -4984,6 +4984,9 @@ mxge_attach(device_t dev) ifp->if_ioctl = mxge_ioctl; ifp->if_start = mxge_start; ifp->if_get_counter = mxge_get_counter; + ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc; + ifp->if_hw_tsomaxsegsize = 65536; /* Initialise the ifmedia structure */ ifmedia_init(>media, 0, mxge_media_change, mxge_media_status); ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r335966 - head/sys/dev/mxge
On 07/04/18 14:54, Andrew Gallatin wrote: mxge: Add SIOCGI2C support for devices with SFP/XFP cages Note that I do not have any XFP devices to test with, only SFP and CX4. If this causes problems for XFP devices, I can restrict SIOCGI2C support to just SFP if needed. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r335966 - head/sys/dev/mxge
Author: gallatin Date: Wed Jul 4 18:54:44 2018 New Revision: 335966 URL: https://svnweb.freebsd.org/changeset/base/335966 Log: mxge: Add SIOCGI2C support for devices with SFP/XFP cages Modified: head/sys/dev/mxge/if_mxge.c Modified: head/sys/dev/mxge/if_mxge.c == --- head/sys/dev/mxge/if_mxge.c Wed Jul 4 18:03:19 2018(r335965) +++ head/sys/dev/mxge/if_mxge.c Wed Jul 4 18:54:44 2018(r335966) @@ -4154,10 +4154,50 @@ mxge_media_status(struct ifnet *ifp, struct ifmediareq } static int +mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c) +{ + mxge_cmd_t cmd; + uint32_t i2c_args; + int i, ms, err; + + + if (i2c->dev_addr != 0xA0 && + i2c->dev_addr != 0xA2) + return (EINVAL); + if (i2c->len > sizeof(i2c->data)) + return (EINVAL); + + for (i = 0; i < i2c->len; i++) { + i2c_args = i2c->dev_addr << 0x8; + i2c_args |= i2c->offset + i; + cmd.data0 = 0; /* just fetch 1 byte, not all 256 */ + cmd.data1 = i2c_args; + err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, ); + + if (err != MXGEFW_CMD_OK) + return (EIO); + /* now we wait for the data to be cached */ + cmd.data0 = i2c_args & 0xff; + err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, ); + for (ms = 0; (err == EBUSY) && (ms < 50); ms++) { + cmd.data0 = i2c_args & 0xff; + err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, ); + if (err == EBUSY) + DELAY(1000); + } + if (err != MXGEFW_CMD_OK) + return (EIO); + i2c->data[i] = cmd.data0; + } + return (0); +} + +static int mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { mxge_softc_t *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; + struct ifi2creq i2c; int err, mask; err = 0; @@ -4292,6 +4332,26 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t >media, command); break; + case SIOCGI2C: + if (sc->connector != MXGE_XFP && + sc->connector != MXGE_SFP) { + err = ENXIO; + break; + } + err = copyin(ifr_data_get_ptr(ifr), , sizeof(i2c)); + if (err != 0) + break; + mtx_lock(>driver_mtx); + if (sc->dying) { + mtx_unlock(>driver_mtx); + return (EINVAL); + } + err = 
mxge_fetch_i2c(sc, ); + mtx_unlock(>driver_mtx); + if (err == 0) + err = copyout(, ifr->ifr_ifru.ifru_data, + sizeof(i2c)); + break; default: err = ether_ioctl(ifp, command, data); break; ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r335957 - head/sys/dev/mxge
Author: gallatin Date: Wed Jul 4 14:25:38 2018 New Revision: 335957 URL: https://svnweb.freebsd.org/changeset/base/335957 Log: mxge: fix panic at module unload r333175 (multicast changes) exposed a bug where mxge was not checking to see if the driver was being unloaded while handling ioctls that touch hardware. As a result, now that in6m_disconnect() is run from an async gtaskq, it was busy-waiting in mxge_send_cmd() while the mcast list was destroyed. Modified: head/sys/dev/mxge/if_mxge.c Modified: head/sys/dev/mxge/if_mxge.c == --- head/sys/dev/mxge/if_mxge.c Wed Jul 4 14:20:19 2018(r335956) +++ head/sys/dev/mxge/if_mxge.c Wed Jul 4 14:25:38 2018(r335957) @@ -4193,6 +4193,10 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t case SIOCADDMULTI: case SIOCDELMULTI: mtx_lock(>driver_mtx); + if (sc->dying) { + mtx_unlock(>driver_mtx); + return (EINVAL); + } mxge_set_multicast_list(sc); mtx_unlock(>driver_mtx); break; @@ -4278,6 +4282,10 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t case SIOCGIFMEDIA: mtx_lock(>driver_mtx); + if (sc->dying) { + mtx_unlock(>driver_mtx); + return (EINVAL); + } mxge_media_probe(sc); mtx_unlock(>driver_mtx); err = ifmedia_ioctl(ifp, (struct ifreq *)data, ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r334143 - head/sys/dev/cxgbe
On 05/24/18 06:18, Navdeep Parhar wrote: Log: cxgbe(4): Data path for rate-limited tx. This is hardware support for the SO_MAX_PACING_RATE sockopt (see setsockopt(2)), which is available in kernels built with "options RATELIMIT". Relnotes: Yes Sponsored by:Chelsio Communications Hurray! Thanks so much for supporting this! Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r333793 - head/usr.sbin/pmcannotate
Author: gallatin Date: Fri May 18 14:14:04 2018 New Revision: 333793 URL: https://svnweb.freebsd.org/changeset/base/333793 Log: Teach pmcannotate about $TMPDIR and _PATH_TMP Convert pmcannotate to using $TMPDIR and _PATH_TMP rather than hard coding /tmp for temporary files. Pmcannotate sometimes needs quite a lot of space to store the output from objdump, and will fail in odd ways if that output is truncated due to lack of space in /tmp. Reviewed by: jtl Sponsored by: Netflix Modified: head/usr.sbin/pmcannotate/pmcannotate.c Modified: head/usr.sbin/pmcannotate/pmcannotate.c == --- head/usr.sbin/pmcannotate/pmcannotate.c Fri May 18 13:49:12 2018 (r333792) +++ head/usr.sbin/pmcannotate/pmcannotate.c Fri May 18 14:14:04 2018 (r333793) @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -47,7 +48,7 @@ __FBSDID("$FreeBSD$"); #defineFNBUFF 512 #defineLNBUFF 512 -#defineTMPPATH "/tmp/pmcannotate.XX" +#defineTMPNAME "pmcannotate.XX" #defineFATAL(ptr, x ...) 
do { \ fqueue_deleteall(); \ @@ -671,7 +672,8 @@ usage(const char *progname) int main(int argc, char *argv[]) { - char buffer[LNBUFF], fname[FNBUFF], tbfl[] = TMPPATH, tofl[] = TMPPATH; + char buffer[LNBUFF], fname[FNBUFF]; + char *tbfl, *tofl, *tmpdir; char tmpf[MAXPATHLEN * 2 + 50]; float limit; char *bin, *exec, *kfile, *ofile; @@ -721,6 +723,17 @@ main(int argc, char *argv[]) exec); bzero(tmpf, sizeof(tmpf)); + tmpdir = getenv("TMPDIR"); + if (tmpdir == NULL) { + asprintf(, "%s/%s", _PATH_TMP, TMPNAME); + asprintf(, "%s/%s", _PATH_TMP, TMPNAME); + } else { + asprintf(, "%s/%s", tmpdir, TMPNAME); + asprintf(, "%s/%s", tmpdir, TMPNAME); + } + if (tofl == NULL || tbfl == NULL) + FATAL(exec, "%s: Cannot create tempfile templates\n", + exec); if (mkstemp(tofl) == -1) FATAL(exec, "%s: Impossible to create the tmp file\n", exec); ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r333703 - head/sys/vm
On 05/17/18 14:07, Mark Johnston wrote: On Thu, May 17, 2018 at 10:07:34AM -0700, Conrad Meyer wrote: On Wed, May 16, 2018 at 9:27 PM, Mark Johnston wrote: Author: markj Date: Thu May 17 04:27:08 2018 New Revision: 333703 URL: https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_333703=DwIBAg=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=6lhtci2MYxtyrK5Ub70QC0DcEiQ77Ry2LTAb6cDtW5A=z0SOGvNGORjI-SySfy-aovuyFzy_K5CtCfbNeWbRGLA= Log: Fix a race in vm_page_pagequeue_lockptr(). The value of m->queue must be cached after comparing it with PQ_NONE, since it may be concurrently changing. Reported by: glebius What were the symptoms of this issue? The test plan in the linked phabricator revision says: "Gleb reported seeing panics as a result of the use of a bogus index into the pagequeue array, and also reported that this patch fixed the panics." So an attempt to lock pagequeues[PQ_NONE=255].pq_mutex, which is either something later in the vm_domain object, or bogus memory? One of the mtx asserts trips? I think it was "mtx_lock() of spin mutex"; I didn't get a lot of details. I failed to note in the commit message that this race was introduced in r332974. 
The most common stack was: panic: mtx_lock() of spin mutex (null) @ /data/ocafirmware.alt/FreeBSD/sys/vm/vm_page.c:3344 cpuid = 4 time = 1526415167 KDB: stack backtrace: db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfe158af62380 vpanic() at vpanic+0x1a3/frame 0xfe158af623e0 doadump() at doadump/frame 0xfe158af62460 __mtx_lock_flags() at __mtx_lock_flags+0x11a/frame 0xfe158af624a0 vm_page_dequeue() at vm_page_dequeue+0x8a/frame 0xfe158af624e0 vm_page_alloc_domain_after() at vm_page_alloc_domain_after+0x2cb/frame 0xfe158af62560 vm_page_grab_pages() at vm_page_grab_pages+0x274/frame 0xfe158af62610 vn_sendfile() at vn_sendfile+0x83a/frame 0xfe158af628e0 [Tue May 15 20:12:48 2018]sys_sendfile() at sys_sendfile+0x119/frame 0xfe158af62980 amd64_syscall() at amd64_syscall+0x298/frame 0xfe158af62ab0 fast_syscall_common() at fast_syscall_common+0x101/frame 0xfe158af62ab0 I once saw one like this: Fatal trap 9: general protection fault while in kernel mode cpuid = 0; apic id = 00 instruction pointer = 0x20:0x8088bf74 stack pointer = 0x28:0xfe55af7712e0 frame pointer = 0x28:0xfe55af771330 code segment= base 0x0, limit 0xf, type 0x1b = DPL 0, pres 1, long 1, def32 0, gran 1 processor eflags= interrupt enabled, resume, IOPL = 0 current process = 12 (irq446: mlx5_core0) [Mon May 14 04:45:10 2018]trap number = 9 panic: general protection fault cpuid = 0 time = 1526273109 KDB: stack backtrace: db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfe55af770ff0 vpanic() at vpanic+0x1a3/frame 0xfe55af771050 panic() at panic+0x43/frame 0xfe55af7710b0 trap_fatal() at trap_fatal+0x35f/frame 0xfe55af771100 trap() at trap+0x6d/frame 0xfe55af771210 [Mon May 14 04:45:10 2018]calltrap() at calltrap+0x8/frame 0xfe55af771210 --- trap 0x9, rip = 0x8088bf74, rsp = 0xfe55af7712e0, rbp = 0xfe55af771330 --- vm_pqbatch_submit_page() at vm_pqbatch_submit_page+0x144/frame 0xfe55af771330 sendfile_free_page() at sendfile_free_page+0x10e/frame 0xfe55af771360 sendfile_free_mext_pg() 
at sendfile_free_mext_pg+0xb7/frame 0xfe55af7713b0 mb_free_ext() at mb_free_ext+0x103/frame 0xfe55af7713e0 m_freem() at m_freem+0x48/frame 0xfe55af771400 tcp_do_segment() at tcp_do_segment+0x1647/frame 0xfe55af771500 tcp_input_with_port() at tcp_input_with_port+0xfcc/frame 0xfe55af771650 tcp_input() at tcp_input+0xb/frame 0xfe55af771660 [Mon May 14 04:45:10 2018]ip_input() at ip_input+0xe9/frame 0xfe55af7716c0 netisr_dispatch_src() at netisr_dispatch_src+0xa8/frame 0xfe55af771710 ether_demux() at ether_demux+0x140/frame 0xfe55af771740 ether_nh_input() at ether_nh_input+0x32c/frame 0xfe55af7717a0 netisr_dispatch_src() at netisr_dispatch_src+0xa8/frame 0xfe55af7717f0 ether_input() at ether_input+0x26/frame 0xfe55af771810 tcp_lro_flush_all() at tcp_lro_flush_all+0xf2/frame 0xfe55af771850 mlx5e_rx_cq_comp() at mlx5e_rx_cq_comp+0x5e5/frame 0xfe55af771950 mlx5_cq_completion() at mlx5_cq_completion+0x73/frame 0xfe55af771990 <...> Thanks again for fixing it so quickly! Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r333655 - head/sys/sys
Author: gallatin Date: Tue May 15 23:55:38 2018 New Revision: 333655 URL: https://svnweb.freebsd.org/changeset/base/333655 Log: Unhook DEBUG_BUFRING from INVARIANTS Some of the DEBUG_BUFRING checks are racy, and can lead to spurious assertions when run under high load. Unhook these from INVARIANTS until the author can fix or remove them. Reviewed by: mmacy Sponsored by: Netflix Modified: head/sys/sys/buf_ring.h Modified: head/sys/sys/buf_ring.h == --- head/sys/sys/buf_ring.h Tue May 15 23:46:49 2018(r333654) +++ head/sys/sys/buf_ring.h Tue May 15 23:55:38 2018(r333655) @@ -34,10 +34,6 @@ #include -#if defined(INVARIANTS) && !defined(DEBUG_BUFRING) -#define DEBUG_BUFRING 1 -#endif - #ifdef DEBUG_BUFRING #include #include @@ -69,6 +65,12 @@ buf_ring_enqueue(struct buf_ring *br, void *buf) uint32_t prod_head, prod_next, cons_tail; #ifdef DEBUG_BUFRING int i; + + /* +* Note: It is possible to encounter an mbuf that was removed +* via drbr_peek(), and then re-added via drbr_putback() and +* trigger a spurious panic. +*/ for (i = br->br_cons_head; i != br->br_prod_head; i = ((i + 1) & br->br_cons_mask)) if(br->br_ring[i] == buf) ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r333470 - in head: share/mk sys/conf
On 05/10/18 20:11, Ed Maste wrote: On 10 May 2018 at 20:00, Andrew Gallatin <galla...@cs.duke.edu> wrote: Unfortunately, it looks like this method will get blown away by an installworld: Ah. You can set WITH_LLD_IS_LD in /etc/src.conf and installworld will install ld as a symlink to ld.lld, Super! That's the answer that I was looking for, and what should get me back to building kernels like it's 1999 :) Thanks, Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r333470 - in head: share/mk sys/conf
On 05/10/18 19:14, Ed Maste wrote: On 10 May 2018 at 18:24, Andrew Gallatin <galla...@cs.duke.edu> wrote: Rather than erroring out, we please just use the appropriate linker? That's my goal, but it's a bit of an involved change and will take some time to make sure we don't introduce new corner cases. I'm sorry that I didn't catch this before the first ifunc use went in -- lld has been the default bootstrap linker (via buildworld or kernel-toolchain) since mid-Jan and this problem slipped my mind. I added the error in the meantime to avoid the silently broken kernel case that you unfortunately encountered. The low-friction method of getting past this in the interim is to just use ld.lld as the system linker: # ln -fs ld.lld /usr/bin/ld I'm just waiting on an update to the lang/ghc port and another exp-run before that becomes the default. Thanks! Unfortunately, it looks like this method will get blown away by an installworld: <7:57pm>thing1/gallatin:src>ls -li /usr/bin/ld* 12038400 lrwxr-xr-x 1 root wheel15 May 10 19:21 /usr/bin/ld@ -> /usr/bin/ld.lld 32386537 -r-xr-xr-x 1 root wheel 1911384 May 10 09:13 /usr/bin/ld.bfd* 32387059 -r-xr-xr-x 1 root wheel 40449288 May 10 09:13 /usr/bin/ld.lld* 32386878 -r-xr-xr-x 1 root wheel 19352 May 10 09:13 /usr/bin/ldd* 32387816 -r-xr-xr-x 1 root wheel 26872 May 10 09:14 /usr/bin/ldd32* <7:57pm>thing1/gallatin:src>sudo make -j32 installworld >& log <7:58pm>thing1/gallatin:src>!ls ls -li /usr/bin/ld* 32347218 -r-xr-xr-x 2 root wheel 1911384 May 10 19:58 /usr/bin/ld* 32347218 -r-xr-xr-x 2 root wheel 1911384 May 10 19:58 /usr/bin/ld.bfd* 32348085 -r-xr-xr-x 1 root wheel 40449288 May 10 19:58 /usr/bin/ld.lld* 32347538 -r-xr-xr-x 1 root wheel 19352 May 10 19:58 /usr/bin/ldd* 32348365 -r-xr-xr-x 1 root wheel 26872 May 10 19:58 /usr/bin/ldd32* Would it make sense to just set LD=ld.lld in my and root's .cshrc? 
Thanks, Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r333470 - in head: share/mk sys/conf
On 05/10/18 16:10, Ed Maste wrote: Author: emaste Date: Thu May 10 20:10:02 2018 New Revision: 333470 URL: https://svnweb.freebsd.org/changeset/base/333470 Log: Error out on attempt to link amd64 kernel with old binutils linker I lost the better part of a day due to the issue of the build using the wrong linker. Rather than erroring out, can we please just use the appropriate linker? My workflow is that of the typical dinosaur: config -g GENERIC cd ../compile/GENERIC make cleandepend && make depend && make -j64 Thanks, Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r333462 - head/sys/netinet6
Author: gallatin Date: Thu May 10 16:19:41 2018 New Revision: 333462 URL: https://svnweb.freebsd.org/changeset/base/333462 Log: Fix a panic in the IPv6 multicast code. Use LIST_FOREACH_SAFE in in6m_disconnect() since we're deleting and freeing items from the membership list while traversing the list. Reviewed by: mmacy Sponsored by: Netflix Modified: head/sys/netinet6/in6_mcast.c Modified: head/sys/netinet6/in6_mcast.c == --- head/sys/netinet6/in6_mcast.c Thu May 10 15:01:43 2018 (r333461) +++ head/sys/netinet6/in6_mcast.c Thu May 10 16:19:41 2018 (r333462) @@ -581,7 +581,7 @@ in6m_disconnect(struct in6_multi *inm) struct ifnet *ifp; struct ifaddr *ifa; struct in6_ifaddr *ifa6; - struct in6_multi_mship *imm; + struct in6_multi_mship *imm, *imm_tmp; struct ifmultiaddr *ifma, *ll_ifma; ifp = inm->in6m_ifp; @@ -607,7 +607,8 @@ in6m_disconnect(struct in6_multi *inm) if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (void *)ifa; - LIST_FOREACH(imm, &ifa6->ia6_memberships, i6mm_chain) { + LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships, + i6mm_chain, imm_tmp) { if (inm == imm->i6mm_maddr) { LIST_REMOVE(imm, i6mm_chain); free(imm, M_IP6MADDR); ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r333457 - head/sys/kern
On 05/10/18 07:36, Ed Maste wrote: Author: emaste Date: Thu May 10 11:36:16 2018 New Revision: 333457 URL: https://svnweb.freebsd.org/changeset/base/333457 Log: ANSIfy sys_generic.c Modified: head/sys/kern/sys_generic.c Modified: head/sys/kern/sys_generic.c == --- head/sys/kern/sys_generic.c Thu May 10 09:37:54 2018(r333456) +++ head/sys/kern/sys_generic.c Thu May 10 11:36:16 2018(r333457) <..> @@ -532,11 +519,7 @@ sys_pwritev(struct thread *td, struct pwritev_args *ua } int -kern_pwritev(td, fd, auio, offset) - struct thread *td; - struct uio *auio; - int fd; - off_t offset; +kern_pwritev(struct thread *td, struct uio *auio, int fd, off_t offset) { struct file *fp; int error; This breaks the kernel build: /usr/src/sys/kern/sys_generic.c:522:1: error: conflicting types for 'kern_pwritev' kern_pwritev(struct thread *td, struct uio *auio, int fd, off_t offset) ^ /usr/src/sys/sys/syscallsubr.h:212:5: note: previous declaration is here int kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset); ^ 1 error generated. *** [sys_generic.o] Error code 1 I think the problem was that the non-ansi args were enumerated in a different order than their type declarations. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r333459 - head/sys/kern
Author: gallatin Date: Thu May 10 13:19:42 2018 New Revision: 333459 URL: https://svnweb.freebsd.org/changeset/base/333459 Log: Fix the build after r333457 In r333457, the arguments to kern_pwritev() were accidentally re-ordered as part of ANSIfication, breaking the build. Modified: head/sys/kern/sys_generic.c Modified: head/sys/kern/sys_generic.c == --- head/sys/kern/sys_generic.c Thu May 10 12:25:01 2018(r333458) +++ head/sys/kern/sys_generic.c Thu May 10 13:19:42 2018(r333459) @@ -519,7 +519,7 @@ sys_pwritev(struct thread *td, struct pwritev_args *ua } int -kern_pwritev(struct thread *td, struct uio *auio, int fd, off_t offset) +kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset) { struct file *fp; int error; ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r333329 - head/sys/net
Author: gallatin Date: Mon May 7 18:11:22 2018 New Revision: 333329 URL: https://svnweb.freebsd.org/changeset/base/333329 Log: Fix an off-by-one error when deciding to request a tx interrupt The canonical check for whether or not a ring is drainable is TXQ_AVAIL() > MAX_TX_DESC() + 2. Use this same construct here, in order to avoid a potential off-by-one error where we might otherwise fail to request an interrupt. Reviewed by: mmacy Sponsored by: Netflix Modified: head/sys/net/iflib.c Modified: head/sys/net/iflib.c == --- head/sys/net/iflib.c Mon May 7 17:37:07 2018 (r333328) +++ head/sys/net/iflib.c Mon May 7 18:11:22 2018 (r333329) @@ -3299,7 +3299,7 @@ defrag: */ txq->ift_rs_pending += nsegs + 1; if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) || -iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs - 1) <= MAX_TX_DESC(ctx)) { +iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) { pi.ipi_flags |= IPI_TX_INTR; txq->ift_rs_pending = 0; } ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r333325 - head/sys/kern
Author: gallatin Date: Mon May 7 15:24:03 2018 New Revision: 333325 URL: https://svnweb.freebsd.org/changeset/base/333325 Log: Boost thread priority while changing CPU frequency Boost the priority of user-space threads when they set their affinity to a core to adjust its frequency. This avoids a situation where a CPU bound kernel thread with the same affinity is running on a down-clocked core, and will "block" powerd from up-clocking the core until the kernel thread yields. This can lead to poor performance, and to things potentially getting stuck on Giant. Reviewed by: kib (imp reviewed earlier version) Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D15246 Modified: head/sys/kern/kern_cpu.c Modified: head/sys/kern/kern_cpu.c == --- head/sys/kern/kern_cpu.c Mon May 7 15:07:28 2018 (r333324) +++ head/sys/kern/kern_cpu.c Mon May 7 15:24:03 2018 (r333325) @@ -245,6 +245,7 @@ cf_set_method(device_t dev, const struct cf_level *lev struct cf_saved_freq *saved_freq, *curr_freq; struct pcpu *pc; int error, i; + u_char pri; sc = device_get_softc(dev); error = 0; @@ -333,6 +334,8 @@ cf_set_method(device_t dev, const struct cf_level *lev /* Bind to the target CPU before switching. */ pc = cpu_get_pcpu(set->dev); thread_lock(curthread); + pri = curthread->td_priority; + sched_prio(curthread, PRI_MIN); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq, @@ -340,6 +343,7 @@ cf_set_method(device_t dev, const struct cf_level *lev error = CPUFREQ_DRV_SET(set->dev, set); thread_lock(curthread); sched_unbind(curthread); + sched_prio(curthread, pri); thread_unlock(curthread); if (error) { goto out; @@ -357,6 +361,8 @@ cf_set_method(device_t dev, const struct cf_level *lev /* Bind to the target CPU before switching.
*/ pc = cpu_get_pcpu(set->dev); thread_lock(curthread); + pri = curthread->td_priority; + sched_prio(curthread, PRI_MIN); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq, @@ -364,6 +370,7 @@ cf_set_method(device_t dev, const struct cf_level *lev error = CPUFREQ_DRV_SET(set->dev, set); thread_lock(curthread); sched_unbind(curthread); + sched_prio(curthread, pri); thread_unlock(curthread); if (error) { /* XXX Back out any successful setting? */ ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r333141 - head/sys/dev/cxgbe
Author: gallatin Date: Tue May 1 15:33:21 2018 New Revision: 333141 URL: https://svnweb.freebsd.org/changeset/base/333141 Log: Optionally panic when cxgbe encounters a fatal error Sometimes it is better to panic than to leave a machine unreachable. Reviewed by: np Sponsored by: Netflix Modified: head/sys/dev/cxgbe/t4_main.c Modified: head/sys/dev/cxgbe/t4_main.c == --- head/sys/dev/cxgbe/t4_main.c Tue May 1 15:17:46 2018 (r333140) +++ head/sys/dev/cxgbe/t4_main.c Tue May 1 15:33:21 2018 (r333141) @@ -469,6 +469,8 @@ TUNABLE_INT("hw.cxgbe.num_vis", &t4_num_vis); static int pcie_relaxed_ordering = -1; TUNABLE_INT("hw.cxgbe.pcie_relaxed_ordering", &pcie_relaxed_ordering); +static int t4_panic_on_fatal_err = 0; +TUNABLE_INT("hw.cxgbe.panic_on_fatal_err", &t4_panic_on_fatal_err); #ifdef TCP_OFFLOAD /* @@ -2222,6 +2224,8 @@ t4_fatal_err(struct adapter *sc) t4_intr_disable(sc); log(LOG_EMERG, "%s: encountered fatal error, adapter stopped.\n", device_get_nameunit(sc->dev)); + if (t4_panic_on_fatal_err) + panic("panic requested on fatal error"); } void ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r333131 - head/sys/net
Author: gallatin Date: Mon Apr 30 23:53:27 2018 New Revision: 333131 URL: https://svnweb.freebsd.org/changeset/base/333131 Log: Fix iflib_encap() EFBIG handling bugs 1) Don't give up if m_collapse() fails. Rather than giving up, try m_defrag() immediately. 2) Fix a leak where, if the NIC driver rejected the defrag'ed chain as having too many segments, we would fail to free the chain. Reviewed by: Matthew Macy (this version of patch) Submitted by: Matthew Macy (early version of leak fix) Modified: head/sys/net/iflib.c Modified: head/sys/net/iflib.c == --- head/sys/net/iflib.c Mon Apr 30 23:05:57 2018 (r333130) +++ head/sys/net/iflib.c Mon Apr 30 23:53:27 2018 (r333131) @@ -3244,8 +3244,12 @@ defrag: switch (err) { case EFBIG: /* try collapse once and defrag once */ - if (remap == 0) + if (remap == 0) { m_head = m_collapse(*m_headp, M_NOWAIT, max_segs); + /* try defrag if collapsing fails */ + if (m_head == NULL) + remap++; + } if (remap == 1) m_head = m_defrag(*m_headp, M_NOWAIT); remap++; @@ -3333,13 +3337,18 @@ defrag: */ txq->ift_pidx = pi.ipi_new_pidx; txq->ift_npending += pi.ipi_ndescs; - } else if (__predict_false(err == EFBIG && remap < 2)) { + } else { *m_headp = m_head = iflib_remove_mbuf(txq); - remap = 1; - txq->ift_txd_encap_efbig++; - goto defrag; - } else + if (err == EFBIG) { + txq->ift_txd_encap_efbig++; + if (remap < 2) { + remap = 1; + goto defrag; + } + } DBG_COUNTER_INC(encap_txd_encap_fail); + goto defrag_failed; + } return (err); defrag_failed: ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
Re: svn commit: r332860 - head/sys/kern
On 04/24/18 13:24, Jonathan T. Looney wrote: On Mon, Apr 23, 2018 at 6:04 PM, John Baldwin wrote: > > I think this is actually a key question. In my experience to date I have not > encountered a large number of post-panic assertion failures. Given that > we already break all locks and disable assertions for locks I'd be curious > which assertions are actually failing. My inclination given my experiences > to date would be to explicitly ignore those as we do for locking if it is > constrained set rather than blacklisting all of them. However, I would be > most interested in seeing some examples of assertions that are failing. The latest example (the one that prompted me to finally commit this) is in lockmgr_sunlock_try(): 'panic: Assertion (*xp & ~LK_EXCLUSIVE_SPINNERS) == LK_SHARERS_LOCK(1) failed at /usr/src/sys/kern/kern_lock.c:541' I don't see any obvious recent changes that would have caused this, so this is probably a case where a change to another file suddenly made us trip over this assert. FWIW, that assertion has prevented me from getting a dump from an INVARIANTS kernel for at least a year. Drew ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"
svn commit: r332653 - head/sys/dev/ixgbe
Author: gallatin Date: Tue Apr 17 16:51:27 2018 New Revision: 332653 URL: https://svnweb.freebsd.org/changeset/base/332653 Log: Restore SIOCGI2C functionality to ixgbe When ixgbe was converted to iflib, it lost the SIOCGI2C support that allows ifconfig to print SFP state, optical light levels, etc. Restore this by plugging in to the ifdi_i2c_req iflib method. Note that the sanity checking on dev_addr that used to be done in ixgbe is now done in iflib. Reviewed by: erj, Matthew Macy Sponsored by: Netflix Modified: head/sys/dev/ixgbe/if_ix.c Modified: head/sys/dev/ixgbe/if_ix.c == --- head/sys/dev/ixgbe/if_ix.c Tue Apr 17 16:46:08 2018(r332652) +++ head/sys/dev/ixgbe/if_ix.c Tue Apr 17 16:51:27 2018(r332653) @@ -137,7 +137,7 @@ static void ixgbe_if_timer(if_ctx_t ctx, uint16_t); static void ixgbe_if_update_admin_status(if_ctx_t ctx); static void ixgbe_if_vlan_register(if_ctx_t ctx, u16 vtag); static void ixgbe_if_vlan_unregister(if_ctx_t ctx, u16 vtag); - +static int ixgbe_if_i2c_req(if_ctx_t ctx, struct ifi2creq *req); int ixgbe_intr(void *arg); / @@ -270,6 +270,7 @@ static device_method_t ixgbe_if_methods[] = { DEVMETHOD(ifdi_vlan_register, ixgbe_if_vlan_register), DEVMETHOD(ifdi_vlan_unregister, ixgbe_if_vlan_unregister), DEVMETHOD(ifdi_get_counter, ixgbe_if_get_counter), + DEVMETHOD(ifdi_i2c_req, ixgbe_if_i2c_req), #ifdef PCI_IOV DEVMETHOD(ifdi_iov_init, ixgbe_if_iov_init), DEVMETHOD(ifdi_iov_uninit, ixgbe_if_iov_uninit), @@ -1232,6 +1233,25 @@ ixgbe_if_get_counter(if_ctx_t ctx, ift_counter cnt) } /* ixgbe_if_get_counter */ / + * ixgbe_if_i2c_req + / +static int +ixgbe_if_i2c_req(if_ctx_t ctx, struct ifi2creq *req) +{ + struct adapter *adapter = iflib_get_softc(ctx); + struct ixgbe_hw *hw = &adapter->hw; + int i; + + + if (hw->phy.ops.read_i2c_byte == NULL) + return (ENXIO); + for (i = 0; i < req->len; i++) + hw->phy.ops.read_i2c_byte(hw, req->offset + i, + req->dev_addr, &req->data[i]); + return (0); +} /* ixgbe_if_i2c_req */ + +/ * ixgbe_add_media_types / static void @@ 
-4547,4 +4567,3 @@ ixgbe_check_fan_failure(struct adapter *adapter, u32 r if (reg & mask) device_printf(adapter->dev, "\nCRITICAL: FAN FAILURE!! REPLACE IMMEDIATELY!!\n"); } /* ixgbe_check_fan_failure */ - ___ svn-src-head@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"