svn commit: r368819 - in head: share/man/man4 sys/netinet sys/netinet6

2020-12-19 Thread Andrew Gallatin
Author: gallatin
Date: Sat Dec 19 22:04:46 2020
New Revision: 368819
URL: https://svnweb.freebsd.org/changeset/base/368819

Log:
  Filter TCP connections to SO_REUSEPORT_LB listen sockets by NUMA domain
  
  In order to efficiently serve web traffic on a NUMA
  machine, one must avoid as many NUMA domain crossings as
  possible. With SO_REUSEPORT_LB, a number of workers can share a
  listen socket. However, even if a worker sets affinity to a core
  or set of cores on a NUMA domain, it will receive connections
  associated with all NUMA domains in the system. This will lead to
  cross-domain traffic when the server writes to the socket or
  calls sendfile(), and memory is allocated on the server's local
  NUMA node, but transmitted on the NUMA node associated with the
  TCP connection. Similarly, when the server reads from the socket,
  it will likely be reading memory allocated on the NUMA domain
  associated with the TCP connection.
  
  This change provides a new socket option, TCP_REUSPORT_LB_NUMA. A
  server can now tell the kernel to filter traffic so that only
  incoming connections associated with the desired NUMA domain are
  given to the server. (Of course, in the case where there are no
  servers sharing the listen socket on some domain, then as a
  fallback, traffic will be hashed as normal to all servers sharing
  the listen socket regardless of domain). This allows a server to
  deal only with traffic that is local to its NUMA domain, and
  avoids cross-domain traffic in most cases.
  
  This patch, and a corresponding small patch to nginx to use
  TCP_REUSPORT_LB_NUMA allows us to serve 190Gb/s of kTLS encrypted
  https media content from dual-socket Xeons with only 13% (as
  measured by pcm.x) cross domain traffic on the memory controller.
  
  Reviewed by:  jhb, bz (earlier version), bcr (man page)
  Tested by: gonzo
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D21636

Modified:
  head/share/man/man4/tcp.4
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/tcp.h
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet6/in6_pcb.c
  head/sys/netinet6/in6_pcb.h

Modified: head/share/man/man4/tcp.4
==
--- head/share/man/man4/tcp.4   Sat Dec 19 21:46:09 2020(r368818)
+++ head/share/man/man4/tcp.4   Sat Dec 19 22:04:46 2020(r368819)
@@ -34,7 +34,7 @@
 .\" From: @(#)tcp.48.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd November 25, 2020
+.Dd December 19, 2020
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -314,6 +314,21 @@ Enable in-kernel TLS for data read from this socket.
 See
 .Xr ktls 4
 for more details.
+.It Dv TCP_REUSPORT_LB_NUMA
+Changes NUMA affinity filtering for an established TCP listen
+socket.
+This option takes a single integer argument which specifies
+the NUMA domain to filter on for this listen socket.
+The argument can also have the following special values:
+.Bl -tag -width "Dv TCP_REUSPORT_LB_NUMA"
+.It Dv TCP_REUSPORT_LB_NUMA_NODOM
+Remove NUMA filtering for this listen socket.
+.It Dv TCP_REUSPORT_LB_NUMA_CURDOM
+Filter traffic associated with the domain where the calling thread is
+currently executing.
+This is typically used after a process or thread inherits a listen
+socket from its parent, and sets its CPU affinity to a particular core.
+.El
 .El
 .Pp
 The option level for the

Modified: head/sys/netinet/in_pcb.c
==
--- head/sys/netinet/in_pcb.c   Sat Dec 19 21:46:09 2020(r368818)
+++ head/sys/netinet/in_pcb.c   Sat Dec 19 22:04:46 2020(r368819)
@@ -75,6 +75,7 @@ __FBSDID("$FreeBSD$");
 #endif
 
 #include 
+#include 
 
 #include 
 #include 
@@ -150,7 +151,8 @@ static void in_pcbremlists(struct inpcb *inp);
 static struct inpcb*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg,
-   int lookupflags, struct ifnet *ifp);
+   int lookupflags, struct ifnet *ifp,
+   uint8_t numa_domain);
 
 #define RANGECHK(var, min, max) \
if ((var) < (min)) { (var) = (min); } \
@@ -248,7 +250,8 @@ SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, 
 
 static struct inpcblbgroup *
 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
-uint16_t port, const union in_dependaddr *addr, int size)
+uint16_t port, const union in_dependaddr *addr, int size,
+uint8_t numa_domain)
 {
struct inpcblbgroup *grp;
size_t bytes;
@@ -259,6 +262,7 @@ in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_ch
return (NULL);
grp->il_vflag = vflag;
grp->il_lport = port;
+   grp->il_numa_domain = numa_domain;
grp->il_dependladdr = *addr;
grp->il_inpsiz = size;

svn commit: r368818 - head/sys/kern

2020-12-19 Thread Andrew Gallatin
Author: gallatin
Date: Sat Dec 19 21:46:09 2020
New Revision: 368818
URL: https://svnweb.freebsd.org/changeset/base/368818

Log:
  Optionally bind ktls threads to NUMA domains
  
  When ktls_bind_thread is 2, we pick a ktls worker thread that is
  bound to the same domain as the TCP connection associated with
  the socket. We use roughly the same code as netinet/tcp_hpts.c to
  do this. This allows crypto to run on the same domain as the TCP
  connection is associated with. Assuming TCP_REUSPORT_LB_NUMA
  (D21636) is in place & in use, this ensures that the crypto source
  and destination buffers are local to the same NUMA domain as we're
  running crypto on.
  
  This change (when TCP_REUSPORT_LB_NUMA, D21636, is used) reduces
  cross-domain traffic from over 37% down to about 13% as measured
  by pcm.x on a dual-socket Xeon using nginx and a Netflix workload.
  
  Reviewed by:  jhb
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D21648

Modified:
  head/sys/kern/uipc_ktls.c

Modified: head/sys/kern/uipc_ktls.c
==
--- head/sys/kern/uipc_ktls.c   Sat Dec 19 14:54:28 2020(r368817)
+++ head/sys/kern/uipc_ktls.c   Sat Dec 19 21:46:09 2020(r368818)
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -83,6 +84,12 @@ struct ktls_wq {
boolrunning;
 } __aligned(CACHE_LINE_SIZE);
 
+struct ktls_domain_info {
+   int count;
+   int cpu[MAXCPU];
+};
+
+struct ktls_domain_info ktls_domains[MAXMEMDOM];
 static struct ktls_wq *ktls_wq;
 static struct proc *ktls_proc;
 LIST_HEAD(, ktls_crypto_backend) ktls_backends;
@@ -316,6 +323,9 @@ static u_int
 ktls_get_cpu(struct socket *so)
 {
struct inpcb *inp;
+#ifdef NUMA
+   struct ktls_domain_info *di;
+#endif
u_int cpuid;
 
inp = sotoinpcb(so);
@@ -330,7 +340,13 @@ ktls_get_cpu(struct socket *so)
 * serialization provided by having the same connection use
 * the same queue.
 */
-   cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads];
+#ifdef NUMA
+   if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) {
+   di = _domains[inp->inp_numa_domain];
+   cpuid = di->cpu[inp->inp_flowid % di->count];
+   } else
+#endif
+   cpuid = ktls_cpuid_lookup[inp->inp_flowid % 
ktls_number_threads];
return (cpuid);
 }
 #endif
@@ -341,7 +357,7 @@ ktls_init(void *dummy __unused)
struct thread *td;
struct pcpu *pc;
cpuset_t mask;
-   int error, i;
+   int count, domain, error, i;
 
ktls_tasks_active = counter_u64_alloc(M_WAITOK);
ktls_cnt_tx_queued = counter_u64_alloc(M_WAITOK);
@@ -397,7 +413,11 @@ ktls_init(void *dummy __unused)
if (ktls_bind_threads) {
if (ktls_bind_threads > 1) {
pc = pcpu_find(i);
-   CPU_COPY(_domain[pc->pc_domain], );
+   domain = pc->pc_domain;
+   CPU_COPY(_domain[domain], );
+   count = ktls_domains[domain].count;
+   ktls_domains[domain].cpu[count] = i;
+   ktls_domains[domain].count++;
} else {
CPU_SETOF(i, );
}
@@ -410,6 +430,18 @@ ktls_init(void *dummy __unused)
ktls_cpuid_lookup[ktls_number_threads] = i;
ktls_number_threads++;
}
+
+   /*
+* If we somehow have an empty domain, fall back to choosing
+* among all KTLS threads.
+*/
+   for (i = 0; i < vm_ndomains; i++) {
+   if (ktls_domains[i].count == 0) {
+   ktls_bind_threads = 0;
+   break;
+   }
+   }
+
printf("KTLS: Initialized %d threads\n", ktls_number_threads);
 }
 SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL);
@@ -2093,6 +2125,10 @@ ktls_work_thread(void *ctx)
STAILQ_HEAD(, mbuf) local_m_head;
STAILQ_HEAD(, socket) local_so_head;
 
+   if (ktls_bind_threads > 1) {
+   curthread->td_domain.dr_policy =
+   DOMAINSET_PREF(PCPU_GET(domain));
+   }
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
fpu_kern_thread(0);
 #endif
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r368721 - head/stand/efi/loader

2020-12-17 Thread Andrew Gallatin

On 12/17/20 2:49 PM, Kyle Evans wrote:

On Thu, Dec 17, 2020 at 1:47 PM Andrew Gallatin  wrote:


On 12/17/20 12:02 PM, Warner Losh wrote:

Author: imp
Date: Thu Dec 17 17:02:09 2020
New Revision: 368721
URL: 
https://urldefense.com/v3/__https://svnweb.freebsd.org/changeset/base/368721__;!!OToaGQ!5c1mLnhtRtEV6Cv_MTWpzXWaGZEYYDp4TJ6wVDzjVZiehAItts7ZWC15uNnQYRa5Fg$

Log:
Drop EFI_STAGING_SIZE back down to 64M

vmware can't cope with anything larger than 64MB. Drop this back to
64MB everywhere but arm.


There were all kinds of booting problems before this was bumped up.
In fact, I still have EFI_STAGING_SIZE=128 in src.conf because I needed
it to be able to boot when using Nvidia graphics.  By reducing this, I
feel like we're just playing whack-a-mole.



IIRC those have long since become OBE as we'll now grow the staging
area to accommodate nvidia.



Ah, OK. cool!

Thanks & sorry for the noise.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r368721 - head/stand/efi/loader

2020-12-17 Thread Andrew Gallatin

On 12/17/20 12:02 PM, Warner Losh wrote:

Author: imp
Date: Thu Dec 17 17:02:09 2020
New Revision: 368721
URL: 
https://urldefense.com/v3/__https://svnweb.freebsd.org/changeset/base/368721__;!!OToaGQ!5c1mLnhtRtEV6Cv_MTWpzXWaGZEYYDp4TJ6wVDzjVZiehAItts7ZWC15uNnQYRa5Fg$

Log:
   Drop EFI_STAGING_SIZE back down to 64M
   
   vmware can't cope with anything larger than 64MB. Drop this back to

   64MB everywhere but arm.


There were all kinds of booting problems before this was bumped up.
In fact, I still have EFI_STAGING_SIZE=128 in src.conf because I needed
it to be able to boot when using Nvidia graphics.  By reducing this, I
feel like we're just playing whack-a-mole.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r367797 - head/sys/net

2020-11-18 Thread Andrew Gallatin
Author: gallatin
Date: Wed Nov 18 14:55:49 2020
New Revision: 367797
URL: https://svnweb.freebsd.org/changeset/base/367797

Log:
  LACP: When suppressing distributing, return ENOBUFS
  
  When links come and go, lacp goes into a "suppress distributing" mode
  where it drops traffic for 3 seconds. When in this mode, lagg/lacp
  historically drops traffic with ENETDOWN. That return value causes TCP
  to close any connection where it gets that value back from the lower
  parts of the stack.  This means that any TCP connection with active
  traffic during a 3-second window when an LACP link comes or goes
  would get closed.
  
  TCP treats return values of ENOBUFS as transient errors, and re-schedules
  transmission later. So rather than returning ENETDOWN, let's
  return ENOBUFS instead.  This allows TCP connections to be preserved.
  
  I've tested this by repeatedly bouncing links on a Netflix CDN server
  under a moderate (20Gb/s) load and observed ENOBUFS reported back to
  the TCP stack (as reported by a RACK TCP sysctl).
  
  Reviewed by:  jhb, jtl, rrs
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D27188

Modified:
  head/sys/net/ieee8023ad_lacp.c
  head/sys/net/ieee8023ad_lacp.h
  head/sys/net/if_lagg.c

Modified: head/sys/net/ieee8023ad_lacp.c
==
--- head/sys/net/ieee8023ad_lacp.c  Wed Nov 18 14:54:55 2020
(r367796)
+++ head/sys/net/ieee8023ad_lacp.c  Wed Nov 18 14:55:49 2020
(r367797)
@@ -832,7 +832,8 @@ lacp_stop(struct lagg_softc *sc)
 }
 
 struct lagg_port *
-lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t hash, uint8_t 
numa_domain)
+lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t hash,
+uint8_t numa_domain, int *err)
 {
struct lacp_softc *lsc = LACP_SOFTC(sc);
struct lacp_portmap *pm;
@@ -842,12 +843,14 @@ lacp_select_tx_port_by_hash(struct lagg_softc *sc, uin
 
if (__predict_false(lsc->lsc_suppress_distributing)) {
LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__));
+   *err = ENOBUFS;
return (NULL);
}
 
pm = >lsc_pmap[lsc->lsc_activemap];
if (pm->pm_count == 0) {
LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__));
+   *err = ENETDOWN;
return (NULL);
}
 
@@ -879,7 +882,7 @@ lacp_select_tx_port_by_hash(struct lagg_softc *sc, uin
 }
 
 struct lagg_port *
-lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m)
+lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m, int *err)
 {
struct lacp_softc *lsc = LACP_SOFTC(sc);
uint32_t hash;
@@ -892,7 +895,7 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf
hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey);
 
numa_domain = m->m_pkthdr.numa_domain;
-   return (lacp_select_tx_port_by_hash(sc, hash, numa_domain));
+   return (lacp_select_tx_port_by_hash(sc, hash, numa_domain, err));
 }
 
 /*

Modified: head/sys/net/ieee8023ad_lacp.h
==
--- head/sys/net/ieee8023ad_lacp.h  Wed Nov 18 14:54:55 2020
(r367796)
+++ head/sys/net/ieee8023ad_lacp.h  Wed Nov 18 14:55:49 2020
(r367797)
@@ -292,8 +292,10 @@ struct lacp_softc {
 #define LACP_LOCK_ASSERT(_lsc) mtx_assert(&(_lsc)->lsc_mtx, MA_OWNED)
 
 struct mbuf*lacp_input(struct lagg_port *, struct mbuf *);
-struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *);
-struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t, 
uint8_t);
+struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *,
+int *);
+struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t,
+uint8_t, int *);
 void   lacp_attach(struct lagg_softc *);
 void   lacp_detach(void *);
 void   lacp_init(struct lagg_softc *);

Modified: head/sys/net/if_lagg.c
==
--- head/sys/net/if_lagg.c  Wed Nov 18 14:54:55 2020(r367796)
+++ head/sys/net/if_lagg.c  Wed Nov 18 14:55:49 2020(r367797)
@@ -1763,6 +1763,7 @@ lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid
struct lagg_port *lp;
struct lagg_lb *lb;
uint32_t hash, p;
+   int err;
 
sc = ifp->if_softc;
 
@@ -1783,7 +1784,7 @@ lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid
flowtype == M_HASHTYPE_NONE)
return (NULL);
hash = flowid >> sc->flowid_shift;
-   return (lacp_select_tx_port_by_hash(sc, hash, numa_domain));
+   return (lacp_select_tx_port_by_hash(sc, hash, numa_domain, 
));
default:
return (NULL);
}
@@ -2580,12 +2581,13 @@ static int
 

Re: svn commit: r367288 - head/sys/compat/linux

2020-11-02 Thread Andrew Gallatin

On 11/2/20 8:19 PM, Conrad Meyer wrote:


Log:
   linux(4): Emulate Linux SOL_SOCKET:SO_PASSCRED
   
   This is required by some major linux applications, such as Chrome and

   Firefox.  (As well as Electron-using applications, which are essentially
   a bundled version of Chrome.)
   


Awesome!  Does this get electron apps working?

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r365071 - in head/sys: net net/altq net/route net80211 netgraph netgraph/atm netgraph/atm/ccatm netgraph/atm/sscfu netgraph/atm/sscop netgraph/atm/uni netgraph/bluetooth/common netgrap

2020-09-04 Thread Andrew Gallatin

On 2020-09-02 22:42, Alexey Dokuchaev wrote:


I want to understand which rules have to be followed (and why).


In general, FreeBSD code we write should follow style(9); it specifically
mentions "do not add whitespace at the end of a line" and "... followed by
one blank line" but doesn't go as far as explicitly forbidding multiple
consecutive newlines.  To me it's pretty obvious, and while others might
have different sens esthe'tique, usually it is lack thereof (no offense)
or mere ignorance.

./danfe

P.S.  Old-school tools like indent(1) or `uncrustify' were never widely
popular, I guess, because they did not possess enough knowledge of the
language to always produce correct results.  Perhaps new era tools, like
clang-format, could bring this to a whole new level.



I do the upstream sync between the Netflix tree and
FreeBSD-current about every 3 weeks (unless glebius beats
me to the punch and does it first :).  I anticipate that
this blank line sweep will cause lots of conflicts for us.
I understand this is progress, and I don't object, and I'm
not asking for a revert, but please understand that cleanups
like this do have hidden costs.  I expect that other commercial
entities who contribute to FreeBSD will have the same issue,
and I also anticipate it will cause problems with MFCs.

Rather than doing more sweeps like this, is it possible to
come up with a clang-format rule that's 95% of style(9), do
just one more sweep of the tree to apply that rule, add that
rule as a pre-commit hook, and be done forever with style(9)
related changes?

Thanks,
Drew

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r365331 - head/sys/kern

2020-09-04 Thread Andrew Gallatin
Author: gallatin
Date: Fri Sep  4 17:36:15 2020
New Revision: 365331
URL: https://svnweb.freebsd.org/changeset/base/365331

Log:
  ktls: Check for a NULL send tag in ktls_cleanup()
  
  When using ifnet ktls, and when ktls_reset_send_tag()
  fails to allocate a replacement tag, it leaves
  the tls session's snd_tag pointer NULL. ktls_cleanup()
  tries to release the send tag, and will trip over
  this NULL pointer and panic unless NULL is checked for.
  
  Reviewed by:  jhb
  Sponsored by: Netflix

Modified:
  head/sys/kern/uipc_ktls.c

Modified: head/sys/kern/uipc_ktls.c
==
--- head/sys/kern/uipc_ktls.c   Fri Sep  4 13:19:18 2020(r365330)
+++ head/sys/kern/uipc_ktls.c   Fri Sep  4 17:36:15 2020(r365331)
@@ -680,7 +680,8 @@ ktls_cleanup(struct ktls_session *tls)
counter_u64_add(ktls_ifnet_gcm, -1);
break;
}
-   m_snd_tag_rele(tls->snd_tag);
+   if (tls->snd_tag != NULL)
+   m_snd_tag_rele(tls->snd_tag);
break;
 #ifdef TCP_OFFLOAD
case TCP_TLS_MODE_TOE:
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r364986 - head/sys/kern

2020-08-31 Thread Andrew Gallatin
Author: gallatin
Date: Mon Aug 31 13:53:14 2020
New Revision: 364986
URL: https://svnweb.freebsd.org/changeset/base/364986

Log:
  make m_getm2() resilient to zone_jumbop exhaustion
  
  When the zone_jumbop is exhausted, most things
  using sosend* (like sshd) will eventually
  fail or hang if allocations are limited to the
  depleted jumbop zone.  This makes it impossible to
  communicate with a box which is under an attack which
  exhausts the jumbop zone.
  
  Rather than depending on the page size zone, also try cluster
  allocations to satisfy larger requests.  This allows me
  to ssh to, and serve 100Gb/s of traffic from a server which
  is under attack and has had its page-sized zone exhausted.
  
  Reviewed by:  glebius, markj, rmacklem
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D26150

Modified:
  head/sys/kern/kern_mbuf.c

Modified: head/sys/kern/kern_mbuf.c
==
--- head/sys/kern/kern_mbuf.c   Mon Aug 31 12:14:20 2020(r364985)
+++ head/sys/kern/kern_mbuf.c   Mon Aug 31 13:53:14 2020(r364986)
@@ -1423,21 +1423,28 @@ m_getm2(struct mbuf *m, int len, int how, short type, 
 
/* Loop and append maximum sized mbufs to the chain tail. */
while (len > 0) {
-   if (len > MCLBYTES)
-   mb = m_getjcl(how, type, (flags & M_PKTHDR),
+   mb = NULL;
+   if (len > MCLBYTES) {
+   mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR),
MJUMPAGESIZE);
-   else if (len >= MINCLSIZE)
-   mb = m_getcl(how, type, (flags & M_PKTHDR));
-   else if (flags & M_PKTHDR)
-   mb = m_gethdr(how, type);
-   else
-   mb = m_get(how, type);
 
-   /* Fail the whole operation if one mbuf can't be allocated. */
+   }
if (mb == NULL) {
-   if (nm != NULL)
+   if (len >= MINCLSIZE)
+   mb = m_getcl(how, type, (flags & M_PKTHDR));
+   else if (flags & M_PKTHDR)
+   mb = m_gethdr(how, type);
+   else
+   mb = m_get(how, type);
+
+   /*
+* Fail the whole operation if one mbuf can't be
+* allocated.
+*/
+   if (mb == NULL) {
m_freem(nm);
-   return (NULL);
+   return (NULL);
+   }
}
 
/* Book keeping. */
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r364460 - head/sys/vm

2020-08-21 Thread Andrew Gallatin
Author: gallatin
Date: Fri Aug 21 18:31:57 2020
New Revision: 364460
URL: https://svnweb.freebsd.org/changeset/base/364460

Log:
  uma: record allocation failures due to zone limits
  
  The zone limit mechanism was recently reworked, and
  allocation failures due to limits being exceeded
  were inadvertently no longer being recorded. This
  would lead to, for example, mbuf allocation failures
  not being indicated in netstat -m or vmstat -z
  
  Reviewed by:  markj
  Sponsored by: Netflix

Modified:
  head/sys/vm/uma_core.c

Modified: head/sys/vm/uma_core.c
==
--- head/sys/vm/uma_core.c  Fri Aug 21 17:45:17 2020(r364459)
+++ head/sys/vm/uma_core.c  Fri Aug 21 18:31:57 2020(r364460)
@@ -3952,8 +3952,10 @@ zone_alloc_item(uma_zone_t zone, void *udata, int doma
 {
void *item;
 
-   if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0)
+   if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) {
+   counter_u64_add(zone->uz_fails, 1);
return (NULL);
+   }
 
/* Avoid allocs targeting empty domains. */
if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r364405 - in head/sys/netinet: . tcp_stacks

2020-08-19 Thread Andrew Gallatin
Author: gallatin
Date: Wed Aug 19 17:59:06 2020
New Revision: 364405
URL: https://svnweb.freebsd.org/changeset/base/364405

Log:
  TCP: remove special treatment for hardware (ifnet) TLS
  
  Remove most special treatment for ifnet TLS in the TCP stack, except
  for code to avoid mixing handshakes and bulk data.
  
  This code made heroic efforts to send down entire TLS records to
  NICs. It was added to improve the PCIe bus efficiency of older TLS
  offload NICs which did not keep state per-session, and so would need
  to re-DMA the first part(s) of a TLS record if a TLS record was sent
  in multiple TCP packets or TSOs. Newer TLS offload NICs do not need
  this feature.
  
  At Netflix, we've run extensive QoE tests which show that this feature
  reduces client quality metrics, presumably because the effort to send
  TLS records atomically causes the server to both wait too long to send
  data (leading to buffers running dry), and to send too much data at
  once (leading to packet loss).
  
  Reviewed by:  hselasky,  jhb, rrs
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D26103

Modified:
  head/sys/netinet/tcp_output.c
  head/sys/netinet/tcp_stacks/bbr.c
  head/sys/netinet/tcp_stacks/rack.c

Modified: head/sys/netinet/tcp_output.c
==
--- head/sys/netinet/tcp_output.c   Wed Aug 19 17:52:06 2020
(r364404)
+++ head/sys/netinet/tcp_output.c   Wed Aug 19 17:59:06 2020
(r364405)
@@ -1957,17 +1957,6 @@ tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *ple
*pkthdrlen = len_cp;
break;
}
-
-   /*
-* Don't end a send in the middle of a TLS
-* record if it spans multiple TLS records.
-*/
-   if (tls != NULL && (m != start) && len < m->m_len) {
-   *plen = len_cp;
-   if (pkthdrlen != NULL)
-   *pkthdrlen = len_cp;
-   break;
-   }
}
 #endif
mlen = min(len, m->m_len - off);

Modified: head/sys/netinet/tcp_stacks/bbr.c
==
--- head/sys/netinet/tcp_stacks/bbr.c   Wed Aug 19 17:52:06 2020
(r364404)
+++ head/sys/netinet/tcp_stacks/bbr.c   Wed Aug 19 17:59:06 2020
(r364405)
@@ -38,7 +38,6 @@ __FBSDID("$FreeBSD$");
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 #include "opt_ratelimit.h"
-#include "opt_kern_tls.h"
 #include 
 #include 
 #include 
@@ -52,9 +51,6 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
-#ifdef KERN_TLS
-#include 
-#endif
 #include 
 #include 
 #ifdef STATS
@@ -4600,15 +4596,6 @@ bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
bbr_set_state(tp, bbr, 0);
BBR_STAT_INC(bbr_tlp_tot);
maxseg = tp->t_maxseg - bbr->rc_last_options;
-#ifdef KERN_TLS
-   if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
-   /*
-* For hardware TLS we do *not* want to send
-* new data.
-*/
-   goto need_retran;
-   }
-#endif
/*
 * A TLP timer has expired. We have been idle for 2 rtts. So we now
 * need to figure out how to force a full MSS segment out.
@@ -5802,8 +5789,6 @@ tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t c
 * Note we do set anything TSO size until we are past the initial
 * window. Before that we gnerally use either a single MSS
 * or we use the full IW size (so we burst a IW at a time)
-* Also note that Hardware-TLS is special and does alternate
-* things to minimize PCI Bus Bandwidth use.
 */
 
if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) {
@@ -5811,19 +5796,12 @@ tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t c
} else {
maxseg = BBR_MIN_SEG - bbr->rc_last_options;
}
-#ifdef KERN_TLS
-   if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
-   tls_seg =  ctf_get_opt_tls_size(bbr->rc_inp->inp_socket, 
bbr->rc_tp->snd_wnd);
-   bbr->r_ctl.rc_pace_min_segs = (tls_seg + bbr->rc_last_options);
-   }
-#endif
old_tso = bbr->r_ctl.rc_pace_max_segs;
if (bbr->rc_past_init_win == 0) {
/*
 * Not enough data has been acknowledged to make a
-* judgement unless we are hardware TLS. Set up
-* the initial TSO based on if we are sending a
-* full IW at once or not.
+* judgement. Set up the initial TSO based on if we
+* are sending a full IW at once or not.
 */

svn commit: r362789 - head/sys/kern

2020-06-29 Thread Andrew Gallatin
Author: gallatin
Date: Mon Jun 29 21:35:50 2020
New Revision: 362789
URL: https://svnweb.freebsd.org/changeset/base/362789

Log:
  Fix a panic when unloading firmware
  
  LIST_FOREACH_SAFE() is not safe in the presence
  of other threads removing list entries when a
  mutex is released.
  
  This is not in the critical path, so just restart
  the scan each time we drop the lock, rather than
  using a marker.
  
  Reviewed by:  jhb, markj
  Sponsored by: Netflix

Modified:
  head/sys/kern/subr_firmware.c

Modified: head/sys/kern/subr_firmware.c
==
--- head/sys/kern/subr_firmware.c   Mon Jun 29 19:30:35 2020
(r362788)
+++ head/sys/kern/subr_firmware.c   Mon Jun 29 21:35:50 2020
(r362789)
@@ -394,14 +394,12 @@ EVENTHANDLER_DEFINE(mountroot, firmware_mountroot, NUL
 static void
 unloadentry(void *unused1, int unused2)
 {
-   struct priv_fw *fp, *tmp;
+   struct priv_fw *fp;
int err;
-   bool changed;
 
mtx_lock(_mtx);
-   changed = false;
 restart:
-   LIST_FOREACH_SAFE(fp, _table, link, tmp) {
+   LIST_FOREACH(fp, _table, link) {
if (fp->file == NULL || fp->refcnt != 0 ||
(fp->flags & FW_UNLOAD) == 0)
continue;
@@ -412,7 +410,6 @@ restart:
 * 2. clear FW_UNLOAD so we don't try this entry again.
 * 3. release the lock while trying to unload the module.
 */
-   changed = true;
fp->flags &= ~FW_UNLOAD;/* do not try again */
 
/*
@@ -422,9 +419,11 @@ restart:
mtx_unlock(_mtx);
err = linker_release_module(NULL, NULL, fp->file);
mtx_lock(_mtx);
-   }
-   if (changed) {
-   changed = false;
+
+   /*
+* When we dropped the lock, another thread could have
+* removed an element, so we must restart the scan.
+*/
goto restart;
}
mtx_unlock(_mtx);
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r362112 - head/sys/x86/x86

2020-06-12 Thread Andrew Gallatin
Author: gallatin
Date: Fri Jun 12 18:41:12 2020
New Revision: 362112
URL: https://svnweb.freebsd.org/changeset/base/362112

Log:
  x86: Bump default msi/msix vector limit to 2048
  
  Given that 64c/128t CPUs are currently available, and that many
  devices (nvme, many NICs) desire to map 1 MSI-X vector per core,
  or even 1 per-thread, it is becoming far easier to see MSI-X interrupt
  setup fail due to msi vector exhaustion, and devices fail to attach at
  boot on large systems.
  
  This bump costs 12KB on amd64 (and 6KB on i386), which seems
  worth the trade off for a better out of the box experience on
  high end hardware.
  
  Reviewed by:  jhb
  MFC after:21 days
  Sponsored by: Netflix

Modified:
  head/sys/x86/x86/msi.c

Modified: head/sys/x86/x86/msi.c
==
--- head/sys/x86/x86/msi.c  Fri Jun 12 18:13:32 2020(r362111)
+++ head/sys/x86/x86/msi.c  Fri Jun 12 18:41:12 2020(r362112)
@@ -156,7 +156,7 @@ u_int first_msi_irq;
 SYSCTL_UINT(_machdep, OID_AUTO, first_msi_irq, CTLFLAG_RD, _msi_irq, 0,
 "Number of first IRQ reserved for MSI and MSI-X interrupts");
 
-u_int num_msi_irqs = 512;
+u_int num_msi_irqs = 2048;
 SYSCTL_UINT(_machdep, OID_AUTO, num_msi_irqs, CTLFLAG_RDTUN, _msi_irqs, 0,
 "Number of IRQs reserved for MSI and MSI-X interrupts");
 
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r347418 - head/sys/net

2020-05-19 Thread Andrew Gallatin

On 2020-05-19 04:21, Kristof Provost wrote:

The if_bnxt driver initialises |.isc_nrxd_max = {INT32_MAX, INT32_MAX, 
INT32_MAX},|, so presumably that’s the cause.
I don’t know what a sane value would be though. I’ve defaulted to 4096 
(because that’s what some other iflib users seems to do) for now, and 
that seems to work. It doesn’t panic and I can get traffic through it at 
least:


You seem to be setting the max, not the default, and 4K max descriptors 
on a 100g device is going to basically cripple it.


How about setting to the next power of 2 below max int so as to keep 
with the author's intent?


If we don't already have a macro, something like  (INT32_MAX >> 1) + 1

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r360982 - head/sys/netinet6

2020-05-12 Thread Andrew Gallatin
Author: gallatin
Date: Tue May 12 17:18:44 2020
New Revision: 360982
URL: https://svnweb.freebsd.org/changeset/base/360982

Log:
  IPv6: Fix a panic in the nd6 code with unmapped mbufs.
  
  If the neighbor entry for an IPv6 TCP session using unmapped
  mbufs times out, IPv6 will send an icmp6 dest. unreachable
  message. In doing this, it will try to do a software checksum
  on the reflected packet. If this is a TCP session using unmapped
  mbufs, then there will be a kernel panic.
  
  To fix this, just free packets with unmapped mbufs, rather
  than sending the icmp.
  
  Reviewed by:  np, rrs
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D24821

Modified:
  head/sys/netinet6/nd6.c

Modified: head/sys/netinet6/nd6.c
==
--- head/sys/netinet6/nd6.c Tue May 12 17:07:28 2020(r360981)
+++ head/sys/netinet6/nd6.c Tue May 12 17:18:44 2020(r360982)
@@ -821,9 +821,27 @@ nd6_llinfo_timer(void *arg)
clear_llinfo_pqueue(ln);
}
nd6_free(, 0);
-   if (m != NULL)
-   icmp6_error2(m, ICMP6_DST_UNREACH,
-   ICMP6_DST_UNREACH_ADDR, 0, ifp);
+   if (m != NULL) {
+   struct mbuf *n = m;
+
+   /*
+* if there are any ummapped mbufs, we
+* must free them, rather than using
+* them for an ICMP, as they cannot be
+* checksummed.
+*/
+   while ((n = n->m_next) != NULL) {
+   if (n->m_flags & M_EXTPG)
+   break;
+   }
+   if (n != NULL) {
+   m_freem(m);
+   m = NULL;
+   } else {
+   icmp6_error2(m, ICMP6_DST_UNREACH,
+   ICMP6_DST_UNREACH_ADDR, 0, ifp);
+   }
+   }
}
break;
case ND6_LLINFO_REACHABLE:
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r360961 - head/sys/netinet6

2020-05-12 Thread Andrew Gallatin
Author: gallatin
Date: Tue May 12 14:01:12 2020
New Revision: 360961
URL: https://svnweb.freebsd.org/changeset/base/360961

Log:
  IPv6: sync IP_NO_SND_TAG_RL support from IPv4
  
  The IP_NO_SND_TAG_RL flag to ip{,6}_output() means that the packets
  being sent should bypass hardware rate limiting. This is typically used
  by modern TCP stacks for rexmits.
  
  This support was added to IPv4 in r352657, but never added to IPv6, even
  though rack and bbr call ip6_output() with this flag.
  
  Reviewed by:  rrs
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D24822

Modified:
  head/sys/netinet6/ip6_output.c

Modified: head/sys/netinet6/ip6_output.c
==
--- head/sys/netinet6/ip6_output.c  Tue May 12 13:23:25 2020
(r360960)
+++ head/sys/netinet6/ip6_output.c  Tue May 12 14:01:12 2020
(r360961)
@@ -322,7 +322,8 @@ ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int h
 
 static int
 ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp,
-struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro)
+struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro,
+bool stamp_tag)
 {
 #ifdef KERN_TLS
struct ktls_session *tls = NULL;
@@ -353,6 +354,10 @@ ip6_output_send(struct inpcb *inp, struct ifnet *ifp, 
error = EAGAIN;
goto done;
}
+   /*
+* Always stamp tags that include NIC ktls.
+*/
+   stamp_tag = true;
}
 #endif
 #ifdef RATELIMIT
@@ -366,7 +371,7 @@ ip6_output_send(struct inpcb *inp, struct ifnet *ifp, 
mst = inp->inp_snd_tag;
}
 #endif
-   if (mst != NULL) {
+   if (stamp_tag && mst != NULL) {
KASSERT(m->m_pkthdr.rcvif == NULL,
("trying to add a send tag to a forwarded packet"));
if (mst->ifp != ifp) {
@@ -1165,7 +1170,8 @@ passout:
m->m_pkthdr.len);
ifa_free(>ia_ifa);
}
-   error = ip6_output_send(inp, ifp, origifp, m, dst, ro);
+   error = ip6_output_send(inp, ifp, origifp, m, dst, ro,
+   (flags & IP_NO_SND_TAG_RL) ? false : true);
goto done;
}
 
@@ -1256,7 +1262,8 @@ sendorfree:
counter_u64_add(ia->ia_ifa.ifa_obytes,
m->m_pkthdr.len);
}
-   error = ip6_output_send(inp, ifp, origifp, m, dst, ro);
+   error = ip6_output_send(inp, ifp, origifp, m, dst, ro,
+   true);
} else
m_freem(m);
}
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r360930 - head/sys/netinet6

2020-05-11 Thread Andrew Gallatin
Author: gallatin
Date: Mon May 11 21:23:22 2020
New Revision: 360930
URL: https://svnweb.freebsd.org/changeset/base/360930

Log:
  Fix the build
  
  Back out the IPv6 portion of r360903, as the stamp_tag param
  is apparently not supported in upstream FreeBSD.
  
  Sponsored by: Netflix
  Pointy hat to: gallatin

Modified:
  head/sys/netinet6/ip6_output.c

Modified: head/sys/netinet6/ip6_output.c
==
--- head/sys/netinet6/ip6_output.c  Mon May 11 21:22:16 2020
(r360929)
+++ head/sys/netinet6/ip6_output.c  Mon May 11 21:23:22 2020
(r360930)
@@ -353,10 +353,6 @@ ip6_output_send(struct inpcb *inp, struct ifnet *ifp, 
error = EAGAIN;
goto done;
}
-   /*
-* Always stamp tags that include NIC ktls.
-*/
-   stamp_tag = true;
}
 #endif
 #ifdef RATELIMIT
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r360914 - in head/sys: netinet netinet6

2020-05-11 Thread Andrew Gallatin
Author: gallatin
Date: Mon May 11 19:17:33 2020
New Revision: 360914
URL: https://svnweb.freebsd.org/changeset/base/360914

Log:
  Ktls: never skip stamping tags for NIC TLS
  
  The newer RACK and BBR TCP stacks have added a mechanism
  to disable hardware packet pacing for TCP retransmits.
  This mechanism works by skipping the send-tag stamp
  on rate-limited connections when the TCP stack calls
  ip_output() with the IP_NO_SND_TAG_RL flag set.
  
  When doing NIC TLS, we must ignore this flag, as
  NIC TLS packets must always be stamped.  Failure
  to stamp a NIC TLS packet will result in crypto
  issues.
  
  Reviewed by:  hselasky, rrs
  Sponsored by: Netflix, Mellanox

Modified:
  head/sys/netinet/ip_output.c
  head/sys/netinet6/ip6_output.c

Modified: head/sys/netinet/ip_output.c
==
--- head/sys/netinet/ip_output.cMon May 11 19:16:49 2020
(r360913)
+++ head/sys/netinet/ip_output.cMon May 11 19:17:33 2020
(r360914)
@@ -242,6 +242,10 @@ ip_output_send(struct inpcb *inp, struct ifnet *ifp, s
error = EAGAIN;
goto done;
}
+   /*
+* Always stamp tags that include NIC ktls.
+*/
+   stamp_tag = true;
}
 #endif
 #ifdef RATELIMIT

Modified: head/sys/netinet6/ip6_output.c
==
--- head/sys/netinet6/ip6_output.c  Mon May 11 19:16:49 2020
(r360913)
+++ head/sys/netinet6/ip6_output.c  Mon May 11 19:17:33 2020
(r360914)
@@ -353,6 +353,10 @@ ip6_output_send(struct inpcb *inp, struct ifnet *ifp, 
error = EAGAIN;
goto done;
}
+   /*
+* Always stamp tags that include NIC ktls.
+*/
+   stamp_tag = true;
}
 #endif
 #ifdef RATELIMIT
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r359920 - head/sys/sys

2020-04-14 Thread Andrew Gallatin
Author: gallatin
Date: Tue Apr 14 14:48:00 2020
New Revision: 359920
URL: https://svnweb.freebsd.org/changeset/base/359920

Log:
  Bump FreeBSD version after r359919 (KTLS / unmapped mbuf changes)
  
  The above changes mbufs, and any module using unmapped mbufs
  would need to be re-compiled.
  
  Sponsored by: Netflix

Modified:
  head/sys/sys/param.h

Modified: head/sys/sys/param.h
==
--- head/sys/sys/param.hTue Apr 14 14:46:06 2020(r359919)
+++ head/sys/sys/param.hTue Apr 14 14:48:00 2020(r359920)
@@ -60,7 +60,7 @@
  * in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1300091  /* Master, propagated to newvers */
+#define __FreeBSD_version 1300092  /* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r359919 - in head/sys: dev/cxgbe dev/cxgbe/crypto dev/cxgbe/tom dev/mlx5/mlx5_en kern netinet netinet6 sys

2020-04-14 Thread Andrew Gallatin
Author: gallatin
Date: Tue Apr 14 14:46:06 2020
New Revision: 359919
URL: https://svnweb.freebsd.org/changeset/base/359919

Log:
  KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself.
  
  While the original implementation of unmapped mbufs was a large
  step forward in terms of reducing cache misses by enabling mbufs
  to carry more than a single page for sendfile, they are rather
  cache unfriendly when accessing the ext_pgs metadata and
  data. This is because the ext_pgs part of the mbuf is allocated
  separately, and almost guaranteed to be cold in cache.
  
  This change takes advantage of the fact that unmapped mbufs
  are never used at the same time as pkthdr mbufs. Given this
  fact, we can overlap the ext_pgs metadata with the mbuf
  pkthdr, and carry the ext_pgs meta directly in the mbuf itself.
  Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array
  of pages) directly after the existing m_ext.
  
  In order to be able to carry 5 pages (which is the minimum
  required for a 16K TLS record which is not perfectly aligned) on
  LP64, I've had to steal ext_arg2. The only user of this in the
  xmit path is sendfile, and I've adjusted it to use arg1 when
  using unmapped mbufs.
  
  This change is almost entirely mechanical, except that we
  change mb_alloc_ext_pgs() to no longer allow allocating
  pkthdrs, the change to avoid ext_arg2 as mentioned above,
  and the removal of the ext_pgs zone.
  
  This change saves roughly 2% "raw" CPU (~59% -> 57%), or over
  3% "scaled" CPU on a Netflix 100% software kTLS workload at
  90+ Gb/s on Broadwell Xeons.
  
  In a follow-on commit, I plan to remove some hacks to avoid
  accessing ext_pgs fields of mbufs, since they will now be in
  cache.
  
  Many thanks to glebius for helping to make this better in
  the Netflix tree.
  
  Reviewed by:  hselasky, jhb, rrs, glebius (early version)
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D24213

Modified:
  head/sys/dev/cxgbe/crypto/t4_kern_tls.c
  head/sys/dev/cxgbe/t4_sge.c
  head/sys/dev/cxgbe/tom/t4_cpl_io.c
  head/sys/dev/cxgbe/tom/t4_tls.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c
  head/sys/kern/kern_mbuf.c
  head/sys/kern/kern_sendfile.c
  head/sys/kern/subr_bus_dma.c
  head/sys/kern/subr_sglist.c
  head/sys/kern/uipc_ktls.c
  head/sys/kern/uipc_mbuf.c
  head/sys/kern/uipc_sockbuf.c
  head/sys/netinet/ip_output.c
  head/sys/netinet/tcp_output.c
  head/sys/netinet6/ip6_output.c
  head/sys/sys/mbuf.h

Modified: head/sys/dev/cxgbe/crypto/t4_kern_tls.c
==
--- head/sys/dev/cxgbe/crypto/t4_kern_tls.c Tue Apr 14 13:32:03 2020
(r359918)
+++ head/sys/dev/cxgbe/crypto/t4_kern_tls.c Tue Apr 14 14:46:06 2020
(r359919)
@@ -905,8 +905,8 @@ ktls_tcp_payload_length(struct tlspcb *tlsp, struct mb
u_int plen, mlen;
 
MBUF_EXT_PGS_ASSERT(m_tls);
-   ext_pgs = m_tls->m_ext.ext_pgs;
-   hdr = (void *)ext_pgs->hdr;
+   ext_pgs = _tls->m_ext_pgs;
+   hdr = (void *)ext_pgs->m_epg_hdr;
plen = ntohs(hdr->tls_length);
 
/*
@@ -961,8 +961,8 @@ ktls_payload_offset(struct tlspcb *tlsp, struct mbuf *
 #endif
 
MBUF_EXT_PGS_ASSERT(m_tls);
-   ext_pgs = m_tls->m_ext.ext_pgs;
-   hdr = (void *)ext_pgs->hdr;
+   ext_pgs = _tls->m_ext_pgs;
+   hdr = (void *)ext_pgs->m_epg_hdr;
plen = ntohs(hdr->tls_length);
 #ifdef INVARIANTS
mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len;
@@ -1008,7 +1008,7 @@ ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struc
u_int imm_len, offset, plen, wr_len, tlen;
 
MBUF_EXT_PGS_ASSERT(m_tls);
-   ext_pgs = m_tls->m_ext.ext_pgs;
+   ext_pgs = _tls->m_ext_pgs;
 
/*
 * Determine the size of the TLS record payload to send
@@ -1040,7 +1040,7 @@ ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struc
return (wr_len);
}
 
-   hdr = (void *)ext_pgs->hdr;
+   hdr = (void *)ext_pgs->m_epg_hdr;
plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - ext_pgs->trail_len;
if (tlen < plen) {
plen = tlen;
@@ -1474,7 +1474,7 @@ ktls_write_tunnel_packet(struct sge_txq *txq, void *ds
 
/* Locate the template TLS header. */
MBUF_EXT_PGS_ASSERT(m_tls);
-   ext_pgs = m_tls->m_ext.ext_pgs;
+   ext_pgs = _tls->m_ext_pgs;
 
/* This should always be the last TLS record in a chain. */
MPASS(m_tls->m_next == NULL);
@@ -1543,8 +1543,8 @@ ktls_write_tunnel_packet(struct sge_txq *txq, void *ds
(m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 
/* Copy the subset of the TLS header requested. */
-   copy_to_txd(>eq, (char *)ext_pgs->hdr + mtod(m_tls, vm_offset_t),
-   , m_tls->m_len);
+   copy_to_txd(>eq, (char *)ext_pgs->m_epg_hdr +
+   mtod(m_tls, vm_offset_t), , m_tls->m_len);
txq->imm_wrs++;
 
   

svn commit: r359908 - head/sys/net

2020-04-13 Thread Andrew Gallatin
Author: gallatin
Date: Mon Apr 13 23:06:56 2020
New Revision: 359908
URL: https://svnweb.freebsd.org/changeset/base/359908

Log:
  lagg: stop double-counting output errors and counting drops as errors
  
  Before this change, lagg double-counted errors from lagg members, and counted
  every drop by a lagg member as an error.  Eg, if lagg sent a packet, and the
  underlying hardware driver dropped it, a counter would be incremented by both
  lagg and the underlying driver.
  
  This change attempts to fix that by incrementing lagg's counters only for
  errors that do not come from underlying drivers.
  
  Reviewed by:  hselasky, jhb
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D24331

Modified:
  head/sys/net/if_lagg.c

Modified: head/sys/net/if_lagg.c
==
--- head/sys/net/if_lagg.c  Mon Apr 13 22:21:01 2020(r359907)
+++ head/sys/net/if_lagg.c  Mon Apr 13 23:06:56 2020(r359908)
@@ -1874,10 +1874,6 @@ lagg_transmit(struct ifnet *ifp, struct mbuf *m)
 
error = lagg_proto_start(sc, m);
LAGG_RUNLOCK();
-
-   if (error != 0)
-   if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
-
return (error);
 }
 
@@ -2100,6 +2096,7 @@ lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
 * port if the link is down or the port is NULL.
 */
if ((lp = lagg_link_active(sc, lp)) == NULL) {
+   if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
m_freem(m);
return (ENETDOWN);
}
@@ -2145,31 +2142,28 @@ lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m
errors++;
break;
}
-
-   ret = lagg_enqueue(last->lp_ifp, m0);
-   if (ret != 0)
-   errors++;
+   lagg_enqueue(last->lp_ifp, m0);
}
last = lp;
}
 
if (last == NULL) {
+   if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
m_freem(m);
return (ENOENT);
}
if ((last = lagg_link_active(sc, last)) == NULL) {
+   errors++;
+   if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
m_freem(m);
return (ENETDOWN);
}
 
ret = lagg_enqueue(last->lp_ifp, m);
-   if (ret != 0)
-   errors++;
+   if (errors != 0)
+   if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
 
-   if (errors == 0)
-   return (ret);
-
-   return (0);
+   return (ret);
 }
 
 static struct mbuf*
@@ -2192,6 +2186,7 @@ lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
 
/* Use the master port if active or the next available port */
if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
+   if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
m_freem(m);
return (ENETDOWN);
}
@@ -2315,6 +2310,7 @@ lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
 * port if the link is down or the port is NULL.
 */
if ((lp = lagg_link_active(sc, lp)) == NULL) {
+   if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
m_freem(m);
return (ENETDOWN);
}
@@ -2386,6 +2382,7 @@ lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
 
lp = lacp_select_tx_port(sc, m);
if (lp == NULL) {
+   if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
m_freem(m);
return (ENETDOWN);
}
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r359474 - head/sys/kern

2020-03-30 Thread Andrew Gallatin
Author: gallatin
Date: Mon Mar 30 23:29:53 2020
New Revision: 359474
URL: https://svnweb.freebsd.org/changeset/base/359474

Log:
  KTLS: Coalesce adjacent TLS trailers & headers to improve PCIe bus efficiency
  
  KTLS uses the embedded header and trailer fields of unmapped
  mbufs. This can lead to "silly" buffer lengths, where we have an
  mbuf chain that will create a scatter/gather list with a
  regular pattern of 13 bytes followed by 16 bytes between each
  adjacent TLS record.
  
  For software ktls we typically wind up with a pattern where we
  have several TLS records encrypted, and made ready at once. When
  these records are made ready, we can coalesce these silly buffers
  in sbready_compress by copying 13b TLS header of the next record
  into the 16b TLS trailer of the current record. After doing so,
  we now have a small 29 byte chunk between each TLS record.
  
  This marginally increases PCIe bus efficiency. We've seen an
  almost 1Gb/s increase in peak throughput on Broadwell based Xeons
  running a 100% software TLS workload with Mellanox ConnectX-4
  NICs.
  
  Note that this change is ifdef'ed for KTLS, as KTLS is currently
  the only user of the hdr/trailer feature of unmapped mbufs, and
  peeking into them is expensive, since the ext_pgs struct lives in
  separately allocated memory, and may be cold in cache.
  
  This optimization is not applicable to HW ("NIC") TLS, as that
  depends on having the entire TLS record described by a single
  unmapped mbuf, so we cannot shift parts of the record between
  mbufs for HW TLS.
  
  Reviewed by:  jhb, hselasky, scottl
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D24204

Modified:
  head/sys/kern/uipc_sockbuf.c

Modified: head/sys/kern/uipc_sockbuf.c
==
--- head/sys/kern/uipc_sockbuf.cMon Mar 30 22:13:32 2020
(r359473)
+++ head/sys/kern/uipc_sockbuf.cMon Mar 30 23:29:53 2020
(r359474)
@@ -112,7 +112,42 @@ sbready_compress(struct sockbuf *sb, struct mbuf *m0, 
 
for (m = m0; m != end; m = m->m_next) {
MPASS((m->m_flags & M_NOTREADY) == 0);
+   /*
+* NB: In sbcompress(), 'n' is the last mbuf in the
+* socket buffer and 'm' is the new mbuf being copied
+* into the trailing space of 'n'.  Here, the roles
+* are reversed and 'n' is the next mbuf after 'm'
+* that is being copied into the trailing space of
+* 'm'.
+*/
+   n = m->m_next;
+#ifdef KERN_TLS
+   /* Try to coalesce adjacent ktls mbuf hdr/trailers. */
+   if ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 &&
+   (m->m_flags & M_NOMAP) &&
+   (n->m_flags & M_NOMAP) &&
+   !mbuf_has_tls_session(m) &&
+   !mbuf_has_tls_session(n)) {
+   struct mbuf_ext_pgs *mpgs, *npgs;
+   int hdr_len, trail_len;
 
+   mpgs = m->m_ext.ext_pgs;
+   npgs = n->m_ext.ext_pgs;
+   hdr_len = npgs->hdr_len;
+   trail_len = mpgs->trail_len;
+   if (trail_len != 0 && hdr_len != 0 &&
+   trail_len + hdr_len <= MBUF_PEXT_TRAIL_LEN) {
+   /* copy n's header to m's trailer */
+   memcpy(>trail[trail_len], npgs->hdr,
+   hdr_len);
+   mpgs->trail_len += hdr_len;
+   m->m_len += hdr_len;
+   npgs->hdr_len = 0;
+   n->m_len -= hdr_len;
+   }
+   }
+#endif
+
/* Compress small unmapped mbufs into plain mbufs. */
if ((m->m_flags & M_NOMAP) && m->m_len <= MLEN &&
!mbuf_has_tls_session(m)) {
@@ -124,15 +159,6 @@ sbready_compress(struct sockbuf *sb, struct mbuf *m0, 
}
}
 
-   /*
-* NB: In sbcompress(), 'n' is the last mbuf in the
-* socket buffer and 'm' is the new mbuf being copied
-* into the trailing space of 'n'.  Here, the roles
-* are reversed and 'n' is the next mbuf after 'm'
-* that is being copied into the trailing space of
-* 'm'.
-*/
-   n = m->m_next;
while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 &&
M_WRITABLE(m) &&
(m->m_flags & M_NOMAP) == 0 &&
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to 

svn commit: r359016 - head/sys/netinet

2020-03-16 Thread Andrew Gallatin
Author: gallatin
Date: Mon Mar 16 14:03:27 2020
New Revision: 359016
URL: https://svnweb.freebsd.org/changeset/base/359016

Log:
  Avoid a cache miss accessing an mbuf ext_pgs pointer when doing SW kTLS.
  
  For a Netflix 90Gb/s 100% TLS software kTLS workload, this reduces
  the CPI of tcp_m_copym() from ~3.5 to ~2.5 as reported by vtune.
  
  Reviewed by:  jtl, rrs
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D23998

Modified:
  head/sys/netinet/tcp_output.c

Modified: head/sys/netinet/tcp_output.c
==
--- head/sys/netinet/tcp_output.c   Mon Mar 16 13:53:29 2020
(r359015)
+++ head/sys/netinet/tcp_output.c   Mon Mar 16 14:03:27 2020
(r359016)
@@ -1907,7 +1907,7 @@ tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *ple
top = NULL;
pkthdrlen = NULL;
 #ifdef KERN_TLS
-   if (m->m_flags & M_NOMAP)
+   if (hw_tls && (m->m_flags & M_NOMAP))
tls = m->m_ext.ext_pgs->tls;
else
tls = NULL;
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r358808 - in head/sys: kern net netinet

2020-03-09 Thread Andrew Gallatin

On 2020-03-09 09:44, Andrew Gallatin wrote:

Author: gallatin
Date: Mon Mar  9 13:44:51 2020
New Revision: 358808
URL: 
https://urldefense.com/v3/__https://svnweb.freebsd.org/changeset/base/358808__;!!OToaGQ!5mmRl2ROq7G4c4x2Xe2uHppYyETGlCRsREj-jHw0ZWcNqt3GhQju3BHBkM_vsrVvkQ$

Log:
   make lacp's use_numa hashing aware of send tags
   
   When I did the use_numa support, I missed the fact that there is

   a separate hash function for send tag nic selection. So when
   use_numa is enabled, ktls offload does not work properly, as it
   does not reliably allocate a send tag on the proper egress nic
   since different egress nics are selected for send-tag allocation
   and packet transmit. To fix this, this change:
   
   - refactors lacp_select_tx_port_by_hash() and

lacp_select_tx_port() to make lacp_select_tx_port_by_hash()
always called by lacp_select_tx_port()
   
   -   pre-shifts flowids to convert them to hashes when calling lacp_select_tx_port_by_hash()
   
   -   adds a numa_domain field to if_snd_tag_alloc_params
   
   -   plumbs the numa domain into places where we allocate send tags
   
   In testing with NIC TLS setup on a NUMA machine, I see thousands

   of output errors before the change when enabling
   kern.ipc.tls.ifnet.permitted=1. After the change, I see no
   errors, and I see the NIC sysctl counters showing active TLS
   offload sessions.
   
   Reviewed by:	rrs, hselasky, jhb

   Sponsored by:Netflix


Forgot: Differential: https://reviews.freebsd.org/D23811

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r358808 - in head/sys: kern net netinet

2020-03-09 Thread Andrew Gallatin
Author: gallatin
Date: Mon Mar  9 13:44:51 2020
New Revision: 358808
URL: https://svnweb.freebsd.org/changeset/base/358808

Log:
  make lacp's use_numa hashing aware of send tags
  
  When I did the use_numa support, I missed the fact that there is
  a separate hash function for send tag nic selection. So when
  use_numa is enabled, ktls offload does not work properly, as it
  does not reliably allocate a send tag on the proper egress nic
  since different egress nics are selected for send-tag allocation
  and packet transmit. To fix this, this change:
  
  - refactors lacp_select_tx_port_by_hash() and
   lacp_select_tx_port() to make lacp_select_tx_port_by_hash()
   always called by lacp_select_tx_port()
  
  -   pre-shifts flowids to convert them to hashes when calling 
lacp_select_tx_port_by_hash()
  
  -   adds a numa_domain field to if_snd_tag_alloc_params
  
  -   plumbs the numa domain into places where we allocate send tags
  
  In testing with NIC TLS setup on a NUMA machine, I see thousands
  of output errors before the change when enabling
  kern.ipc.tls.ifnet.permitted=1. After the change, I see no
  errors, and I see the NIC sysctl counters showing active TLS
  offload sessions.
  
  Reviewed by:  rrs, hselasky, jhb
  Sponsored by: Netflix

Modified:
  head/sys/kern/uipc_ktls.c
  head/sys/net/ieee8023ad_lacp.c
  head/sys/net/ieee8023ad_lacp.h
  head/sys/net/if_lagg.c
  head/sys/net/if_var.h
  head/sys/netinet/in_pcb.c
  head/sys/netinet/tcp_ratelimit.c

Modified: head/sys/kern/uipc_ktls.c
==
--- head/sys/kern/uipc_ktls.c   Mon Mar  9 13:36:45 2020(r358807)
+++ head/sys/kern/uipc_ktls.c   Mon Mar  9 13:44:51 2020(r358808)
@@ -800,6 +800,7 @@ ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_sess
params.hdr.type = IF_SND_TAG_TYPE_TLS;
params.hdr.flowid = inp->inp_flowid;
params.hdr.flowtype = inp->inp_flowtype;
+   params.hdr.numa_domain = inp->inp_numa_domain;
params.tls.inp = inp;
params.tls.tls = tls;
INP_RUNLOCK(inp);

Modified: head/sys/net/ieee8023ad_lacp.c
==
--- head/sys/net/ieee8023ad_lacp.c  Mon Mar  9 13:36:45 2020
(r358807)
+++ head/sys/net/ieee8023ad_lacp.c  Mon Mar  9 13:44:51 2020
(r358808)
@@ -832,13 +832,12 @@ lacp_stop(struct lagg_softc *sc)
 }
 
 struct lagg_port *
-lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m)
+lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t hash, uint8_t 
numa_domain)
 {
struct lacp_softc *lsc = LACP_SOFTC(sc);
struct lacp_portmap *pm;
struct lacp_port *lp;
struct lacp_port **map;
-   uint32_t hash;
int count;
 
if (__predict_false(lsc->lsc_suppress_distributing)) {
@@ -854,10 +853,10 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf
 
 #ifdef NUMA
if ((sc->sc_opts & LAGG_OPT_USE_NUMA) &&
-   pm->pm_num_dom > 1 && m->m_pkthdr.numa_domain < MAXMEMDOM) {
-   count = pm->pm_numa[m->m_pkthdr.numa_domain].count;
+   pm->pm_num_dom > 1 && numa_domain < MAXMEMDOM) {
+   count = pm->pm_numa[numa_domain].count;
if (count > 0) {
-   map = pm->pm_numa[m->m_pkthdr.numa_domain].map;
+   map = pm->pm_numa[numa_domain].map;
} else {
/* No ports on this domain; use global hash. */
map = pm->pm_map;
@@ -869,11 +868,6 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf
map = pm->pm_map;
count = pm->pm_count;
}
-   if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
-   M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
-   hash = m->m_pkthdr.flowid >> sc->flowid_shift;
-   else
-   hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey);
 
hash %= count;
lp = map[hash];
@@ -884,33 +878,22 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf
return (lp->lp_lagg);
 }
 
-#if defined(RATELIMIT) || defined(KERN_TLS)
 struct lagg_port *
-lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid)
+lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m)
 {
struct lacp_softc *lsc = LACP_SOFTC(sc);
-   struct lacp_portmap *pm;
-   struct lacp_port *lp;
uint32_t hash;
+   uint8_t numa_domain;
 
-   if (__predict_false(lsc->lsc_suppress_distributing)) {
-   LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__));
-   return (NULL);
-   }
+   if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
+   M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
+   hash = m->m_pkthdr.flowid >> sc->flowid_shift;
+   else
+   hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey);
 
-   pm = 

svn commit: r356866 - head/sys/vm

2020-01-18 Thread Andrew Gallatin
Author: gallatin
Date: Sat Jan 18 18:25:37 2020
New Revision: 356866
URL: https://svnweb.freebsd.org/changeset/base/356866

Log:
  pcpu_page_alloc: guard against empty NUMA domains
  
  Some systems, such as higher end Threadripper, may have
  NUMA domains with no physical memory. Don't allocate
  from these domains.
  
  This fixes a "panic: vm_wait in early boot" on my 2990WX desktop
  
  Reviewed by:  jeff
  Sponsored by: Netflix

Modified:
  head/sys/vm/uma_core.c

Modified: head/sys/vm/uma_core.c
==
--- head/sys/vm/uma_core.c  Sat Jan 18 10:55:38 2020(r356865)
+++ head/sys/vm/uma_core.c  Sat Jan 18 18:25:37 2020(r356866)
@@ -1521,7 +1521,11 @@ pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int 
p = vm_page_alloc(NULL, 0, flags);
 #else
pc = pcpu_find(cpu);
-   p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
+   if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain)))
+   p = NULL;
+   else
+   p = vm_page_alloc_domain(NULL, 0,
+   pc->pc_domain, flags);
if (__predict_false(p == NULL))
p = vm_page_alloc(NULL, 0, flags);
 #endif
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r354470 - head/sys/dev/hwpmc

2019-11-07 Thread Andrew Gallatin
Author: gallatin
Date: Thu Nov  7 19:54:24 2019
New Revision: 354470
URL: https://svnweb.freebsd.org/changeset/base/354470

Log:
  hwpmc : fix AMD perf counter MSR access
  
  - amd_intr() does not account for the offset (0x200) in the counter
  MSR address and ends up accessing invalid regions while reading
  counter value after the 4th counter (0xC001000[8,9,..]) and
  erroneously updates the counter values for counters [1-4].
  
  - amd_intr() should only check core pmcs for interrupts since
   other types of pmcs (L3,DF) cannot generate interrupts.
  
  - fix pmc NMI's being ignored due to NMI latency on newer AMD processors
  
  Note that this fixes a kernel panic due to GPFs accessing MSRs on
  higher core count AMD cpus (seen on both Rome 7502P, and
  Threadripper 2990WX 32-core CPUs)
  
  Discussed with: markj
  
  Submitted by: Shreyank Amartya
  Differential Revision:https://reviews.freebsd.org/D21553

Modified:
  head/sys/dev/hwpmc/hwpmc_amd.c
  head/sys/dev/hwpmc/hwpmc_amd.h

Modified: head/sys/dev/hwpmc/hwpmc_amd.c
==
--- head/sys/dev/hwpmc/hwpmc_amd.c  Thu Nov  7 19:54:08 2019
(r354469)
+++ head/sys/dev/hwpmc/hwpmc_amd.c  Thu Nov  7 19:54:24 2019
(r354470)
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -53,6 +54,10 @@ __FBSDID("$FreeBSD$");
 enum pmc_class amd_pmc_class;
 #endif
 
+#defineOVERFLOW_WAIT_COUNT 50
+
+DPCPU_DEFINE_STATIC(uint32_t, nmi_counter);
+
 /* AMD K7 & K8 PMCs */
 struct amd_descr {
struct pmc_descr pm_descr;  /* "base class" */
@@ -739,6 +744,7 @@ amd_stop_pmc(int cpu, int ri)
struct pmc_hw *phw;
const struct amd_descr *pd;
uint64_t config;
+   int i;
 
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
("[amd,%d] illegal CPU value %d", __LINE__, cpu));
@@ -761,6 +767,21 @@ amd_stop_pmc(int cpu, int ri)
/* turn off the PMC ENABLE bit */
config = pm->pm_md.pm_amd.pm_amd_evsel & ~AMD_PMC_ENABLE;
wrmsr(pd->pm_evsel, config);
+
+   /*
+* Due to NMI latency on newer AMD processors
+* NMI interrupts are ignored, which leads to
+* panic or messages based on kernel configuraiton
+*/
+
+   /* Wait for the count to be reset */
+   for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) {
+   if (rdmsr(pd->pm_perfctr) & (1 << (pd->pm_descr.pd_width - 1)))
+   break;
+
+   DELAY(1);
+   }
+
return 0;
 }
 
@@ -779,6 +800,7 @@ amd_intr(struct trapframe *tf)
struct pmc *pm;
struct amd_cpu *pac;
pmc_value_t v;
+   uint32_t active = 0, count = 0;
 
cpu = curcpu;
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
@@ -798,19 +820,21 @@ amd_intr(struct trapframe *tf)
 *
 * If found, we call a helper to process the interrupt.
 *
-* If multiple PMCs interrupt at the same time, the AMD64
-* processor appears to deliver as many NMIs as there are
-* outstanding PMC interrupts.  So we process only one NMI
-* interrupt at a time.
+* PMCs interrupting at the same time are collapsed into
+* a single interrupt. Check all the valid pmcs for
+* overflow.
 */
 
-   for (i = 0; retval == 0 && i < AMD_NPMCS; i++) {
+   for (i = 0; i < AMD_CORE_NPMCS; i++) {
 
if ((pm = pac->pc_amdpmcs[i].phw_pmc) == NULL ||
!PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) {
continue;
}
 
+   /* Consider pmc with valid handle as active */
+   active++;
+
if (!AMD_PMC_HAS_OVERFLOWED(i))
continue;
 
@@ -820,8 +844,8 @@ amd_intr(struct trapframe *tf)
continue;
 
/* Stop the PMC, reload count. */
-   evsel   = AMD_PMC_EVSEL_0 + i;
-   perfctr = AMD_PMC_PERFCTR_0 + i;
+   evsel   = amd_pmcdesc[i].pm_evsel;
+   perfctr = amd_pmcdesc[i].pm_perfctr;
v   = pm->pm_sc.pm_reloadcount;
config  = rdmsr(evsel);
 
@@ -837,6 +861,26 @@ amd_intr(struct trapframe *tf)
error = pmc_process_interrupt(PMC_HR, pm, tf);
if (error == 0)
wrmsr(evsel, config);
+   }
+
+   /*
+* Due to NMI latency, there can be a scenario in which
+* multiple pmcs gets serviced in an earlier NMI and we
+* do not find an overflow in the subsequent NMI.
+*
+* For such cases we keep a per-cpu count of active NMIs
+* and compare it with min(active pmcs, 2) to determine
+* if this NMI was for a pmc overflow which was serviced
+* in an earlier request or should be ignored.
+*/
+
+   if (retval) {
+   

svn commit: r354338 - head/sys/x86/x86

2019-11-04 Thread Andrew Gallatin
Author: gallatin
Date: Mon Nov  4 19:30:19 2019
New Revision: 354338
URL: https://svnweb.freebsd.org/changeset/base/354338

Log:
  Add tunable to allow interrupts on hyperthreaded cores
  
  Enabling interrupts on htt cores has benefits to workloads which are primarily
  interrupt driven by increasing the logical cores available for interrupt 
handling.
  The tunable is named machdep.hyperthreading_intr_allowed
  
  Reviewed by:  kib, jhb
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D22233

Modified:
  head/sys/x86/x86/mp_x86.c

Modified: head/sys/x86/x86/mp_x86.c
==
--- head/sys/x86/x86/mp_x86.c   Mon Nov  4 18:34:29 2019(r354337)
+++ head/sys/x86/x86/mp_x86.c   Mon Nov  4 19:30:19 2019(r354338)
@@ -144,6 +144,11 @@ static int hyperthreading_allowed = 1;
 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
_allowed, 0, "Use Intel HTT logical CPUs");
 
+static int hyperthreading_intr_allowed = 0;
+SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
+   _intr_allowed, 0,
+   "Allow interrupts on HTT logical CPUs");
+
 static struct topo_node topo_root;
 
 static int pkg_id_shift;
@@ -1121,7 +1126,8 @@ set_interrupt_apic_ids(void)
continue;
 
/* Don't let hyperthreads service interrupts. */
-   if (cpu_info[apic_id].cpu_hyperthread)
+   if (cpu_info[apic_id].cpu_hyperthread &&
+   !hyperthreading_intr_allowed)
continue;
 
intr_add_cpu(i);
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r354029 - head/sys/vm

2019-10-24 Thread Andrew Gallatin
Author: gallatin
Date: Thu Oct 24 18:39:05 2019
New Revision: 354029
URL: https://svnweb.freebsd.org/changeset/base/354029

Log:
  Add a tunable to set the pgcache zone's maxcache
  
  When it is set to 0 (the default), a heavy Netflix-style web workload
  suffers from heavy lock contention on the vm page free queue called from
  vm_page_zone_{import,release}() as the buckets are frequently drained.
  When setting the maxcache, this contention goes away.
  
  We should eventually try to autotune this, as well as make this
  zone eligible for uma_reclaim().
  
  Reviewed by:  alc, markj
  Not Objected to by: jeff
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D22112

Modified:
  head/sys/vm/vm_page.c

Modified: head/sys/vm/vm_page.c
==
--- head/sys/vm/vm_page.c   Thu Oct 24 18:13:26 2019(r354028)
+++ head/sys/vm/vm_page.c   Thu Oct 24 18:39:05 2019(r354029)
@@ -216,8 +216,10 @@ vm_page_init_cache_zones(void *dummy __unused)
 {
struct vm_domain *vmd;
struct vm_pgcache *pgcache;
-   int domain, pool;
+   int domain, maxcache, pool;
 
+   maxcache = 0;
+   TUNABLE_INT_FETCH("vm.pgcache_zone_max", );
for (domain = 0; domain < vm_ndomains; domain++) {
vmd = VM_DOMAIN(domain);
 
@@ -237,7 +239,7 @@ vm_page_init_cache_zones(void *dummy __unused)
sizeof(struct vm_page), NULL, NULL, NULL, NULL,
vm_page_zone_import, vm_page_zone_release, pgcache,
UMA_ZONE_MAXBUCKET | UMA_ZONE_VM);
-   (void)uma_zone_set_maxcache(pgcache->zone, 0);
+   (void)uma_zone_set_maxcache(pgcache->zone, maxcache);
}
}
 }
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r352816 - in head/sys: kern sys

2019-09-27 Thread Andrew Gallatin
Author: gallatin
Date: Fri Sep 27 20:08:19 2019
New Revision: 352816
URL: https://svnweb.freebsd.org/changeset/base/352816

Log:
  kTLS: Fix a bug where we would not encrypt anon data inplace.
  
  Software Kernel TLS needs to allocate a new destination crypto
  buffer when encrypting data from the page cache, so as to avoid
  overwriting shared clear-text file data with encrypted data
  specific to a single socket. When the data is anonymous, eg, not
  tied to a file, then we can encrypt in place and avoid allocating
  a new page. This fixes a bug where the existing code always
  assumes the data is private, and never encrypts in place. This
  results in unneeded page allocations and potentially more memory
  bandwidth consumption when doing socket writes.
  
  When the code was written at Netflix, ktls_encrypt() looked at
  private sendfile flags to determine if the pages being encrypted
  were part of the page cache (coming from sendfile) or
  anonymous (coming from sosend). This was broken internally at
  Netflix when the sendfile flags were made private, and the
  M_WRITABLE() check was added. Unfortunately, M_WRITABLE() will
  always be false for M_NOMAP mbufs, since one cannot just mtod()
  them.
  
  This change introduces a new flags field to the mbuf_ext_pgs
  struct by stealing a byte from the tls hdr. Note that the current
  header is still 2 bytes larger than the largest header we
  support: AES-CBC with explicit IV. We set MBUF_PEXT_FLAG_ANON
  when creating an unmapped mbuf in m_uiotombuf_nomap() (which is
  the path that socket writes take), and we check for that flag in
  ktls_encrypt() when looking for anon pages.
  
  Reviewed by:  jhb
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D21796

Modified:
  head/sys/kern/kern_mbuf.c
  head/sys/kern/uipc_ktls.c
  head/sys/kern/uipc_mbuf.c
  head/sys/sys/mbuf.h

Modified: head/sys/kern/kern_mbuf.c
==
--- head/sys/kern/kern_mbuf.c   Fri Sep 27 19:26:52 2019(r352815)
+++ head/sys/kern/kern_mbuf.c   Fri Sep 27 20:08:19 2019(r352816)
@@ -1171,6 +1171,7 @@ mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ex
ext_pgs->nrdy = 0;
ext_pgs->first_pg_off = 0;
ext_pgs->last_pg_len = 0;
+   ext_pgs->flags = 0;
ext_pgs->hdr_len = 0;
ext_pgs->trail_len = 0;
ext_pgs->tls = NULL;

Modified: head/sys/kern/uipc_ktls.c
==
--- head/sys/kern/uipc_ktls.c   Fri Sep 27 19:26:52 2019(r352815)
+++ head/sys/kern/uipc_ktls.c   Fri Sep 27 20:08:19 2019(r352816)
@@ -1363,7 +1363,7 @@ ktls_encrypt(struct mbuf_ext_pgs *pgs)
 * (from sendfile), anonymous wired pages are
 * allocated and assigned to the destination iovec.
 */
-   is_anon = M_WRITABLE(m);
+   is_anon = (pgs->flags & MBUF_PEXT_FLAG_ANON) != 0;
 
off = pgs->first_pg_off;
for (i = 0; i < pgs->npgs; i++, off = 0) {
@@ -1416,6 +1416,9 @@ retry_page:
 
/* Use the basic free routine. */
m->m_ext.ext_free = mb_free_mext_pgs;
+
+   /* Pages are now writable. */
+   pgs->flags |= MBUF_PEXT_FLAG_ANON;
}
 
/*

Modified: head/sys/kern/uipc_mbuf.c
==
--- head/sys/kern/uipc_mbuf.c   Fri Sep 27 19:26:52 2019(r352815)
+++ head/sys/kern/uipc_mbuf.c   Fri Sep 27 20:08:19 2019(r352816)
@@ -1664,6 +1664,7 @@ m_uiotombuf_nomap(struct uio *uio, int how, int len, i
prev->m_next = mb;
prev = mb;
pgs = mb->m_ext.ext_pgs;
+   pgs->flags = MBUF_PEXT_FLAG_ANON;
needed = length = MIN(maxseg, total);
for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
 retry_page:

Modified: head/sys/sys/mbuf.h
==
--- head/sys/sys/mbuf.h Fri Sep 27 19:26:52 2019(r352815)
+++ head/sys/sys/mbuf.h Fri Sep 27 20:08:19 2019(r352816)
@@ -312,7 +312,7 @@ struct socket;
  * - 21 (AES-CBC with explicit IV)
  * - 13 (AES-GCM with 8 byte explicit IV)
  */
-#defineMBUF_PEXT_HDR_LEN   24
+#defineMBUF_PEXT_HDR_LEN   23
 
 /*
  * TLS records for TLS 1.0-1.2 can have the following maximum trailer
@@ -333,6 +333,8 @@ struct socket;
 #defineMBUF_PEXT_MAX_BYTES 
\
 (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN)
 
+#define MBUF_PEXT_FLAG_ANON1   /* Data can be encrypted in place. */
+
 /*
  * This struct is 256 bytes in size and is arranged so that the most
  * common case (accessing the first 4 

svn commit: r352814 - in head/sys: kern net opencrypto sys

2019-09-27 Thread Andrew Gallatin
Author: gallatin
Date: Fri Sep 27 19:17:40 2019
New Revision: 352814
URL: https://svnweb.freebsd.org/changeset/base/352814

Log:
  kTLS support for TLS 1.3
  
  TLS 1.3 requires a few changes because 1.3 pretends to be 1.2
  with a record type of application data. The "real" record type is
  then included at the end of the user-supplied plaintext
  data. This required adding a field to the mbuf_ext_pgs struct to
  save the record type, and passing the real record type to the
  sw_encrypt() ktls backend functions.
  
  Reviewed by:  jhb, hselasky
  Sponsored by: Netflix
  Differential Revision:D21801

Modified:
  head/sys/kern/uipc_ktls.c
  head/sys/net/iflib.c
  head/sys/opencrypto/ktls_ocf.c
  head/sys/sys/ktls.h
  head/sys/sys/mbuf.h

Modified: head/sys/kern/uipc_ktls.c
==
--- head/sys/kern/uipc_ktls.c   Fri Sep 27 19:14:03 2019(r352813)
+++ head/sys/kern/uipc_ktls.c   Fri Sep 27 19:17:40 2019(r352814)
@@ -389,14 +389,14 @@ ktls_create_session(struct socket *so, struct tls_enab
if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
return (EINVAL);
if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
-   en->tls_vminor > TLS_MINOR_VER_TWO)
+   en->tls_vminor > TLS_MINOR_VER_THREE)
return (EINVAL);
 
if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
return (EINVAL);
if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
return (EINVAL);
-   if (en->iv_len < 0 || en->iv_len > TLS_MAX_PARAM_SIZE)
+   if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
return (EINVAL);
 
/* All supported algorithms require a cipher key. */
@@ -425,7 +425,10 @@ ktls_create_session(struct socket *so, struct tls_enab
}
if (en->auth_key_len != 0)
return (EINVAL);
-   if (en->iv_len != TLS_AEAD_GCM_LEN)
+   if ((en->tls_vminor == TLS_MINOR_VER_TWO &&
+   en->iv_len != TLS_AEAD_GCM_LEN) ||
+   (en->tls_vminor == TLS_MINOR_VER_THREE &&
+   en->iv_len != TLS_1_3_GCM_IV_LEN))
return (EINVAL);
break;
case CRYPTO_AES_CBC:
@@ -477,8 +480,22 @@ ktls_create_session(struct socket *so, struct tls_enab
tls->params.tls_hlen = sizeof(struct tls_record_layer);
switch (en->cipher_algorithm) {
case CRYPTO_AES_NIST_GCM_16:
-   tls->params.tls_hlen += 8;
+   /*
+* TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
+* nonce.  TLS 1.3 uses a 12 byte implicit IV.
+*/
+   if (en->tls_vminor < TLS_MINOR_VER_THREE)
+   tls->params.tls_hlen += sizeof(uint64_t);
tls->params.tls_tlen = AES_GMAC_HASH_LEN;
+
+   /*
+* TLS 1.3 includes optional padding which we
+* do not support, and also puts the "real" record
+* type at the end of the encrypted data.
+*/
+   if (en->tls_vminor == TLS_MINOR_VER_THREE)
+   tls->params.tls_tlen += sizeof(uint8_t);
+
tls->params.tls_bs = 1;
break;
case CRYPTO_AES_CBC:
@@ -539,7 +556,6 @@ ktls_create_session(struct socket *so, struct tls_enab
 * of the IV are generated in ktls_frame() and ktls_seq().
 */
if (en->iv_len != 0) {
-   MPASS(en->iv_len <= sizeof(tls->params.iv));
tls->params.iv_len = en->iv_len;
error = copyin(en->iv, tls->params.iv, en->iv_len);
if (error)
@@ -1188,8 +1204,21 @@ ktls_frame(struct mbuf *top, struct ktls_session *tls,
/* Populate the TLS header. */
tlshdr = (void *)pgs->hdr;
tlshdr->tls_vmajor = tls->params.tls_vmajor;
-   tlshdr->tls_vminor = tls->params.tls_vminor;
-   tlshdr->tls_type = record_type;
+
+   /*
+* TLS 1.3 masquarades as TLS 1.2 with a record type
+* of TLS_RLTYPE_APP.
+*/
+   if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
+   tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
+   tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
+   tlshdr->tls_type = TLS_RLTYPE_APP;
+   /* save the real record type for later */
+   pgs->record_type = record_type;
+   } else {
+   tlshdr->tls_vminor = tls->params.tls_vminor;
+   tlshdr->tls_type = record_type;
+   }
tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));
 
/*
@@ -1365,7 +1394,8 @@ retry_page:
 

svn commit: r352552 - head/sys/kern

2019-09-20 Thread Andrew Gallatin
Author: gallatin
Date: Fri Sep 20 09:36:07 2019
New Revision: 352552
URL: https://svnweb.freebsd.org/changeset/base/352552

Log:
  remove redundant "ktls" in  KTLS thr name
  
  This reduces the string width of the ktls thread name
  and improves "ps" output.
  
  Glanced at by: jhb
  Event: EuroBSDCon hackathon
  Sponsored by: Netflix

Modified:
  head/sys/kern/uipc_ktls.c

Modified: head/sys/kern/uipc_ktls.c
==
--- head/sys/kern/uipc_ktls.c   Fri Sep 20 09:04:52 2019(r352551)
+++ head/sys/kern/uipc_ktls.c   Fri Sep 20 09:36:07 2019(r352552)
@@ -349,7 +349,7 @@ ktls_init(void *dummy __unused)
STAILQ_INIT(_wq[i].head);
mtx_init(_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
error = kproc_kthread_add(ktls_work_thread, _wq[i],
-   _proc, , 0, 0, "KTLS", "ktls_thr_%d", i);
+   _proc, , 0, 0, "KTLS", "thr_%d", i);
if (error)
panic("Can't add KTLS thread %d error %d", i, error);
 
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r352228 - head/sys/netinet

2019-09-13 Thread Andrew Gallatin

On 2019-09-11 17:16, Conrad Meyer wrote:

Small nitpick:

On Wed, Sep 11, 2019 at 11:48 AM Andrew Gallatin  wrote:

   Note that on a system under a syn flood attack, arc4random()
   becomes quite expensive, and the chacha_poly crypto that it calls


arc4random uses chacha20 — there is no "poly" involved.

Best,
Conrad



Sorry for the mis-statement.  poly is associated with chacha in my mind.
In any case, calling arc4random() and the chacha it uses millions of
times per second is expensive, and avoiding it provides some headroom.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r352228 - head/sys/netinet

2019-09-11 Thread Andrew Gallatin
Author: gallatin
Date: Wed Sep 11 18:48:26 2019
New Revision: 352228
URL: https://svnweb.freebsd.org/changeset/base/352228

Log:
  Avoid unneeded call to arc4random() in syncache_add()
  
  Don't call arc4random() unconditionally to initialize sc_iss, and
  then when syncookies are enabled, just overwrite it with the
  return value from syncookie_generate(). Instead, only call
  arc4random() to initialize sc_iss when syncookies are not
  enabled.
  
  Note that on a system under a syn flood attack, arc4random()
  becomes quite expensive, and the chacha_poly crypto that it calls
  is one of the more expensive things happening on the
  system. Removing this unneeded arc4random() call reduces CPU from
  about 40% to about 35% in my test scenario (Broadwell Xeon, 6Mpps
  syn flood attack).
  
  Reviewed by:  rrs, tuxen, bz
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D21591

Modified:
  head/sys/netinet/tcp_syncache.c

Modified: head/sys/netinet/tcp_syncache.c
==
--- head/sys/netinet/tcp_syncache.c Wed Sep 11 18:40:05 2019
(r352227)
+++ head/sys/netinet/tcp_syncache.c Wed Sep 11 18:48:26 2019
(r352228)
@@ -1543,7 +1543,6 @@ skip_alloc:
sc->sc_todctx = todctx;
 #endif
sc->sc_irs = th->th_seq;
-   sc->sc_iss = arc4random();
sc->sc_flags = 0;
sc->sc_flowlabel = 0;
 
@@ -1617,6 +1616,8 @@ skip_alloc:
 
if (V_tcp_syncookies)
sc->sc_iss = syncookie_generate(sch, sc);
+   else
+   sc->sc_iss = arc4random();
 #ifdef INET6
if (autoflowlabel) {
if (V_tcp_syncookies)
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r346632 - head/sys/net

2019-09-03 Thread Andrew Gallatin
Author: gallatin
Date: Wed Apr 24 13:32:04 2019
New Revision: 346632
URL: https://svnweb.freebsd.org/changeset/base/346632

Log:
  iflib: Add pfil hooks
  
  As with mlx5en, the idea is to drop unwanted traffic as early
  in receive as possible, before mbufs are allocated and anything
  is passed up the stack.  This can save considerable CPU time
  when a machine is under a flooding style DOS attack.
  
  The major change here is to remove the unneeded abstraction where
  callers of rxd_frag_to_sd() get back a pointer to the mbuf ring, and
  are responsible for NULL'ing that mbuf themselves. Now this happens
  directly in rxd_frag_to_sd(), and it returns an mbuf. This allows us
  to use the decision (and potentially mbuf) returned by the pfil
  hooks. The driver can now recycle mbufs to avoid re-allocation when
  packets are dropped.
  
  Reviewed by:  marius  (shurd and erj also provided feedback)
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D19645

Modified:
  head/sys/net/iflib.c

Modified: head/sys/net/iflib.c
==
--- head/sys/net/iflib.cWed Apr 24 13:15:56 2019(r346631)
+++ head/sys/net/iflib.cWed Apr 24 13:32:04 2019(r346632)
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -432,6 +433,7 @@ struct iflib_rxq {
if_ctx_tifr_ctx;
iflib_fl_t  ifr_fl;
uint64_tifr_rx_irq;
+   struct pfil_head*pfil;
uint16_tifr_id;
uint8_t ifr_lro_enabled;
uint8_t ifr_nfl;
@@ -451,7 +453,6 @@ struct iflib_rxq {
 
 typedef struct if_rxsd {
caddr_t *ifsd_cl;
-   struct mbuf **ifsd_m;
iflib_fl_t ifsd_fl;
qidx_t ifsd_cidx;
 } *if_rxsd_t;
@@ -652,7 +653,6 @@ static int iflib_fast_intrs;
 static int iflib_rx_unavail;
 static int iflib_rx_ctx_inactive;
 static int iflib_rx_if_input;
-static int iflib_rx_mbuf_null;
 static int iflib_rxd_flush;
 
 static int iflib_verbose_debug;
@@ -669,8 +669,6 @@ SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLF
   _rx_ctx_inactive, 0, "# times rxeof called with 
inactive context");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
   _rx_if_input, 0, "# times rxeof called if_input");
-SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD,
-  _rx_mbuf_null, 0, "# times rxeof got null mbuf");
 SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
 _rxd_flush, 0, "# times rxd_flush called");
 SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
@@ -689,7 +687,7 @@ iflib_debug_reset(void)
iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
iflib_rx_unavail =
iflib_rx_ctx_inactive = iflib_rx_if_input =
-   iflib_rx_mbuf_null = iflib_rxd_flush = 0;
+   iflib_rxd_flush = 0;
 }
 
 #else
@@ -2002,11 +2000,12 @@ _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int coun
bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
BUS_DMASYNC_PREREAD);
 
-   MPASS(sd_m[frag_idx] == NULL);
-   if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
-   break;
+   if (sd_m[frag_idx] == NULL) {
+   if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
+   break;
+   }
+   sd_m[frag_idx] = m;
}
-   sd_m[frag_idx] = m;
bit_set(fl->ifl_rx_bitmap, frag_idx);
 #if MEMORY_LOGGING
fl->ifl_m_enqueued++;
@@ -2483,13 +2482,15 @@ prefetch_pkts(iflib_fl_t fl, int cidx)
prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
 }
 
-static void
-rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd)
+static struct mbuf *
+rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd,
+int *pf_rv, if_rxd_info_t ri)
 {
-   int flid, cidx;
bus_dmamap_t map;
iflib_fl_t fl;
-   int next;
+   caddr_t payload;
+   struct mbuf *m;
+   int flid, cidx, len, next;
 
map = NULL;
flid = irf->irf_flid;
@@ -2497,7 +2498,7 @@ rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int
fl = >ifr_fl[flid];
sd->ifsd_fl = fl;
sd->ifsd_cidx = cidx;
-   sd->ifsd_m = >ifl_sds.ifsd_m[cidx];
+   m = fl->ifl_sds.ifsd_m[cidx];
sd->ifsd_cl = >ifl_sds.ifsd_cl[cidx];
fl->ifl_credits--;
 #if MEMORY_LOGGING
@@ -2513,39 +2514,89 @@ rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int
/* not valid assert if bxe really does SGE from non-contiguous elements 
*/
MPASS(fl->ifl_cidx == cidx);
bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
+
+   if (rxq->pfil != 

svn commit: r346579 - in head: share/man/man9 sys/dev/cxgbe sys/dev/mlx5/mlx5_en sys/net

2019-09-03 Thread Andrew Gallatin
Author: gallatin
Date: Mon Apr 22 19:24:21 2019
New Revision: 346579
URL: https://svnweb.freebsd.org/changeset/base/346579

Log:
  Track device's NUMA domain in ifnet & alloc ifnet from NUMA local memory
  
  This commit adds new if_alloc_domain() and if_alloc_dev() methods to
  allocate ifnets.  When called with a domain on a NUMA machine,
  if_alloc_domain() will record the NUMA domain in the ifnet, and it will
  allocate the ifnet struct from memory which is local to that NUMA
  node.  Similarly, if_alloc_dev() is a wrapper for if_alloc_domain()
  which uses a driver supplied device_t to call if_alloc_domain() with
  the appropriate domain.
  
  Note that the new if_numa_domain field fits in an alignment pad in
  struct ifnet, and so does not alter the size of the structure.
  
  Reviewed by:  glebius, kib, markj
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D19930

Modified:
  head/share/man/man9/Makefile
  head/share/man/man9/ifnet.9
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  head/sys/net/if.c
  head/sys/net/if_var.h

Modified: head/share/man/man9/Makefile
==
--- head/share/man/man9/MakefileMon Apr 22 19:21:35 2019
(r346578)
+++ head/share/man/man9/MakefileMon Apr 22 19:24:21 2019
(r346579)
@@ -1175,6 +1175,8 @@ MLINKS+=iflibtxrx.9 isc_rxd_available.9 \
iflibtxrx.9 isc_txd_flush.9
 MLINKS+=ifnet.9 if_addmulti.9 \
ifnet.9 if_alloc.9 \
+   ifnet.9 if_alloc_dev.9 \
+   ifnet.9 if_alloc_domain.9 \
ifnet.9 if_allmulti.9 \
ifnet.9 if_attach.9 \
ifnet.9 if_data.9 \

Modified: head/share/man/man9/ifnet.9
==
--- head/share/man/man9/ifnet.9 Mon Apr 22 19:21:35 2019(r346578)
+++ head/share/man/man9/ifnet.9 Mon Apr 22 19:24:21 2019(r346579)
@@ -48,6 +48,10 @@
 .Ss "Interface Manipulation Functions"
 .Ft "struct ifnet *"
 .Fn if_alloc "u_char type"
+.Ft "struct ifnet *"
+.Fn if_alloc_dev "u_char type" "device_t dev"
+.Ft "struct ifnet *"
+.Fn if_alloc_domain "u_char type" "int numa_domain"
 .Ft void
 .Fn if_attach "struct ifnet *ifp"
 .Ft void
@@ -440,6 +444,15 @@ It is used to cache the type passed to
 but unlike
 .Va if_type ,
 it would not be changed by drivers.
+.It Va if_numa_domain
+.Pq Vt uint8_t
+The NUMA domain of the hardware device associated with the interface.
+This is filled in with a wildcard value unless the kernel is NUMA
+aware, the system is a NUMA system, and the ifnet is allocated
+using
+.Fn if_alloc_dev
+or
+.Fn if_alloc_domain .
 .El
 .Pp
 References to
@@ -1151,6 +1164,24 @@ include the allocation of a
 .Fa type
 specific structure in
 .Va if_l2com .
+.It Fn if_alloc_dev
+Allocate and initialize
+.Vt "struct ifnet"
+as
+.Fn if_alloc
+does, with the addition that the ifnet can be tagged with the
+appropriate NUMA domain derived from the
+.Fa dev
+argument passed by the caller.
+.It Fn if_alloc_domain
+Allocate and initialize
+.Vt "struct ifnet"
+as
+.Fn if_alloc
+does, with the addition that the ifnet will be tagged with the NUMA
+domain via the
+.Fa numa_domain
+argument passed by the caller.
 .It Fn if_attach
 Link the specified interface
 .Fa ifp
@@ -1168,7 +1199,10 @@ function.)
 The
 .Fa ifp
 must have been allocated by
-.Fn if_alloc .
+.Fn if_alloc ,
+.Fn if_alloc_dev
+or
+.Fn if_alloc_domain .
 .It Fn if_detach
 Shut down and unlink the specified
 .Fa ifp

Modified: head/sys/dev/cxgbe/t4_main.c
==
--- head/sys/dev/cxgbe/t4_main.cMon Apr 22 19:21:35 2019
(r346578)
+++ head/sys/dev/cxgbe/t4_main.cMon Apr 22 19:24:21 2019
(r346579)
@@ -1636,7 +1636,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
callout_init(>tick, 1);
 
/* Allocate an ifnet and set it up */
-   ifp = if_alloc(IFT_ETHER);
+   ifp = if_alloc_dev(IFT_ETHER, dev);
if (ifp == NULL) {
device_printf(dev, "Cannot allocate ifnet\n");
return (ENOMEM);

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 22 19:21:35 2019
(r346578)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 22 19:24:21 2019
(r346579)
@@ -3682,7 +3682,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
M_MLX5EN, M_WAITOK | M_ZERO);
mlx5e_priv_mtx_init(priv);
 
-   ifp = priv->ifp = if_alloc(IFT_ETHER);
+   ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev);
if (ifp == NULL) {
mlx5_core_err(mdev, "if_alloc() failed\n");
goto err_free_priv;

Modified: head/sys/net/if.c

svn commit: r346281 - head/sys/sys

2019-09-03 Thread Andrew Gallatin
Author: gallatin
Date: Tue Apr 16 16:49:34 2019
New Revision: 346281
URL: https://svnweb.freebsd.org/changeset/base/346281

Log:
  Replace cosqos with numa_domain in mbuf pkthdr
  
  The cosqos field was added nearly 6 years ago in r254804, and it is
  still unused by any in-tree consumers.  I have a patchset that I'm
  working on which aligns many network resources by NUMA domain,
  including inps, inpcb lb group, tcp pacing, lagg output link
  selection, backing pages for sendfile, and more.  It reduces
  cross-domain traffic by roughly 50% for a real web workload.
  
  This patchset relies on being able to store the numa domain in the
  mbuf, and grabbing the unused cosqos field for this purpose is the
  first step in starting to upstream it.
  
  Reviewed by:  kib, markj
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D19862

Modified:
  head/sys/sys/mbuf.h

Modified: head/sys/sys/mbuf.h
==
--- head/sys/sys/mbuf.h Tue Apr 16 15:52:04 2019(r346280)
+++ head/sys/sys/mbuf.h Tue Apr 16 16:49:34 2019(r346281)
@@ -98,6 +98,7 @@ struct mbuf;
 #defineMLEN((int)(MSIZE - MHSIZE))
 #defineMHLEN   ((int)(MSIZE - MPKTHSIZE))
 #defineMINCLSIZE   (MHLEN + 1)
+#defineM_NODOM 255
 
 #ifdef _KERNEL
 /*-
@@ -158,7 +159,7 @@ struct pkthdr {
uint32_t flowid;/* packet's 4-tuple system */
uint32_t csum_flags;/* checksum and offload features */
uint16_t fibnum;/* this packet should use this fib */
-   uint8_t  cosqos;/* class/quality of service */
+   uint8_t  numa_domain;   /* NUMA domain of recvd pkt */
uint8_t  rsstype;   /* hash type */
union {
uint64_trcv_tstmp;  /* timestamp in ns */
@@ -405,33 +406,6 @@ struct mbuf {
 #defineM_HASHTYPE_SET(m, v)((m)->m_pkthdr.rsstype = (v))
 #defineM_HASHTYPE_TEST(m, v)   (M_HASHTYPE_GET(m) == (v))
 #defineM_HASHTYPE_ISHASH(m)(M_HASHTYPE_GET(m) & 
M_HASHTYPE_HASHPROP)
-
-/*
- * COS/QOS class and quality of service tags.
- * It uses DSCP code points as base.
- */
-#defineQOS_DSCP_CS00x00
-#defineQOS_DSCP_DEFQOS_DSCP_CS0
-#defineQOS_DSCP_CS10x20
-#defineQOS_DSCP_AF11   0x28
-#defineQOS_DSCP_AF12   0x30
-#defineQOS_DSCP_AF13   0x38
-#defineQOS_DSCP_CS20x40
-#defineQOS_DSCP_AF21   0x48
-#defineQOS_DSCP_AF22   0x50
-#defineQOS_DSCP_AF23   0x58
-#defineQOS_DSCP_CS30x60
-#defineQOS_DSCP_AF31   0x68
-#defineQOS_DSCP_AF32   0x70
-#defineQOS_DSCP_AF33   0x78
-#defineQOS_DSCP_CS40x80
-#defineQOS_DSCP_AF41   0x88
-#defineQOS_DSCP_AF42   0x90
-#defineQOS_DSCP_AF43   0x98
-#defineQOS_DSCP_CS50xa0
-#defineQOS_DSCP_EF 0xb8
-#defineQOS_DSCP_CS60xc0
-#defineQOS_DSCP_CS70xe0
 
 /*
  * External mbuf storage buffer types.


___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r346247 - head/sys/dev/mlx5/mlx5_en

2019-09-03 Thread Andrew Gallatin
Author: gallatin
Date: Mon Apr 15 17:14:50 2019
New Revision: 346247
URL: https://svnweb.freebsd.org/changeset/base/346247

Log:
  mlx5en: Enable new pfil(9) KPI ethernet filtering hooks
  
  This allows efficient filtering at packet ingress on mlx5en.
  
  Note that the packets are filtered (and potentially dropped) *before*
  the driver has committed to (re)allocating an mbuf for the
  packet. Dropped packets are treated essentially the same as an
  error. Nothing is allocated, and the existing buffer is recycled. This
  allows us to drop malicious packets at close to line rate with very
  little CPU use.
  
  Reviewed by:  hselasky, slavash, kib
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D19063

Modified:
  head/sys/dev/mlx5/mlx5_en/en.h
  head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c

Modified: head/sys/dev/mlx5/mlx5_en/en.h
==
--- head/sys/dev/mlx5/mlx5_en/en.h  Mon Apr 15 16:57:27 2019
(r346246)
+++ head/sys/dev/mlx5/mlx5_en/en.h  Mon Apr 15 17:14:50 2019
(r346247)
@@ -48,6 +48,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -838,6 +839,7 @@ struct mlx5e_priv {
struct mlx5e_clbr_point clbr_points[2];
u_int   clbr_gen;
 
+   struct pfil_head *pfil;
struct mlx5e_channel channel[];
 };
 

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 15 16:57:27 2019
(r346246)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 15 17:14:50 2019
(r346247)
@@ -3664,6 +3664,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
struct sysctl_oid_list *child;
int ncv = mdev->priv.eq_table.num_comp_vectors;
char unit[16];
+   struct pfil_head_args pa;
int err;
int i;
u32 eth_proto_cap;
@@ -3898,6 +3899,12 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
callout_init(>tstmp_clbr, CALLOUT_DIRECT);
mlx5e_reset_calibration_callout(priv);
 
+   pa.pa_version = PFIL_VERSION;
+   pa.pa_flags = PFIL_IN;
+   pa.pa_type = PFIL_TYPE_ETHERNET;
+   pa.pa_headname = ifp->if_xname;
+   priv->pfil = pfil_head_register();
+
return (priv);
 
 #ifdef RATELIMIT
@@ -3972,6 +3979,12 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vp
if_printf(priv->ifp, "Waiting for all unlimited connections "
"to terminate\n");
pause("W", hz);
+   }
+
+   /* deregister pfil */
+   if (priv->pfil != NULL) {
+   pfil_head_unregister(priv->pfil);
+   priv->pfil = NULL;
}
 
/* unregister device */

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
==
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c  Mon Apr 15 16:57:27 2019
(r346246)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c  Mon Apr 15 17:14:50 2019
(r346247)
@@ -430,15 +430,18 @@ mlx5e_decompress_cqes(struct mlx5e_cq *cq)
 static int
 mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget)
 {
-   int i;
+   struct pfil_head *pfil;
+   int i, rv;
 
+   CURVNET_SET_QUIET(rq->ifp->if_vnet);
+   pfil = rq->channel->priv->pfil;
for (i = 0; i < budget; i++) {
struct mlx5e_rx_wqe *wqe;
struct mlx5_cqe64 *cqe;
struct mbuf *mb;
__be16 wqe_counter_be;
u16 wqe_counter;
-   u32 byte_cnt;
+   u32 byte_cnt, seglen;
 
cqe = mlx5e_get_cqe(>cq);
if (!cqe)
@@ -462,6 +465,39 @@ mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget)
rq->stats.wqe_err++;
goto wq_ll_pop;
}
+   if (pfil != NULL && PFIL_HOOKED_IN(pfil)) {
+   seglen = MIN(byte_cnt, MLX5E_MAX_RX_BYTES);
+   rv = pfil_run_hooks(rq->channel->priv->pfil,
+   rq->mbuf[wqe_counter].data, rq->ifp,
+   seglen | PFIL_MEMPTR | PFIL_IN, NULL);
+
+   switch (rv) {
+   case PFIL_DROPPED:
+   case PFIL_CONSUMED:
+   /*
+* Filter dropped or consumed it. In
+* either case, we can just recycle
+* buffer; there is no more work to do.
+*/
+   rq->stats.packets++;
+   goto wq_ll_pop;
+   case PFIL_REALLOCED:
+   /*
+* Filter copied it; recycle buffer
+

Re: svn commit: r351200 - in head/sys: amd64/amd64 dev/acpica

2019-08-19 Thread Andrew Gallatin

On 2019-08-18 19:44, Jeff Roberson wrote:

Author: jeff
Date: Sun Aug 18 23:44:23 2019
New Revision: 351200

<..>

Log:
   Allocate all per-cpu datastructures in domain correct memory.
   
   Reviewed by:	kib, gallatin (some objections)


No objection to what you actually committed.  The only objection was
the issue I found on non-NUMA, which you fixed in the committed code.


Thanks!

Drew


___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r350245 - head/usr.sbin/pciconf

2019-07-23 Thread Andrew Gallatin
Author: gallatin
Date: Tue Jul 23 16:28:17 2019
New Revision: 350245
URL: https://svnweb.freebsd.org/changeset/base/350245

Log:
  pciconf: report PCI Gen4 speeds
  
  PCIe gen4 runs at 16GT/s.  Report this as
  the speed of Gen4 links.
  
  Reviewed by:  imp
  MFC after:7 days
  Sponsored by: Netflix

Modified:
  head/usr.sbin/pciconf/cap.c

Modified: head/usr.sbin/pciconf/cap.c
==
--- head/usr.sbin/pciconf/cap.c Tue Jul 23 16:27:36 2019(r350244)
+++ head/usr.sbin/pciconf/cap.c Tue Jul 23 16:28:17 2019(r350245)
@@ -389,6 +389,8 @@ link_speed_string(uint8_t speed)
return ("5.0");
case 3:
return ("8.0");
+   case 4:
+   return ("16.0");
default:
return ("undef");
}
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r349055 - head/sys/net

2019-06-15 Thread Andrew Gallatin

On 2019-06-15 11:59, Marius Strobl wrote:

On Sat, Jun 15, 2019 at 09:08:05AM -0400, Andrew Gallatin wrote:

On 2019-06-15 07:07, Marius Strobl wrote:

Author: marius
Date: Sat Jun 15 11:07:41 2019
New Revision: 349055



Log:
- Replace unused and only ever written to members of public iflib(9)
  structs with placeholders (in the latter case, IFLIB_MAX_TX_BYTES
  etc. are also only ever used for these write-only members if at all,
  so both these macros and members can just go). Using these spares
  may render it possible to merge certain iflib(9) fixes to stable/12.
  Otherwise, changes extending struct if_irq or struct if_shared_ctx
  in any way would break KBI as instances of these are allocated by
  the driver front-ends (by contrast, struct if_pkt_info as well as
  struct if_softc_ctx instances are provided by iflib(9) and, thus,
  may grow at least at the end without breaking KBI).


Given the above, why replace ipi_tcp_sum in if_pkt_info with a spare?
Given that if_pkt_info can grow, I would also expect it to be able to
shrink.  So I don't quite see why the spare is needed here.

I also worry about carrying the other spares around forever.


Yes, KBI-wise it should be also safe for instances of structures allocated
by iflib(9) to shrink at the end (though shrinking structures usually isn't
a concern when MFCing as such parts may just be omitted); changes altering
the offsets of members would be a problem regarding KBI.
Still, I don't like changing the size of publicly visible structures in
stable branches without a real good reason even if such a change doesn't
strictly break the KBI. So the plan is to MFC the spares but then to get
rid of the ones whose removal doesn't break KBI in head.

Marius



Thanks, that makes sense

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r349055 - head/sys/net

2019-06-15 Thread Andrew Gallatin

On 2019-06-15 07:07, Marius Strobl wrote:

Author: marius
Date: Sat Jun 15 11:07:41 2019
New Revision: 349055



Log:
   - Replace unused and only ever written to members of public iflib(9)
 structs with placeholders (in the latter case, IFLIB_MAX_TX_BYTES
 etc. are also only ever used for these write-only members if at all,
 so both these macros and members can just go). Using these spares
 may render it possible to merge certain iflib(9) fixes to stable/12.
 Otherwise, changes extending struct if_irq or struct if_shared_ctx
 in any way would break KBI as instances of these are allocated by
 the driver front-ends (by contrast, struct if_pkt_info as well as
 struct if_softc_ctx instances are provided by iflib(9) and, thus,
 may grow at least at the end without breaking KBI).


Given the above, why replace ipi_tcp_sum in if_pkt_info with a spare? 
Given that if_pkt_info can grow, I would also expect it to be able to 
shrink.  So I don't quite see why the spare is needed here.


I also worry about carrying the other spares around forever.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r348241 - head

2019-05-24 Thread Andrew Gallatin

On 2019-05-24 11:45, Mark Johnston wrote:


   Modernize the MAKE_JUST_KERNELS hint in the top-level makefile.
   
   It doesn't make sense to limit to -j12 anymore, build scalability

   is better than it used to be.  Fold the hint into the description
   of the universe target.
   
   Reviewed by:	imp



Dumb question about this: Will it update toolchains, or just use what
it can find?


Thanks,

Drew

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r348109 - in head/sys/x86: include x86

2019-05-23 Thread Andrew Gallatin

On 2019-05-22 13:09, Andriy Gapon wrote:

On 22/05/2019 16:44, Andrew Gallatin wrote:

   This is needed for AMD SMCA processors, as SMCA uses different
   MSR address for access MCA banks.


Just curious, what is SMCA?




" Scalable Machine Check Architecture "

See 
https://www.nextplatform.com/2017/07/12/heart-amds-epyc-comeback-infinity-fabric/


Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r348109 - in head/sys/x86: include x86

2019-05-22 Thread Andrew Gallatin
Author: gallatin
Date: Wed May 22 13:44:15 2019
New Revision: 348109
URL: https://svnweb.freebsd.org/changeset/base/348109

Log:
  x86 MCA: introduce MCA hooks for different vendor implementations
  
  This is needed for AMD SMCA processors, as SMCA uses different
  MSR addresses to access MCA banks.
  
  Use IA32-specific msr_ops as default, and use SMCA-specific msr_ops
  when on an SMCA-enabled processor.
  
  Submitted by: chandu from amd dot com
  Reviewed by:  cem
  Differential Revision:https://reviews.freebsd.org/D18055

Modified:
  head/sys/x86/include/specialreg.h
  head/sys/x86/x86/mca.c

Modified: head/sys/x86/include/specialreg.h
==
--- head/sys/x86/include/specialreg.h   Wed May 22 08:30:03 2019
(r348108)
+++ head/sys/x86/include/specialreg.h   Wed May 22 13:44:15 2019
(r348109)
@@ -944,6 +944,16 @@
 #defineMC_MISC_AMD_PTR_MASK0xff00  /* Pointer to 
additional registers */
 #defineMC_MISC_AMD_PTR_SHIFT   24
 
+/* AMD Scalable MCA */
+#define MSR_SMCA_MC0_CTL  0xc0002000
+#define MSR_SMCA_MC0_STATUS   0xc0002001
+#define MSR_SMCA_MC0_ADDR 0xc0002002
+#define MSR_SMCA_MC0_MISC0 0xc0002003
+#define MSR_SMCA_MC_CTL(x)   (MSR_SMCA_MC0_CTL + 0x10 * (x))
+#define MSR_SMCA_MC_STATUS(x)(MSR_SMCA_MC0_STATUS + 0x10 * (x))
+#define MSR_SMCA_MC_ADDR(x)  (MSR_SMCA_MC0_ADDR + 0x10 * (x))
+#define MSR_SMCA_MC_MISC(x)  (MSR_SMCA_MC0_MISC0 + 0x10 * (x))
+
 /*
  * The following four 3-byte registers control the non-cacheable regions.
  * These registers must be written as three separate bytes.

Modified: head/sys/x86/x86/mca.c
==
--- head/sys/x86/x86/mca.c  Wed May 22 08:30:03 2019(r348108)
+++ head/sys/x86/x86/mca.c  Wed May 22 13:44:15 2019(r348109)
@@ -90,6 +90,13 @@ struct mca_internal {
STAILQ_ENTRY(mca_internal) link;
 };
 
+struct mca_enumerator_ops {
+unsigned int (*ctl)(int);
+unsigned int (*status)(int);
+unsigned int (*addr)(int);
+unsigned int (*misc)(int);
+};
+
 static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
 
 static volatile int mca_count; /* Number of records stored. */
@@ -124,6 +131,61 @@ static struct taskqueue *mca_tq;
 static struct task mca_refill_task, mca_scan_task;
 static struct mtx mca_lock;
 
+static unsigned int
+mca_ia32_ctl_reg(int bank)
+{
+   return (MSR_MC_CTL(bank));
+}
+
+static unsigned int
+mca_ia32_status_reg(int bank)
+{
+   return (MSR_MC_STATUS(bank));
+}
+
+static unsigned int
+mca_ia32_addr_reg(int bank)
+{
+   return (MSR_MC_ADDR(bank));
+}
+
+static unsigned int
+mca_ia32_misc_reg(int bank)
+{
+   return (MSR_MC_MISC(bank));
+}
+
+static unsigned int
+mca_smca_ctl_reg(int bank)
+{
+return (MSR_SMCA_MC_CTL(bank));
+}
+
+static unsigned int
+mca_smca_status_reg(int bank)
+{
+return (MSR_SMCA_MC_STATUS(bank));
+}
+
+static unsigned int
+mca_smca_addr_reg(int bank)
+{
+return (MSR_SMCA_MC_ADDR(bank));
+}
+
+static unsigned int
+mca_smca_misc_reg(int bank)
+{
+return (MSR_SMCA_MC_MISC(bank));
+}
+
+static struct mca_enumerator_ops mca_msr_ops = {
+.ctl= mca_ia32_ctl_reg,
+.status = mca_ia32_status_reg,
+.addr   = mca_ia32_addr_reg,
+.misc   = mca_ia32_misc_reg
+};
+
 #ifdef DEV_APIC
 static struct cmc_state **cmc_state;   /* Indexed by cpuid, bank. */
 static struct amd_et_state **amd_et_state; /* Indexed by cpuid, bank. */
@@ -462,7 +524,7 @@ mca_check_status(int bank, struct mca_record *rec)
uint64_t status;
u_int p[4];
 
-   status = rdmsr(MSR_MC_STATUS(bank));
+   status = rdmsr(mca_msr_ops.status(bank));
if (!(status & MC_STATUS_VAL))
return (0);
 
@@ -471,10 +533,10 @@ mca_check_status(int bank, struct mca_record *rec)
rec->mr_bank = bank;
rec->mr_addr = 0;
if (status & MC_STATUS_ADDRV)
-   rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
+   rec->mr_addr = rdmsr(mca_msr_ops.addr(bank));
rec->mr_misc = 0;
if (status & MC_STATUS_MISCV)
-   rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
+   rec->mr_misc = rdmsr(mca_msr_ops.misc(bank));
rec->mr_tsc = rdtsc();
rec->mr_apic_id = PCPU_GET(apic_id);
rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
@@ -488,7 +550,7 @@ mca_check_status(int bank, struct mca_record *rec)
 * errors so that the BIOS can see them.
 */
if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
-   wrmsr(MSR_MC_STATUS(bank), 0);
+   wrmsr(mca_msr_ops.status(bank), 0);
do_cpuid(0, p);
}
return (1);
@@ -648,7 +710,7 @@ amd_thresholding_update(enum scan_mode mode, int bank,
int count;
 
cc = 

Re: svn commit: r347410 - in head: . sys/amd64/conf sys/arm/conf sys/arm64/conf sys/i386/conf sys/powerpc/conf sys/riscv/conf sys/sparc64/conf

2019-05-11 Thread Andrew Gallatin

On 2019-05-10 11:50, Kristof Provost wrote:

On 10 May 2019, at 8:31, Andrew Gallatin wrote:

On 2019-05-10 08:44, Slawa Olhovchenkov wrote:

pf have ifdef for IPSEC, but don't have support IPSEC_SUPPORT
(netpfil/pf/if_pfsync.c).

Thanks for pointing this out. It seems like IPSEC_SUPPORT would work
for this. I've made a patch, and it compiles and the pf module loads.
However, I have no knowledge of how to test it. Is this something
that you use, and which you can test?

I suspect this code has not actually been enabled for a long time.
gettdb() doesn’t actually appear to be defined anywhere, so I wouldn’t 
expect it to ever compile.


gettdb() does exist in OpenBSD, so my current guess is that this is just 
an import artefact, and we should |#ifdef OPENBSD| it or something, or 
just remove it completely.


For completeness, and because I never shut up about this: to test pf 
|kldload pfsync|, |cd /usr/tests/sys/netpfil/pf| and |sudo kyua test|


There’s more information in the current edition of the FreeBSD journal.

Regards,
Kristof



Thanks, you are correct.  Including options_ipsec.h reveals that the 
code does not even compile (cannot find gettdb(), which does not appear 
to be defined anywhere in our tree).


Given that it is dead code, I'd rather just not touch it.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r347410 - in head: . sys/amd64/conf sys/arm/conf sys/arm64/conf sys/i386/conf sys/powerpc/conf sys/riscv/conf sys/sparc64/conf

2019-05-10 Thread Andrew Gallatin

On 2019-05-10 08:44, Slawa Olhovchenkov wrote:


pf have ifdef for IPSEC, but don't have support IPSEC_SUPPORT
(netpfil/pf/if_pfsync.c).



Thanks for pointing this out.  It seems like IPSEC_SUPPORT would work 
for this.  I've made a patch, and it compiles and the pf module loads.

However, I have no knowledge of how to test it.  Is this something
that you use, and which you can test?

Thanks,

Drew

diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c
index 45b1e090f95c..cc06637b862e 100644
--- a/sys/netpfil/pf/if_pfsync.c
+++ b/sys/netpfil/pf/if_pfsync.c
@@ -308,7 +308,7 @@ static void	pfsync_bulk_update(void *);
 static void	pfsync_bulk_fail(void *);
 
 static void	pfsync_detach_ifnet(struct ifnet *);
-#ifdef IPSEC
+#ifdef IPSEC_SUPPORT
 static void	pfsync_update_net_tdb(struct pfsync_tdb *);
 #endif
 static struct pfsync_bucket	*pfsync_get_bucket(struct pfsync_softc *,
@@ -1228,7 +1228,7 @@ pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	int len = count * sizeof(struct pfsync_tdb);
 
-#if defined(IPSEC)
+#if defined(IPSEC_SUPPORT)
 	struct pfsync_tdb *tp;
 	struct mbuf *mp;
 	int offp;
@@ -1249,7 +1249,7 @@ pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 	return (len);
 }
 
-#if defined(IPSEC)
+#if defined(IPSEC_SUPPORT)
 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
 static void
 pfsync_update_net_tdb(struct pfsync_tdb *pt)
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r347430 - in head/sys: kern netinet sys

2019-05-10 Thread Andrew Gallatin
Author: gallatin
Date: Fri May 10 13:41:19 2019
New Revision: 347430
URL: https://svnweb.freebsd.org/changeset/base/347430

Log:
  Bind TCP HPTS (pacer) threads to NUMA domains
  
  Bind the TCP pacer threads to NUMA domains and build per-domain
  pacer-thread lookup tables. These tables allow us to use the
  inpcb's NUMA domain information to match an inpcb with a pacer
  thread on the same domain.
  
  The motivation for this is to keep the TCP connection local to a
  NUMA domain as much as possible.
  
  Thanks to jhb for pre-reviewing an earlier version of the patch.
  
  Reviewed by:  rrs
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D20134

Modified:
  head/sys/kern/kern_intr.c
  head/sys/netinet/tcp_hpts.c
  head/sys/sys/interrupt.h

Modified: head/sys/kern/kern_intr.c
==
--- head/sys/kern/kern_intr.c   Fri May 10 13:18:22 2019(r347429)
+++ head/sys/kern/kern_intr.c   Fri May 10 13:41:19 2019(r347430)
@@ -380,6 +380,25 @@ intr_event_bind_ithread(struct intr_event *ie, int cpu
return (_intr_event_bind(ie, cpu, false, true));
 }
 
+/*
+ * Bind an interrupt event's ithread to the specified cpuset.
+ */
+int
+intr_event_bind_ithread_cpuset(struct intr_event *ie, cpuset_t *cs)
+{
+   lwpid_t id;
+
+   mtx_lock(&ie->ie_lock);
+   if (ie->ie_thread != NULL) {
+   id = ie->ie_thread->it_thread->td_tid;
+   mtx_unlock(&ie->ie_lock);
+   return (cpuset_setthread(id, cs));
+   } else {
+   mtx_unlock(&ie->ie_lock);
+   }
+   return (ENODEV);
+}
+
 static struct intr_event *
 intr_lookup(int irq)
 {

Modified: head/sys/netinet/tcp_hpts.c
==
--- head/sys/netinet/tcp_hpts.c Fri May 10 13:18:22 2019(r347429)
+++ head/sys/netinet/tcp_hpts.c Fri May 10 13:41:19 2019(r347430)
@@ -131,6 +131,7 @@ __FBSDID("$FreeBSD$");
 #include 
 
 #include 
+#include 
 
 #include 
 #include 
@@ -171,7 +172,7 @@ MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
 #include 
 static int tcp_bind_threads = 1;
 #else
-static int tcp_bind_threads = 0;
+static int tcp_bind_threads = 2;
 #endif
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
 
@@ -207,6 +208,13 @@ static int32_t logging_on = 0;
 static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
 static int32_t tcp_hpts_precision = 120;
 
+struct hpts_domain_info {
+   int count;
+   int cpu[MAXCPU];
+};
+
+struct hpts_domain_info hpts_domains[MAXMEMDOM];
+
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
 &tcp_hpts_precision, 120,
 "Value for PRE() precision of callout");
@@ -1079,8 +1087,10 @@ hpts_random_cpu(struct inpcb *inp){
 static uint16_t
 hpts_cpuid(struct inpcb *inp){
u_int cpuid;
+#ifdef NUMA
+   struct hpts_domain_info *di;
+#endif
 
-
/*
 * If one has been set use it i.e. we want both in and out on the
 * same hpts.
@@ -1103,11 +1113,21 @@ hpts_cpuid(struct inpcb *inp){
 * unknown cpuids to curcpu.  Not the best, but apparently better
 * than defaulting to swi 0.
 */
-   if (inp->inp_flowtype != M_HASHTYPE_NONE) {
+   
+   if (inp->inp_flowtype == M_HASHTYPE_NONE)
+   return (hpts_random_cpu(inp));
+   /*
+* Hash to a thread based on the flowid.  If we are using numa,
+* then restrict the hash to the numa domain where the inp lives.
+*/
+#ifdef NUMA
+   if (tcp_bind_threads == 2 && inp->inp_numa_domain != M_NODOM) {
+   di = &hpts_domains[inp->inp_numa_domain];
+   cpuid = di->cpu[inp->inp_flowid % di->count];
+   } else
+#endif
cpuid = inp->inp_flowid % mp_ncpus;
-   return (cpuid);
-   }
-   cpuid = hpts_random_cpu(inp);
+
return (cpuid);
 #endif
 }
@@ -1781,8 +1801,11 @@ tcp_init_hptsi(void *st)
struct timeval tv;
sbintime_t sb;
struct tcp_hpts_entry *hpts;
+   struct pcpu *pc;
+   cpuset_t cs;
char unit[16];
uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+   int count, domain;
 
tcp_pace.rp_proc = NULL;
tcp_pace.rp_num_hptss = ncpus;
@@ -1861,6 +1884,11 @@ tcp_init_hptsi(void *st)
}
callout_init(&hpts->co, 1);
}
+
+   /* Don't try to bind to NUMA domains if we don't have any */
+   if (vm_ndomains == 1 && tcp_bind_threads == 2)
+   tcp_bind_threads = 0;
+
/*
 * Now lets start ithreads to handle the hptss.
 */
@@ -1875,9 +1903,20 @@ tcp_init_hptsi(void *st)
hpts, i, error);
}
created++;
-   if (tcp_bind_threads) {
+   if (tcp_bind_threads == 1) {
if (intr_event_bind(hpts->ie, i) == 0)

svn commit: r347410 - in head: . sys/amd64/conf sys/arm/conf sys/arm64/conf sys/i386/conf sys/powerpc/conf sys/riscv/conf sys/sparc64/conf

2019-05-09 Thread Andrew Gallatin
Author: gallatin
Date: Thu May  9 22:38:15 2019
New Revision: 347410
URL: https://svnweb.freebsd.org/changeset/base/347410

Log:
  Remove IPSEC from GENERIC due to performance issues
  
  Having IPSEC compiled into the kernel imposes a non-trivial
  performance penalty on multi-threaded workloads due to IPSEC
  refcounting. In my benchmarks of multi-threaded UDP
  transmit (connected sockets), I've seen a roughly 20% performance
  penalty when the IPSEC option is included in the kernel (16.8Mpps
  vs 13.8Mpps with 32 senders on a 14 core / 28 HTT Xeon
  2697v3)). This is largely due to key_addref() incrementing and
  decrementing an atomic reference count on the default
  policy. This causes all CPUs to stall on the same cacheline, as it
  bounces between different CPUs.
  
  Given that relatively few users use ipsec, and that it can be
  loaded as a module, it seems reasonable to ask those users to
  load the ipsec module so as to avoid imposing this penalty on the
  GENERIC kernel. It's my hope that this will make FreeBSD look
  better in "out of the box" benchmark comparisons with other
  operating systems.
  
  Many thanks to ae for fixing auto-loading of ipsec.ko when
  ifconfig tries to configure ipsec, and to cy for volunteering
  to ensure that the racoon ports will load the ipsec.ko module
  
  Reviewed by:  cem, cy, delphij, gnn, jhb, jpaetzel
  Differential Revision:https://reviews.freebsd.org/D20163

Modified:
  head/UPDATING
  head/sys/amd64/conf/GENERIC
  head/sys/arm/conf/std.armv6
  head/sys/arm/conf/std.armv7
  head/sys/arm64/conf/GENERIC
  head/sys/i386/conf/GENERIC
  head/sys/powerpc/conf/GENERIC
  head/sys/powerpc/conf/GENERIC64
  head/sys/riscv/conf/GENERIC
  head/sys/sparc64/conf/GENERIC

Modified: head/UPDATING
==
--- head/UPDATING   Thu May  9 22:31:47 2019(r347409)
+++ head/UPDATING   Thu May  9 22:38:15 2019(r347410)
@@ -32,6 +32,10 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 13.x IS SLOW:
"ln -s 'abort:false,junk:false' /etc/malloc.conf".)
 
 20190507:
+   The IPSEC option has been removed from GENERIC.  Users requiring
+   ipsec(4) must now load the ipsec(4) kernel module.
+
+20190507:
The tap(4) driver has been folded into tun(4), and the module has been
renamed to tuntap.  You should update any kld_load="if_tap" or
kld_load="if_tun" entries in /etc/rc.conf, if_tap_load="YES" or

Modified: head/sys/amd64/conf/GENERIC
==
--- head/sys/amd64/conf/GENERIC Thu May  9 22:31:47 2019(r347409)
+++ head/sys/amd64/conf/GENERIC Thu May  9 22:38:15 2019(r347410)
@@ -30,7 +30,6 @@ options   PREEMPTION  # Enable kernel thread 
preemption
 optionsVIMAGE  # Subsystem virtualization, e.g. VNET
 optionsINET# InterNETworking
 optionsINET6   # IPv6 communications protocols
-optionsIPSEC   # IP (v4/v6) security
 optionsIPSEC_SUPPORT   # Allow kldload of ipsec and tcpmd5
 optionsTCP_OFFLOAD # TCP offload
 optionsTCP_BLACKBOX# Enhanced TCP event logging

Modified: head/sys/arm/conf/std.armv6
==
--- head/sys/arm/conf/std.armv6 Thu May  9 22:31:47 2019(r347409)
+++ head/sys/arm/conf/std.armv6 Thu May  9 22:38:15 2019(r347410)
@@ -11,7 +11,7 @@ options   INET# InterNETworking
 optionsINET6   # IPv6 communications protocols
 optionsTCP_HHOOK   # hhook(9) framework for TCP
 device crypto  # core crypto support
-optionsIPSEC   # IP (v4/v6) security
+optionsIPSEC_SUPPORT   # Allow kldload of ipsec and tcpmd5
 optionsSCTP# Stream Control Transmission Protocol
 optionsFFS # Berkeley Fast Filesystem
 optionsSOFTUPDATES # Enable FFS soft updates support

Modified: head/sys/arm/conf/std.armv7
==
--- head/sys/arm/conf/std.armv7 Thu May  9 22:31:47 2019(r347409)
+++ head/sys/arm/conf/std.armv7 Thu May  9 22:38:15 2019(r347410)
@@ -11,7 +11,7 @@ options   INET# InterNETworking
 optionsINET6   # IPv6 communications protocols
 optionsTCP_HHOOK   # hhook(9) framework for TCP
 device crypto  # core crypto support
-optionsIPSEC   # IP (v4/v6) security
+optionsIPSEC_SUPPORT   # Allow kldload of ipsec and tcpmd5
 optionsSCTP# Stream Control Transmission Protocol
 optionsFFS  

svn commit: r347055 - in head: sbin/ifconfig sys/net

2019-05-03 Thread Andrew Gallatin
Author: gallatin
Date: Fri May  3 14:43:21 2019
New Revision: 347055
URL: https://svnweb.freebsd.org/changeset/base/347055

Log:
  Select lacp egress ports based on NUMA domain
  
  This change creates an array of port maps indexed by numa domain
  for lacp port selection. If we have lacp interfaces in more than
  one domain, then we select the egress port by indexing into the
  numa port maps and picking a port on the appropriate numa domain.
  
  This behavior is controlled by the new ifconfig use_numa flag
  and net.link.lagg.use_numa sysctl/tunable (both modeled after the
  existing use_flowid), which default to enabled.
  
  Reviewed by:  bz, hselasky, markj (and scottl, earlier version)
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D20060

Modified:
  head/sbin/ifconfig/ifconfig.8
  head/sbin/ifconfig/iflagg.c
  head/sys/net/ieee8023ad_lacp.c
  head/sys/net/ieee8023ad_lacp.h
  head/sys/net/if_lagg.c
  head/sys/net/if_lagg.h

Modified: head/sbin/ifconfig/ifconfig.8
==
--- head/sbin/ifconfig/ifconfig.8   Fri May  3 13:06:46 2019
(r347054)
+++ head/sbin/ifconfig/ifconfig.8   Fri May  3 14:43:21 2019
(r347055)
@@ -28,7 +28,7 @@
 .\" From: @(#)ifconfig.8   8.3 (Berkeley) 1/5/94
 .\" $FreeBSD$
 .\"
-.Dd June 27, 2018
+.Dd May 3, 2019
 .Dt IFCONFIG 8
 .Os
 .Sh NAME
@@ -2497,6 +2497,22 @@ Use the RSS hash from the network card if available.
 Set a shift parameter for RSS local hash computation.
 Hash is calculated by using flowid bits in a packet header mbuf
 which are shifted by the number of this parameter.
+.It Cm use_numa
+Enable selection of egress ports based on the native
+.Xr NUMA 4
+domain for the packets being transmitted.
+This is currently only implemented for lacp mode.
+This works only on
+.Xr NUMA 4
+hardware, running a kernel compiled with the
+.Xr NUMA 4
+option, and when interfaces from multiple
+.Xr NUMA 4
+domains are ports of the aggregation interface.
+.It Cm -use_numa
+Disable selection of egress ports based on the native
+.Xr NUMA 4
+domain for the packets being transmitted.
 .It Cm lacp_fast_timeout
 Enable lacp fast-timeout on the interface.
 .It Cm -lacp_fast_timeout

Modified: head/sbin/ifconfig/iflagg.c
==
--- head/sbin/ifconfig/iflagg.c Fri May  3 13:06:46 2019(r347054)
+++ head/sbin/ifconfig/iflagg.c Fri May  3 14:43:21 2019(r347055)
@@ -130,6 +130,8 @@ setlaggsetopt(const char *val, int d, int s, const str
switch (ro.ro_opts) {
case LAGG_OPT_USE_FLOWID:
case -LAGG_OPT_USE_FLOWID:
+   case LAGG_OPT_USE_NUMA:
+   case -LAGG_OPT_USE_NUMA:
case LAGG_OPT_LACP_STRICT:
case -LAGG_OPT_LACP_STRICT:
case LAGG_OPT_LACP_TXTEST:
@@ -303,6 +305,8 @@ static struct cmd lagg_cmds[] = {
DEF_CMD_ARG("lagghash", setlagghash),
DEF_CMD("use_flowid",   LAGG_OPT_USE_FLOWID,setlaggsetopt),
DEF_CMD("-use_flowid",  -LAGG_OPT_USE_FLOWID,   setlaggsetopt),
+   DEF_CMD("use_numa", LAGG_OPT_USE_NUMA,  setlaggsetopt),
+   DEF_CMD("-use_numa",-LAGG_OPT_USE_NUMA, setlaggsetopt),
DEF_CMD("lacp_strict",  LAGG_OPT_LACP_STRICT,   setlaggsetopt),
DEF_CMD("-lacp_strict", -LAGG_OPT_LACP_STRICT,  setlaggsetopt),
DEF_CMD("lacp_txtest",  LAGG_OPT_LACP_TXTEST,   setlaggsetopt),

Modified: head/sys/net/ieee8023ad_lacp.c
==
--- head/sys/net/ieee8023ad_lacp.c  Fri May  3 13:06:46 2019
(r347054)
+++ head/sys/net/ieee8023ad_lacp.c  Fri May  3 14:43:21 2019
(r347055)
@@ -835,7 +835,9 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf
struct lacp_softc *lsc = LACP_SOFTC(sc);
struct lacp_portmap *pm;
struct lacp_port *lp;
+   struct lacp_port **map;
uint32_t hash;
+   int count;
 
if (__predict_false(lsc->lsc_suppress_distributing)) {
LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__));
@@ -848,14 +850,32 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf
return (NULL);
}
 
+#ifdef NUMA
+   if ((sc->sc_opts & LAGG_OPT_USE_NUMA) &&
+   pm->pm_num_dom > 1 && m->m_pkthdr.numa_domain < MAXMEMDOM) {
+   count = pm->pm_numa[m->m_pkthdr.numa_domain].count;
+   if (count > 0) {
+   map = pm->pm_numa[m->m_pkthdr.numa_domain].map;
+   } else {
+   /* No ports on this domain; use global hash. */
+   map = pm->pm_map;
+   count = pm->pm_count;
+   }
+   } else
+#endif
+   {
+   map = pm->pm_map;
+   count = pm->pm_count;
+   }
if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&

Re: svn commit: r346598 - head/sys/modules

2019-04-29 Thread Andrew Gallatin

On 2019-04-29 10:54, Emmanuel Vadot wrote:

On Mon, 29 Apr 2019 10:49:01 -0400
Andrew Gallatin  wrote:


On 2019-04-29 10:21, Rodney W. Grimes wrote:

On Tue, 23 Apr 2019 at 13:26, Rodney W. Grimes
 wrote:


Very cool, now how do I get a PCIe slot into a RPI3!!! lol  :-)


I know you're joking but the comment does highlight an issue in the
AArch64 world - there's a lack of good mid-range developer platforms.


I may have been joking with respect to the RPI3, but at the same
time I do know that the RockPro64 exists and does have that
PCIe slot I want, I also know that Michael Dexter has one he would
loan me should I wish to investigate our state of support.


Does anybody know what PCIe Generation / speed that slot runs at?
All I can find them saying is "PCIe x4", which implies Gen 1, 2.5GT/s
speeds, which is not terribly useful.  Gen2 or better would be enough
to run 10GbE, which would be fun :)

Drew


  It's PCIe 2.1 compatible. See
http://rockchip.fr/Rockchip%20RK3399%20TRM%20V1.3%20Part2.pdf



Everything I'm seeing there says Gen1 vs Gen2 depends on 
"PCIE_GENERATION_SEL", and that if it's set to 0, you get

Gen1 2.5Gt/s and if it is set to 1, you get Gen2, 5.0Gt/s.
But I don't see anything specifying this value for the
RockPro64 board.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r346598 - head/sys/modules

2019-04-29 Thread Andrew Gallatin

On 2019-04-29 10:21, Rodney W. Grimes wrote:

On Tue, 23 Apr 2019 at 13:26, Rodney W. Grimes
 wrote:


Very cool, now how do I get a PCIe slot into a RPI3!!! lol  :-)


I know you're joking but the comment does highlight an issue in the
AArch64 world - there's a lack of good mid-range developer platforms.


I may have been joking with respect to the RPI3, but at the same
time I do know that the RockPro64 exists and does have that
PCIe slot I want, I also know that Michael Dexter has one he would
loan me should I wish to investigate our state of support.


Does anybody know what PCIe Generation / speed that slot runs at?
All I can find them saying is "PCIe x4", which implies Gen 1, 2.5GT/s
speeds, which is not terribly useful.  Gen2 or better would be enough
to run 10GbE, which would be fun :)

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r346677 - in head/sys: dev/cxgbe dev/mlx5/mlx5_en kern netinet netinet6

2019-04-25 Thread Andrew Gallatin
Author: gallatin
Date: Thu Apr 25 15:37:28 2019
New Revision: 346677
URL: https://svnweb.freebsd.org/changeset/base/346677

Log:
  Track TCP connection's NUMA domain in the inpcb
  
  Drivers can now pass up numa domain information via the
  mbuf numa domain field.  This information is then used
  by TCP syncache_socket() to associate that information
  with the inpcb. The domain information is then fed back
  into transmitted mbufs in ip{6}_output(). This mechanism
  is nearly identical to what is done to track RSS hash values
  in the inp_flowid.
  
  Follow on changes will use this information for lacp egress
  port selection, binding TCP pacers to the appropriate NUMA
  domain, etc.
  
  Reviewed by:  markj, kib, slavash, bz, scottl, jtl, tuexen
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D20028

Modified:
  head/sys/dev/cxgbe/t4_sge.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
  head/sys/kern/uipc_mbuf.c
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/ip_output.c
  head/sys/netinet/tcp_syncache.c
  head/sys/netinet6/ip6_output.c

Modified: head/sys/dev/cxgbe/t4_sge.c
==
--- head/sys/dev/cxgbe/t4_sge.c Thu Apr 25 15:31:35 2019(r346676)
+++ head/sys/dev/cxgbe/t4_sge.c Thu Apr 25 15:37:28 2019(r346677)
@@ -2046,6 +2046,9 @@ t4_eth_rx(struct sge_iq *iq, const struct rss_header *
rxq->vlan_extraction++;
}
 
+#ifdef NUMA
+   m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
+#endif
 #if defined(INET) || defined(INET6)
if (iq->flags & IQ_LRO_ENABLED) {
if (sort_before_lro(lro)) {

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
==
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c  Thu Apr 25 15:31:35 2019
(r346676)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c  Thu Apr 25 15:37:28 2019
(r346677)
@@ -520,6 +520,9 @@ rx_common:
mlx5e_build_rx_mbuf(cqe, rq, mb, byte_cnt);
rq->stats.bytes += byte_cnt;
rq->stats.packets++;
+#ifdef NUMA
+   mb->m_pkthdr.numa_domain = rq->ifp->if_numa_domain;
+#endif
 
 #if !defined(HAVE_TCP_LRO_RX)
tcp_lro_queue_mbuf(&rq->lro, mb);

Modified: head/sys/kern/uipc_mbuf.c
==
--- head/sys/kern/uipc_mbuf.c   Thu Apr 25 15:31:35 2019(r346676)
+++ head/sys/kern/uipc_mbuf.c   Thu Apr 25 15:37:28 2019(r346677)
@@ -341,6 +341,9 @@ m_pkthdr_init(struct mbuf *m, int how)
 #endif
m->m_data = m->m_pktdat;
bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
+#ifdef NUMA
+   m->m_pkthdr.numa_domain = M_NODOM;
+#endif
 #ifdef MAC
/* If the label init fails, fail the alloc */
error = mac_mbuf_init(m, how);

Modified: head/sys/netinet/in_pcb.c
==
--- head/sys/netinet/in_pcb.c   Thu Apr 25 15:31:35 2019(r346676)
+++ head/sys/netinet/in_pcb.c   Thu Apr 25 15:37:28 2019(r346677)
@@ -510,6 +510,9 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbin
if (inp == NULL)
return (ENOBUFS);
bzero(&inp->inp_start_zero, inp_zero_size);
+#ifdef NUMA
+   inp->inp_numa_domain = M_NODOM;
+#endif
inp->inp_pcbinfo = pcbinfo;
inp->inp_socket = so;
inp->inp_cred = crhold(so->so_cred);

Modified: head/sys/netinet/in_pcb.h
==
--- head/sys/netinet/in_pcb.h   Thu Apr 25 15:31:35 2019(r346676)
+++ head/sys/netinet/in_pcb.h   Thu Apr 25 15:37:28 2019(r346677)
@@ -272,7 +272,7 @@ struct inpcb {
 inp_hpts_calls :1, /* (i) from output hpts */
 inp_input_calls :1,/* (i) from input hpts */
 inp_spare_bits2 : 4;
-   uint8_t inp_spare_byte; /* Compiler hole */
+   uint8_t inp_numa_domain;/* numa domain */
void*inp_ppcb;  /* (i) pointer to per-protocol pcb */
struct  socket *inp_socket; /* (i) back pointer to socket */
uint32_t inp_hptsslot;  /* Hpts wheel slot this tcb is 
Lock(i) */

Modified: head/sys/netinet/ip_output.c
==
--- head/sys/netinet/ip_output.cThu Apr 25 15:31:35 2019
(r346676)
+++ head/sys/netinet/ip_output.cThu Apr 25 15:37:28 2019
(r346677)
@@ -247,6 +247,9 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct rou
m->m_pkthdr.flowid = inp->inp_flowid;
M_HASHTYPE_SET(m, inp->inp_flowtype);
}
+#ifdef NUMA
+   m->m_pkthdr.numa_domain = inp->inp_numa_domain;
+#endif
   

svn commit: r346632 - head/sys/net

2019-04-24 Thread Andrew Gallatin
Author: gallatin
Date: Wed Apr 24 13:32:04 2019
New Revision: 346632
URL: https://svnweb.freebsd.org/changeset/base/346632

Log:
  iflib: Add pfil hooks
  
  As with mlx5en, the idea is to drop unwanted traffic as early
  in receive as possible, before mbufs are allocated and anything
  is passed up the stack.  This can save considerable CPU time
  when a machine is under a flooding style DOS attack.
  
  The major change here is to remove the unneeded abstraction where
  callers of rxd_frag_to_sd() get back a pointer to the mbuf ring, and
  are responsible for NULL'ing that mbuf themselves. Now this happens
  directly in rxd_frag_to_sd(), and it returns an mbuf. This allows us
  to use the decision (and potentially mbuf) returned by the pfil
  hooks. The driver can now recycle mbufs to avoid re-allocation when
  packets are dropped.
  
  Reviewed by:  marius  (shurd and erj also provided feedback)
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D19645

Modified:
  head/sys/net/iflib.c

Modified: head/sys/net/iflib.c
==
--- head/sys/net/iflib.cWed Apr 24 13:15:56 2019(r346631)
+++ head/sys/net/iflib.cWed Apr 24 13:32:04 2019(r346632)
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
+#include <net/pfil.h>
 #include 
 
 #include 
@@ -432,6 +433,7 @@ struct iflib_rxq {
if_ctx_tifr_ctx;
iflib_fl_t  ifr_fl;
uint64_tifr_rx_irq;
+   struct pfil_head*pfil;
uint16_tifr_id;
uint8_t ifr_lro_enabled;
uint8_t ifr_nfl;
@@ -451,7 +453,6 @@ struct iflib_rxq {
 
 typedef struct if_rxsd {
caddr_t *ifsd_cl;
-   struct mbuf **ifsd_m;
iflib_fl_t ifsd_fl;
qidx_t ifsd_cidx;
 } *if_rxsd_t;
@@ -652,7 +653,6 @@ static int iflib_fast_intrs;
 static int iflib_rx_unavail;
 static int iflib_rx_ctx_inactive;
 static int iflib_rx_if_input;
-static int iflib_rx_mbuf_null;
 static int iflib_rxd_flush;
 
 static int iflib_verbose_debug;
@@ -669,8 +669,6 @@ SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLF
   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
   &iflib_rx_if_input, 0, "# times rxeof called if_input");
-SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD,
-  &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf");
 SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
 &iflib_rxd_flush, 0, "# times rxd_flush called");
 SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
@@ -689,7 +687,7 @@ iflib_debug_reset(void)
iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
iflib_rx_unavail =
iflib_rx_ctx_inactive = iflib_rx_if_input =
-   iflib_rx_mbuf_null = iflib_rxd_flush = 0;
+   iflib_rxd_flush = 0;
 }
 
 #else
@@ -2002,11 +2000,12 @@ _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int coun
bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
BUS_DMASYNC_PREREAD);
 
-   MPASS(sd_m[frag_idx] == NULL);
-   if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
-   break;
+   if (sd_m[frag_idx] == NULL) {
+   if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
+   break;
+   }
+   sd_m[frag_idx] = m;
}
-   sd_m[frag_idx] = m;
bit_set(fl->ifl_rx_bitmap, frag_idx);
 #if MEMORY_LOGGING
fl->ifl_m_enqueued++;
@@ -2483,13 +2482,15 @@ prefetch_pkts(iflib_fl_t fl, int cidx)
prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
 }
 
-static void
-rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd)
+static struct mbuf *
+rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd,
+int *pf_rv, if_rxd_info_t ri)
 {
-   int flid, cidx;
bus_dmamap_t map;
iflib_fl_t fl;
-   int next;
+   caddr_t payload;
+   struct mbuf *m;
+   int flid, cidx, len, next;
 
map = NULL;
flid = irf->irf_flid;
@@ -2497,7 +2498,7 @@ rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int
fl = &rxq->ifr_fl[flid];
sd->ifsd_fl = fl;
sd->ifsd_cidx = cidx;
-   sd->ifsd_m = &fl->ifl_sds.ifsd_m[cidx];
+   m = fl->ifl_sds.ifsd_m[cidx];
sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
fl->ifl_credits--;
 #if MEMORY_LOGGING
@@ -2513,39 +2514,89 @@ rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int
/* not valid assert if bxe really does SGE from non-contiguous elements 
*/
MPASS(fl->ifl_cidx == cidx);
bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
+
+   if (rxq->pfil != 

svn commit: r346579 - in head: share/man/man9 sys/dev/cxgbe sys/dev/mlx5/mlx5_en sys/net

2019-04-22 Thread Andrew Gallatin
Author: gallatin
Date: Mon Apr 22 19:24:21 2019
New Revision: 346579
URL: https://svnweb.freebsd.org/changeset/base/346579

Log:
  Track device's NUMA domain in ifnet & alloc ifnet from NUMA local memory
  
  This commit adds new if_alloc_domain() and if_alloc_dev() methods to
  allocate ifnets.  When called with a domain on a NUMA machine,
  if_alloc_domain() will record the NUMA domain in the ifnet, and it will
  allocate the ifnet struct from memory which is local to that NUMA
  node.  Similarly, if_alloc_dev() is a wrapper for if_alloc_domain()
  which uses a driver supplied device_t to call if_alloc_domain() with
  the appropriate domain.
  
  Note that the new if_numa_domain field fits in an alignment pad in
  struct ifnet, and so does not alter the size of the structure.
  
  Reviewed by:  glebius, kib, markj
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D19930

Modified:
  head/share/man/man9/Makefile
  head/share/man/man9/ifnet.9
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  head/sys/net/if.c
  head/sys/net/if_var.h

Modified: head/share/man/man9/Makefile
==
--- head/share/man/man9/MakefileMon Apr 22 19:21:35 2019
(r346578)
+++ head/share/man/man9/MakefileMon Apr 22 19:24:21 2019
(r346579)
@@ -1175,6 +1175,8 @@ MLINKS+=iflibtxrx.9 isc_rxd_available.9 \
iflibtxrx.9 isc_txd_flush.9
 MLINKS+=ifnet.9 if_addmulti.9 \
ifnet.9 if_alloc.9 \
+   ifnet.9 if_alloc_dev.9 \
+   ifnet.9 if_alloc_domain.9 \
ifnet.9 if_allmulti.9 \
ifnet.9 if_attach.9 \
ifnet.9 if_data.9 \

Modified: head/share/man/man9/ifnet.9
==
--- head/share/man/man9/ifnet.9 Mon Apr 22 19:21:35 2019(r346578)
+++ head/share/man/man9/ifnet.9 Mon Apr 22 19:24:21 2019(r346579)
@@ -48,6 +48,10 @@
 .Ss "Interface Manipulation Functions"
 .Ft "struct ifnet *"
 .Fn if_alloc "u_char type"
+.Ft "struct ifnet *"
+.Fn if_alloc_dev "u_char type" "device_t dev"
+.Ft "struct ifnet *"
+.Fn if_alloc_domain "u_char type" "int numa_domain"
 .Ft void
 .Fn if_attach "struct ifnet *ifp"
 .Ft void
@@ -440,6 +444,15 @@ It is used to cache the type passed to
 but unlike
 .Va if_type ,
 it would not be changed by drivers.
+.It Va if_numa_domain
+.Pq Vt uint8_t
+The NUMA domain of the hardware device associated with the interface.
+This is filled in with a wildcard value unless the kernel is NUMA
+aware, the system is a NUMA system, and the ifnet is allocated
+using
+.Fn if_alloc_dev
+or
+.Fn if_alloc_domain .
 .El
 .Pp
 References to
@@ -1151,6 +1164,24 @@ include the allocation of a
 .Fa type
 specific structure in
 .Va if_l2com .
+.It Fn if_alloc_dev
+Allocate and initialize
+.Vt "struct ifnet"
+as
+.Fn if_alloc
+does, with the addition that the ifnet can be tagged with the
+appropriate NUMA domain derived from the
+.Fa dev
+argument passed by the caller.
+.It Fn if_alloc_domain
+Allocate and initialize
+.Vt "struct ifnet"
+as
+.Fn if_alloc
+does, with the addition that the ifnet will be tagged with the NUMA
+domain via the
+.Fa numa_domain
+argument passed by the caller.
 .It Fn if_attach
 Link the specified interface
 .Fa ifp
@@ -1168,7 +1199,10 @@ function.)
 The
 .Fa ifp
 must have been allocated by
-.Fn if_alloc .
+.Fn if_alloc ,
+.Fn if_alloc_dev
+or
+.Fn if_alloc_domain .
 .It Fn if_detach
 Shut down and unlink the specified
 .Fa ifp

Modified: head/sys/dev/cxgbe/t4_main.c
==
--- head/sys/dev/cxgbe/t4_main.cMon Apr 22 19:21:35 2019
(r346578)
+++ head/sys/dev/cxgbe/t4_main.cMon Apr 22 19:24:21 2019
(r346579)
@@ -1636,7 +1636,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
callout_init(&vi->tick, 1);
 
/* Allocate an ifnet and set it up */
-   ifp = if_alloc(IFT_ETHER);
+   ifp = if_alloc_dev(IFT_ETHER, dev);
if (ifp == NULL) {
device_printf(dev, "Cannot allocate ifnet\n");
return (ENOMEM);

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 22 19:21:35 2019
(r346578)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 22 19:24:21 2019
(r346579)
@@ -3682,7 +3682,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
M_MLX5EN, M_WAITOK | M_ZERO);
mlx5e_priv_mtx_init(priv);
 
-   ifp = priv->ifp = if_alloc(IFT_ETHER);
+   ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev);
if (ifp == NULL) {
mlx5_core_err(mdev, "if_alloc() failed\n");
goto err_free_priv;

Modified: head/sys/net/if.c

svn commit: r346281 - head/sys/sys

2019-04-16 Thread Andrew Gallatin
Author: gallatin
Date: Tue Apr 16 16:49:34 2019
New Revision: 346281
URL: https://svnweb.freebsd.org/changeset/base/346281

Log:
  Replace cosqos with numa_domain in mbuf pkthdr
  
  The cosqos field was added nearly 6 years ago in r254804, and it is
  still unused by any in-tree consumers.  I have a patchset that I'm
  working on which aligns many network resources by NUMA domain,
  including inps, inpcb lb group, tcp pacing, lagg output link
  selection, backing pages for sendfile, and more.  It reduces
  cross-domain traffic by roughly 50% for a real web workload.
  
  This patchset relies on being able to store the numa domain in the
  mbuf, and grabbing the unused cosqos field for this purpose is the
  first step in starting to upstream it.
  
  Reviewed by:  kib, markj
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D19862

Modified:
  head/sys/sys/mbuf.h

Modified: head/sys/sys/mbuf.h
==
--- head/sys/sys/mbuf.h Tue Apr 16 15:52:04 2019(r346280)
+++ head/sys/sys/mbuf.h Tue Apr 16 16:49:34 2019(r346281)
@@ -98,6 +98,7 @@ struct mbuf;
 #defineMLEN((int)(MSIZE - MHSIZE))
 #defineMHLEN   ((int)(MSIZE - MPKTHSIZE))
 #defineMINCLSIZE   (MHLEN + 1)
+#defineM_NODOM 255
 
 #ifdef _KERNEL
 /*-
@@ -158,7 +159,7 @@ struct pkthdr {
uint32_t flowid;/* packet's 4-tuple system */
uint32_t csum_flags;/* checksum and offload features */
uint16_t fibnum;/* this packet should use this fib */
-   uint8_t  cosqos;/* class/quality of service */
+   uint8_t  numa_domain;   /* NUMA domain of recvd pkt */
uint8_t  rsstype;   /* hash type */
union {
uint64_trcv_tstmp;  /* timestamp in ns */
@@ -405,33 +406,6 @@ struct mbuf {
 #defineM_HASHTYPE_SET(m, v)((m)->m_pkthdr.rsstype = (v))
 #defineM_HASHTYPE_TEST(m, v)   (M_HASHTYPE_GET(m) == (v))
 #defineM_HASHTYPE_ISHASH(m)(M_HASHTYPE_GET(m) & 
M_HASHTYPE_HASHPROP)
-
-/*
- * COS/QOS class and quality of service tags.
- * It uses DSCP code points as base.
- */
-#defineQOS_DSCP_CS00x00
-#defineQOS_DSCP_DEFQOS_DSCP_CS0
-#defineQOS_DSCP_CS10x20
-#defineQOS_DSCP_AF11   0x28
-#defineQOS_DSCP_AF12   0x30
-#defineQOS_DSCP_AF13   0x38
-#defineQOS_DSCP_CS20x40
-#defineQOS_DSCP_AF21   0x48
-#defineQOS_DSCP_AF22   0x50
-#defineQOS_DSCP_AF23   0x58
-#defineQOS_DSCP_CS30x60
-#defineQOS_DSCP_AF31   0x68
-#defineQOS_DSCP_AF32   0x70
-#defineQOS_DSCP_AF33   0x78
-#defineQOS_DSCP_CS40x80
-#defineQOS_DSCP_AF41   0x88
-#defineQOS_DSCP_AF42   0x90
-#defineQOS_DSCP_AF43   0x98
-#defineQOS_DSCP_CS50xa0
-#defineQOS_DSCP_EF 0xb8
-#defineQOS_DSCP_CS60xc0
-#defineQOS_DSCP_CS70xe0
 
 /*
  * External mbuf storage buffer types.
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r346247 - head/sys/dev/mlx5/mlx5_en

2019-04-15 Thread Andrew Gallatin
Author: gallatin
Date: Mon Apr 15 17:14:50 2019
New Revision: 346247
URL: https://svnweb.freebsd.org/changeset/base/346247

Log:
  mlx5en: Enable new pfil(9) KPI ethernet filtering hooks
  
  This allows efficient filtering at packet ingress on mlx5en.
  
  Note that the packets are filtered (and potentially dropped) *before*
  the driver has committed to (re)allocating an mbuf for the
  packet. Dropped packets are treated essentially the same as an
  error. Nothing is allocated, and the existing buffer is recycled. This
  allows us to drop malicious packets at close to line rate with very
  little CPU use.
  
  Reviewed by:  hselasky, slavash, kib
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D19063

Modified:
  head/sys/dev/mlx5/mlx5_en/en.h
  head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c

Modified: head/sys/dev/mlx5/mlx5_en/en.h
==
--- head/sys/dev/mlx5/mlx5_en/en.h  Mon Apr 15 16:57:27 2019
(r346246)
+++ head/sys/dev/mlx5/mlx5_en/en.h  Mon Apr 15 17:14:50 2019
(r346247)
@@ -48,6 +48,7 @@
 #include 
 #include 
 #include 
+#include <net/pfil.h>
 #include 
 #include 
 
@@ -838,6 +839,7 @@ struct mlx5e_priv {
struct mlx5e_clbr_point clbr_points[2];
u_int   clbr_gen;
 
+   struct pfil_head *pfil;
struct mlx5e_channel channel[];
 };
 

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 15 16:57:27 2019
(r346246)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.cMon Apr 15 17:14:50 2019
(r346247)
@@ -3664,6 +3664,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
struct sysctl_oid_list *child;
int ncv = mdev->priv.eq_table.num_comp_vectors;
char unit[16];
+   struct pfil_head_args pa;
int err;
int i;
u32 eth_proto_cap;
@@ -3898,6 +3899,12 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT);
mlx5e_reset_calibration_callout(priv);
 
+   pa.pa_version = PFIL_VERSION;
+   pa.pa_flags = PFIL_IN;
+   pa.pa_type = PFIL_TYPE_ETHERNET;
+   pa.pa_headname = ifp->if_xname;
+   priv->pfil = pfil_head_register(&pa);
+
return (priv);
 
 #ifdef RATELIMIT
@@ -3972,6 +3979,12 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vp
if_printf(priv->ifp, "Waiting for all unlimited connections "
"to terminate\n");
pause("W", hz);
+   }
+
+   /* deregister pfil */
+   if (priv->pfil != NULL) {
+   pfil_head_unregister(priv->pfil);
+   priv->pfil = NULL;
}
 
/* unregister device */

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
==
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c  Mon Apr 15 16:57:27 2019
(r346246)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c  Mon Apr 15 17:14:50 2019
(r346247)
@@ -430,15 +430,18 @@ mlx5e_decompress_cqes(struct mlx5e_cq *cq)
 static int
 mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget)
 {
-   int i;
+   struct pfil_head *pfil;
+   int i, rv;
 
+   CURVNET_SET_QUIET(rq->ifp->if_vnet);
+   pfil = rq->channel->priv->pfil;
for (i = 0; i < budget; i++) {
struct mlx5e_rx_wqe *wqe;
struct mlx5_cqe64 *cqe;
struct mbuf *mb;
__be16 wqe_counter_be;
u16 wqe_counter;
-   u32 byte_cnt;
+   u32 byte_cnt, seglen;
 
cqe = mlx5e_get_cqe(&rq->cq);
if (!cqe)
@@ -462,6 +465,39 @@ mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget)
rq->stats.wqe_err++;
goto wq_ll_pop;
}
+   if (pfil != NULL && PFIL_HOOKED_IN(pfil)) {
+   seglen = MIN(byte_cnt, MLX5E_MAX_RX_BYTES);
+   rv = pfil_run_hooks(rq->channel->priv->pfil,
+   rq->mbuf[wqe_counter].data, rq->ifp,
+   seglen | PFIL_MEMPTR | PFIL_IN, NULL);
+
+   switch (rv) {
+   case PFIL_DROPPED:
+   case PFIL_CONSUMED:
+   /*
+* Filter dropped or consumed it. In
+* either case, we can just recycle
+* buffer; there is no more work to do.
+*/
+   rq->stats.packets++;
+   goto wq_ll_pop;
+   case PFIL_REALLOCED:
+   /*
+* Filter copied it; recycle buffer
+

svn commit: r345273 - head/sys/kern

2019-03-18 Thread Andrew Gallatin
Author: gallatin
Date: Mon Mar 18 12:41:42 2019
New Revision: 345273
URL: https://svnweb.freebsd.org/changeset/base/345273

Log:
  Fix a typo introduced in r344133
  
  The line was misedited to change tt to st instead of
  changing ut to st.
  
  The use of st as the denominator in mul64_by_fraction() will lead
  to an integer divide fault in the intr proc (the process holding
  ithreads) where st will be 0.  This divide by 0 happens after
  the total runtime for all ithreads exceeds 76 hours.
  
  Submitted by: bde

Modified:
  head/sys/kern/kern_resource.c

Modified: head/sys/kern/kern_resource.c
==
--- head/sys/kern/kern_resource.c   Mon Mar 18 12:34:13 2019
(r345272)
+++ head/sys/kern/kern_resource.c   Mon Mar 18 12:41:42 2019
(r345273)
@@ -978,7 +978,7 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struc
su = (tu * st) / tt;
} else {
uu = mul64_by_fraction(tu, ut, tt);
-   su = mul64_by_fraction(tu, ut, st);
+   su = mul64_by_fraction(tu, st, tt);
}
 
if (tu >= ruxp->rux_tu) {
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r345138 - head/share/man/man9

2019-03-15 Thread Andrew Gallatin

On 3/14/19 11:36 PM, Rodney W. Grimes wrote:

[ Charset UTF-8 unsupported, converting... ]

On Thu, 14 Mar 2019 at 22:39, Rodney W. Grimes
 wrote:


4. There is no easy way to show
"changed byte at offset 0x432 from 0xef to 0xfe"


How do we represent Copyright and License in such objects?
This is an issue that is totally left out of even .uu version.



This is an excellent point.  What I used to do for mxge
firmware when I worked at Myricom was to have a shell
script that created a source file with the uuencoded bits
as a static array.  That way, it had copyright info in
the file itself.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r344817 - in head/sys: dev/e1000 net

2019-03-08 Thread Andrew Gallatin

On 3/5/19 4:06 PM, Matthew Macy wrote:

This represents a misunderstanding of how defines are used. This left
the option open to the user to enable the use of larger than page size
buffers as it does enable better performance. Over the course of a
long uptime memory can get too fragmented. However, this left it open
to the end consumer.

I'd like to see this reverted with perhaps a better name for the
define and the addition of an explanatory comment.



I'd strongly prefer that it stay removed.  Since it is not hooked to an 
option, no user is ever going to find it.  This really should have been 
a tuneable (since it is done at ring init time, rather than rx buffer 
alloc time), but nobody cared enough to make it actually usable.


From brief memories of performance tuning 10G adapters 14 years ago, 
the differences between page-sized and 9k jumbos were minimal even back 
then (1/3 as many mbuf alloc/free, smaller chains).  So I'm not 
convinced that it is worth bringing back in any form.


My general feeling is that the more of this code that we can remove, the 
better.  Iflib is tricky enough that it is already challenging to reason 
about and maintain.  Removing code which is for all intents and purposes 
unreachable and never tested is a Good Thing.


Drew




___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r344099 - head/sys/net

2019-02-22 Thread Andrew Gallatin
I think the misunderstanding here is that I think he's not getting the 
ifp from the route.


My recollection is that he is holding the ifps when he enables HW pacing 
in BBR.  Due to limitations in different NIC hardware, you can only have 
N different rates, etc.  So he goes ahead and allocates those N rates up 
front so that he knows he can reserve them & know that he can always get 
them.


Then when the system reboots, BBR has an eventhandler that goes ahead 
and frees those reservations.  I think that he's using the ifp that he's 
holding here.


In the case that tripped him up, that ifp was lagg.

Your workaround would also work, but Randall does have a point about 
symmetric alloc/free especially when viewed from his perspective,



Drew


___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r343430 - head/sys/net

2019-01-25 Thread Andrew Gallatin
Author: gallatin
Date: Fri Jan 25 15:02:18 2019
New Revision: 343430
URL: https://svnweb.freebsd.org/changeset/base/343430

Log:
  Fix an iflib driver unload panic introduced in r343085
  
  The new loop to sync and unload descriptors was indexed
  by "i", rather than "j".   The panic was caused by "i"
  being advanced rather than "j", and eventually becoming
  out of bounds.
  
  Reviewed by:  kib
  MFC after:3 days
  Sponsored by: Netflix

Modified:
  head/sys/net/iflib.c

Modified: head/sys/net/iflib.c
==
--- head/sys/net/iflib.cFri Jan 25 14:46:13 2019(r343429)
+++ head/sys/net/iflib.cFri Jan 25 15:02:18 2019(r343430)
@@ -2197,17 +2197,17 @@ iflib_rx_sds_free(iflib_rxq_t rxq)
fl = &rxq->ifr_fl[i];
if (fl->ifl_desc_tag != NULL) {
if (fl->ifl_sds.ifsd_map != NULL) {
-   for (j = 0; j < fl->ifl_size; i++) {
-   if (fl->ifl_sds.ifsd_map[i] ==
+   for (j = 0; j < fl->ifl_size; j++) {
+   if (fl->ifl_sds.ifsd_map[j] ==
NULL)
-   continue;
+   continue;
bus_dmamap_sync(
fl->ifl_desc_tag,
-   fl->ifl_sds.ifsd_map[i],
+   fl->ifl_sds.ifsd_map[j],
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(
fl->ifl_desc_tag,
-   fl->ifl_sds.ifsd_map[i]);
+   fl->ifl_sds.ifsd_map[j]);
}
}
bus_dma_tag_destroy(fl->ifl_desc_tag);
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r343269 - head/sys/dev/cxgbe

2019-01-21 Thread Andrew Gallatin

On 1/21/19 1:42 PM, Navdeep Parhar wrote:


Log:
   cxgbe(4): Allow negative values in hw.cxgbe.fw_install and take them to


Thank you!

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r343030 - in head/sys: cam conf dev/md dev/nvme fs/fuse fs/nfsclient fs/smbfs kern sys ufs/ffs vm

2019-01-15 Thread Andrew Gallatin

On 1/14/19 8:02 PM, Gleb Smirnoff wrote:


Log:
   Allocate pager bufs from UMA instead of 80-ish mutex protected linked list.


<...>


   Together with:   gallatin


Thank you so much for carrying this over the finish line!

Drew

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r342774 - head/sys/conf

2019-01-04 Thread Andrew Gallatin
Author: gallatin
Date: Fri Jan  4 18:38:27 2019
New Revision: 342774
URL: https://svnweb.freebsd.org/changeset/base/342774

Log:
  Limit git history searches in newvers.sh
  
  newvers.sh takes upwards of 4-5 seconds to complete on trees checked
  out from github, due to searching the entire history for non-existent
  git-svn metadata. Similarly, if one does not check out notes, we
  again search the entire history for notes. That makes newvers.sh very
  slow for many github users.
  
  To fix this in a fair way, limit the history search to the last 10K
  commits: if you're more than 10K commits out of sync, then you've
  forked the project, and our SVN rev is no longer very important to you.
  
  Due to how git implements --grep in conjunction with -n, --grep has been
  removed for performance reasons (git does not seem to limit its search
  to the -n limit in this case, and takes just as long as it did with no
  limit).
  
  Reviewed by:  emaste, imp
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D18745

Modified:
  head/sys/conf/newvers.sh

Modified: head/sys/conf/newvers.sh
==
--- head/sys/conf/newvers.shFri Jan  4 18:35:25 2019(r342773)
+++ head/sys/conf/newvers.shFri Jan  4 18:38:27 2019(r342774)
@@ -243,11 +243,15 @@ if [ -n "$git_cmd" ] ; then
svn=" r${gitsvn}"
git="=${git}"
else
-   gitsvn=`$git_cmd log --grep '^git-svn-id:' | \
+#  Log searches are limited to 10k commits to speed up failures.
+#  We assume that if a tree is more than 10k commits out-of-sync
+#  with FreeBSD, it has forked the OS and the SVN rev no
+#  longer matters.
+   gitsvn=`$git_cmd log -n 1 |
grep '^git-svn-id:' | head -1 | \
sed -n 's/^.*@\([0-9][0-9]*\).*$/\1/p'`
if [ -z "$gitsvn" ] ; then
-   gitsvn=`$git_cmd log --format='format:%N' | \
+   gitsvn=`$git_cmd log -n 1 --format='format:%N' | \
 grep '^svn ' | head -1 | \
 sed -n 's/^.*revision=\([0-9][0-9]*\).*$/\1/p'`
fi
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r341578 - head/sys/dev/mlx5/mlx5_en

2018-12-17 Thread Andrew Gallatin

On 12/17/18 2:08 PM, Bruce Evans wrote:

On Mon, 17 Dec 2018, Andrew Gallatin wrote:


On 12/5/18 9:20 AM, Slava Shwartsman wrote:

Author: slavash
Date: Wed Dec  5 14:20:57 2018
New Revision: 341578
URL: 
https://svnweb.freebsd.org/changeset/base/341578



Log:
   mlx5en: Remove the DRBR and associated logic in the transmit path.
  The hardware queues are deep enough currently and using the 
DRBR and associated
   callbacks only leads to more task switching in the TX path. There is
also a race

   setting the queue_state which can lead to hung TX rings.


The point of DRBR in the tx path is not simply to provide a software 
ring for queuing excess packets.  Rather it provides a mechanism to

avoid lock contention by shoving a packet into the software ring, where
it will later be found & processed, rather than blocking the caller on
a mtx lock.   I'm concerned you may have introduced a performance
regression for use cases where you have N:1  or N:M lock contention 
where many threads on different cores are contending for the same tx 
queue.  The state of the art for this is no longer DRBR, but mp_ring,

as used by both cxgbe and iflib.


iflib uses queuing techniques to significantly pessimize em NICs with 1
hardware queue.  On fast machines, it attempts to do 1 context switch per


This can happen even w/o contention when "abdicate" is enabled in mp
ring. I complained about this as well, and the default was changed in
mp ring to not always "abdicate" (eg, switch to the tq to handle the
packet). Abdication substantially pessimizes Netflix style web 
uncontended workloads, but it generally helps small packet forwarding.


It is interesting that you see the opposite.  I should try benchmarking
with just a single ring.




(small) tx packet and can't keep up.  On slow machines it has a chance of
handling multiple packets per context switch, but since the machine is too
slow it can't keep up and saturates at a slightly different point.  Results
for netblast $lanhost 5001 5 10 (5-byte payload for 10 seconds) on an I218V
on Haswell 4 cores x 2 threads @4.08GHz running i386:

Old results with no iflib and no EM_MULTIQUEUE except as indicated:

FBSD-10 UP    1377+0
FBSD-11 UP    1326+0
FBSD-11 SMP-1 1484+0
FBSD-11 SMP-8 1395+0
FBSD-12mod  SMP-1 1386+0
FBSD-12mod  SMP-8 1422+0
FBSD-12mod  SMP-1 1270+0   # use iflib (lose 8% performance)
FBSD-12mod  SMP-8 1279+0   # use iflib (lose 10% performance using more 
CPU)


1377+0 means 1377 kpps sent and 0 kpps errors, etc.  SMP-8 means use all 8
CPUs.  SMP-1 means restrict netblast to 1 CPU different from the taskqueue
CPUs using cpuset.

New results:

FBSD-11 SMP-8 1440+0   # no iflib, no EM_MULTIQUEUE
FBSD-11 SMP-8 1486+241 # no iflib, use EM_MULTIQUEUE (now saturate 
1Gbps)

FBSD-cur    SMP-8  533+0   # use iflib, use i386 with 4G KVA

iflib only decimates performance relative to the FreeBSD-11 version
with no EM_MULTIQUEUE, but EM_MULTIQUEUE gives better queueing using
more CPUs.  This gives the extra 10-20% of performance needed to
saturate the NIC and 1Gbps ethernet.  The FreeBSD-current version is
not directly comparable since using 4G KVA on i386 reduces performance
by about a factor of 2.5 for all loads with mostly small i/o's (for
128K disk i/o's the reduction is only 10-20%).  i386 ran at about the
same speed as amd64 when it had 1GB KVA, but I don't have any saved
results for amd64 to compare with precisely.  This is all with
security-related things like ibrs unavailable or turned off.

All versions use normal Intel interrupt moderation which gives an interrupt
rate of 8k/sec.

Old versions of em use a "fast" interrupt handler and a slow switch
to a taskqueue.  This gives a context switch rate of about 16k/sec.
In the SMP case, netblast normally runs on another CPU and I think it
fills h/w tx queue(s) synchronously, and the taskqueue only does minor
cleanups.  Old em also has a ping latency of about 10% smaller than
with iflib (73 usec instead of 80 usec after setting em.x.itr to 0 and
other tuning to kill interrupt moderation, and similar for a bge NIC
on the other end).  The synchronous queue filling probably improves
latency, but it is hard to see how it makes a difference of more than
1 usec.  73 is already too high.  An old PRO1000 Intel NIC has a latency
of only 50 usec on the same network.  The switch costs about 20 usec
of this.

iflib uses taskqueue more.  netblast normally runs on another CPU and
I think it only fills s/w tx queue(s) synchronously, and wakes up the
taskqueues for every packet.  The CPUs are almost fast enough to keep
up, and the system does about 1M context switches for this (in versions
other than i386 with 4G KVA).  That is slightly mor

Re: svn commit: r341578 - head/sys/dev/mlx5/mlx5_en

2018-12-17 Thread Andrew Gallatin

On 12/5/18 9:20 AM, Slava Shwartsman wrote:

Author: slavash
Date: Wed Dec  5 14:20:57 2018
New Revision: 341578
URL: 
https://svnweb.freebsd.org/changeset/base/341578

Log:
   mlx5en: Remove the DRBR and associated logic in the transmit path.
   
   The hardware queues are deep enough currently and using the DRBR and associated

   callbacks only leads to more task switching in the TX path. There is also a
race
   setting the queue_state which can lead to hung TX rings.
   


The point of DRBR in the tx path is not simply to provide a software 
ring for queuing excess packets.  Rather it provides a mechanism to

avoid lock contention by shoving a packet into the software ring, where
it will later be found & processed, rather than blocking the caller on
a mtx lock.   I'm concerned you may have introduced a performance
regression for use cases where you have N:1  or N:M lock contention 
where many threads on different cores are contending for the same tx 
queue.  The state of the art for this is no longer DRBR, but mp_ring,

as used by both cxgbe and iflib.

For well behaved workloads (like Netflix's), I don't anticipate
this being a performance issue.  However, I worry that this will impact
other workloads and that you should consider running some testing of
N:1 contention.   Eg, 128 netperfs running in parallel with only
a few nic tx rings.

Sorry for the late reply.. I'm behind on my -committers email.  If you
have not already MFC'ed this, you may want to reconsider.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r341095 - head/sys/net

2018-11-27 Thread Andrew Gallatin
Author: gallatin
Date: Tue Nov 27 20:01:05 2018
New Revision: 341095
URL: https://svnweb.freebsd.org/changeset/base/341095

Log:
  Use busdma unconditionally in iflib
  
  - Remove the complex mechanism to choose between using busdma
  and raw pmap_kextract at runtime.   The reduced complexity makes
  the code easier to read and maintain.
  
  - Fix a bug in the small packet receive path where clusters were
  repeatedly mapped but never unmapped. We now store the cluster's
  bus address and avoid re-mapping the cluster each time a small
  packet is received.
  
  This patch fixes bugs I've seen where ixl(4) will not even
  respond to ping without seeing DMAR faults.
  
  I see a small improvement (14%) on packet forwarding tests using
  a Haswell based Xeon E5-2697 v3.  Olivier sees a small
  regression (-3% to -6%) with lower end hardware.
  
  Reviewed by:  mmacy
  Not objected to by:   sbruno
  MFC after:8 weeks
  Sponsored by: Netflix, Inc
  Differential Revision:https://reviews.freebsd.org/D17901

Modified:
  head/sys/net/iflib.c
  head/sys/net/iflib_private.h

Modified: head/sys/net/iflib.c
==
--- head/sys/net/iflib.cTue Nov 27 19:50:58 2018(r341094)
+++ head/sys/net/iflib.cTue Nov 27 20:01:05 2018(r341095)
@@ -92,15 +92,6 @@ __FBSDID("$FreeBSD$");
 
 #include "ifdi_if.h"
 
-#if defined(__i386__) || defined(__amd64__)
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#endif
-
 #ifdef PCI_IOV
 #include 
 #endif
@@ -282,24 +273,16 @@ iflib_get_sctx(if_ctx_t ctx)
 #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
 #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
 
-#define RX_SW_DESC_MAP_CREATED (1 << 0)
-#define TX_SW_DESC_MAP_CREATED (1 << 1)
-#define RX_SW_DESC_INUSE(1 << 3)
-#define TX_SW_DESC_MAPPED   (1 << 4)
-
-#defineM_TOOBIGM_PROTO1
-
 typedef struct iflib_sw_rx_desc_array {
bus_dmamap_t*ifsd_map; /* bus_dma maps for packet */
struct mbuf **ifsd_m;   /* pkthdr mbufs */
caddr_t *ifsd_cl;  /* direct cluster pointer for rx */
-   uint8_t *ifsd_flags;
+   bus_addr_t  *ifsd_ba;  /* bus addr of cluster for rx */
 } iflib_rxsd_array_t;
 
 typedef struct iflib_sw_tx_desc_array {
bus_dmamap_t*ifsd_map; /* bus_dma maps for packet */
struct mbuf**ifsd_m;   /* pkthdr mbufs */
-   uint8_t *ifsd_flags;
 } if_txsd_vec_t;
 
 
@@ -940,9 +923,8 @@ iflib_netmap_txsync(struct netmap_kring *kring, int fl
if_ctx_t ctx = ifp->if_softc;
iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
 
-   if (txq->ift_sds.ifsd_map)
-   bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
-   BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+   bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
+   BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 
/*
@@ -1024,9 +1006,8 @@ iflib_netmap_txsync(struct netmap_kring *kring, int fl
kring->nr_hwcur = nm_i;
 
/* synchronize the NIC ring */
-   if (txq->ift_sds.ifsd_map)
-   bus_dmamap_sync(txq->ift_desc_tag, 
txq->ift_ifdi->idi_map,
-   BUS_DMASYNC_PREREAD | 
BUS_DMASYNC_PREWRITE);
+   bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
+   BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
/* (re)start the tx unit up to slot nic_i (excluded) */
ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
@@ -1129,9 +1110,8 @@ iflib_netmap_rxsync(struct netmap_kring *kring, int fl
error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, 
);
ring->slot[nm_i].len = error ? 0 : ri.iri_len - 
crclen;
ring->slot[nm_i].flags = 0;
-   if (fl->ifl_sds.ifsd_map)
-   bus_dmamap_sync(fl->ifl_ifdi->idi_tag,
-   
fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
+   bus_dmamap_sync(fl->ifl_ifdi->idi_tag,
+   fl->ifl_sds.ifsd_map[nic_i], 
BUS_DMASYNC_POSTREAD);
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
@@ -1210,9 +1190,6 @@ iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
if (slot == NULL)
return;
-   if (txq->ift_sds.ifsd_map == NULL)
-   return;
-
for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
 
/*
@@ 

Re: svn commit: r340097 - in head/sys: kern sys

2018-11-12 Thread Andrew Gallatin

On 11/2/18 11:43 PM, Matt Macy wrote:

Author: mmacy
Date: Sat Nov  3 03:43:32 2018
New Revision: 340097
URL: 
https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_340097=DwIDaQ=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=C46M75X_gZcJY3aXGYy_P4DQJhD-uEFU00BP6AzHPik=JvPbkoXDB3zzo2IjmopaQxJ3kRcIwzosrpY4elq80LQ=

Log:
   Convert epoch to read / write records per cpu
   
   In discussing D17503 "Run epoch calls sooner and more reliably" with

   sbahra@ we came to the conclusion that epoch is currently misusing the
   ck_epoch API. It isn't safe to do a "write side" operation (ck_epoch_call
   or ck_epoch_poll) in the middle of a "read side" section. Since, by 
definition,
   it's possible to be preempted during the middle of an EPOCH_PREEMPT
   epoch the GC task might call ck_epoch_poll or another thread might call
   ck_epoch_call on the same section. The right solution is ultimately to change
   the way that ck_epoch works for this use case. However, as a stopgap for
   12 we agreed to simply have separate records for each use case.
   
   Tested by: pho@
   
   MFC after:	3 days



Hi Matt,

Can you elaborate why this is needed?

I seem to recall that Samy Al Bahra made some upstream changes to CK 
that modified the CK API to legitimize our use of the API, and these 
were brought into FreeBSD in r339375. Were these insufficient?


Also, it would be great if you could get review on epoch changes. Epoch 
is totally awesome, and I'm thrilled that you brought it in.  However, 
it is very tricky, and it seems like changes here could benefit from review.


Thanks,

Drew



___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r339043 - in head/sys: kern vm x86/acpica

2018-10-01 Thread Andrew Gallatin

On 10/1/18 10:14 AM, Andrew Gallatin wrote:

Author: gallatin
Date: Mon Oct  1 14:14:21 2018
New Revision: 339043
URL: 
https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_339043=DwIDaQ=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=vFxrWMxnRsVgXYUUeDU3mY3EdLAlur-SanLWzMxFWow=a6s6FleHIdYhZF1D_SqEOf9apgxdQ2RBvF0HcKicCus=

Log:
   Allow empty NUMA memory domains to support Threadripper2
   
   The AMD Threadripper 2990WX is basically a slightly crippled Epyc.

   Rather than having 4 memory controllers, one per NUMA domain, it has
   only 2  memory controllers enabled. This means that only 2 of the
   4 NUMA domains can be populated with physical memory, and the
   others are empty.
   
   Add support to FreeBSD for empty NUMA domains by:
   
   - creating empty memory domains when parsing the SRAT table,

   rather than failing to parse the table
   - not running the pageout daemon threads in empty domains
   - adding defensive code to UMA to avoid allocating from empty domains
   - adding defensive code to cpuset to avoid binding to an empty domain
   Thanks to Jeff for suggesting this strategy.
   
   Reviewed by:	alc, markj

   Approved by: re (gjb@)
   Differential Revision:   https://reviews.freebsd.org/D1683


Whoops, cut-and-paste error.  The Differential Revision should have 
been: https://reviews.freebsd.org/D16836


Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r339043 - in head/sys: kern vm x86/acpica

2018-10-01 Thread Andrew Gallatin
Author: gallatin
Date: Mon Oct  1 14:14:21 2018
New Revision: 339043
URL: https://svnweb.freebsd.org/changeset/base/339043

Log:
  Allow empty NUMA memory domains to support Threadripper2
  
  The AMD Threadripper 2990WX is basically a slightly crippled Epyc.
  Rather than having 4 memory controllers, one per NUMA domain, it has
  only 2  memory controllers enabled. This means that only 2 of the
  4 NUMA domains can be populated with physical memory, and the
  others are empty.
  
  Add support to FreeBSD for empty NUMA domains by:
  
  - creating empty memory domains when parsing the SRAT table,
  rather than failing to parse the table
  - not running the pageout daemon threads in empty domains
  - adding defensive code to UMA to avoid allocating from empty domains
  - adding defensive code to cpuset to avoid binding to an empty domain
  Thanks to Jeff for suggesting this strategy.
  
  Reviewed by:  alc, markj
  Approved by:  re (gjb@)
  Differential Revision:https://reviews.freebsd.org/D1683

Modified:
  head/sys/kern/kern_cpuset.c
  head/sys/vm/uma_core.c
  head/sys/vm/vm_kern.c
  head/sys/vm/vm_pageout.c
  head/sys/vm/vm_pagequeue.h
  head/sys/x86/acpica/srat.c

Modified: head/sys/kern/kern_cpuset.c
==
--- head/sys/kern/kern_cpuset.c Mon Oct  1 14:05:31 2018(r339042)
+++ head/sys/kern/kern_cpuset.c Mon Oct  1 14:14:21 2018(r339043)
@@ -65,7 +65,12 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
+#include 
+#include 
+#include 
 
 #ifdef DDB
 #include 
@@ -479,6 +484,26 @@ _domainset_create(struct domainset *domain, struct dom
 }
 
 /*
+ * Are any of the domains in the mask empty? If so, silently
+ * remove them.  If only empty domains are present, we must
+ * return failure.
+ */
+static bool
+domainset_empty_vm(struct domainset *domain)
+{
+   int i, max;
+
+   max = DOMAINSET_FLS(&domain->ds_mask) + 1;
+   for (i = 0; i < max; i++) {
+   if (DOMAINSET_ISSET(i, &domain->ds_mask) &&
+   VM_DOMAIN_EMPTY(i))
+   DOMAINSET_CLR(i, &domain->ds_mask);
+   }
+
+   return (DOMAINSET_EMPTY(&domain->ds_mask));
+}
+
+/*
  * Create or lookup a domainset based on the key held in 'domain'.
  */
 struct domainset *
@@ -1360,6 +1385,7 @@ domainset_zero(void)
DOMAINSET_SET(i, >ds_mask);
dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH;
dset->ds_prefer = -1;
+   (void)domainset_empty_vm(dset);
curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
 
domainset_copy(dset, );
@@ -2086,6 +2112,13 @@ kern_cpuset_setdomain(struct thread *td, cpulevel_t le
/* This will be constrained by domainset_shadow(). */
DOMAINSET_FILL(_mask);
}
+
+   /*
+*  When given an impossible policy, fall back to interleaving
+*  across all domains
+*/
+   if (domainset_empty_vm())
+   domainset_copy(, );
 
switch (level) {
case CPU_LEVEL_ROOT:

Modified: head/sys/vm/uma_core.c
==
--- head/sys/vm/uma_core.c  Mon Oct  1 14:05:31 2018(r339042)
+++ head/sys/vm/uma_core.c  Mon Oct  1 14:14:21 2018(r339043)
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2469,9 +2470,11 @@ zalloc_start:
if (bucket != NULL)
bucket_free(zone, bucket, udata);
 
-   if (zone->uz_flags & UMA_ZONE_NUMA)
+   if (zone->uz_flags & UMA_ZONE_NUMA) {
domain = PCPU_GET(domain);
-   else
+   if (VM_DOMAIN_EMPTY(domain))
+   domain = UMA_ANYDOMAIN;
+   } else
domain = UMA_ANYDOMAIN;
 
/* Short-circuit for zones without buckets and low memory. */
@@ -2647,7 +2650,11 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdo
rdomain = 0;
rr = rdomain == UMA_ANYDOMAIN;
if (rr) {
-   keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
+   start = keg->uk_cursor;
+   do {
+   keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
+   domain = keg->uk_cursor;
+   } while (VM_DOMAIN_EMPTY(domain) && domain != start);
domain = start = keg->uk_cursor;
/* Only block on the second pass. */
if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK)
@@ -2698,8 +2705,11 @@ again:
LIST_INSERT_HEAD(>ud_part_slab, slab, us_link);
return (slab);
}
-   if (rr)
-   domain = (domain + 1) % vm_ndomains;
+   if (rr) {
+   do {
+   domain = (domain + 1) % vm_ndomains;
+  

svn commit: r338341 - head/sys/netinet6

2018-08-27 Thread Andrew Gallatin
Author: gallatin
Date: Mon Aug 27 18:13:20 2018
New Revision: 338341
URL: https://svnweb.freebsd.org/changeset/base/338341

Log:
  Reject IPv4 SO_REUSEPORT_LB groups when looking up an IPv6 listening socket
  
  Similar to how the IPv4 code will reject an IPv6 LB group,
  we must ignore IPv4 LB groups when looking up an IPv6
  listening socket.   If this is not done, a port only match
  may return an IPv4 socket, which causes problems (like
  sending IPv6 packets with a hopcount of 0, making them unrouteable).
  
  Thanks to rrs for all the work to diagnose this.
  
  Approved by:  re (rgrimes)
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D16899

Modified:
  head/sys/netinet6/in6_pcb.c

Modified: head/sys/netinet6/in6_pcb.c
==
--- head/sys/netinet6/in6_pcb.c Mon Aug 27 15:20:42 2018(r338340)
+++ head/sys/netinet6/in6_pcb.c Mon Aug 27 18:13:20 2018(r338341)
@@ -901,6 +901,10 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
 * - Load balanced does not contain IPv4 mapped INET6 wild sockets.
 */
LIST_FOREACH(grp, hdr, il_list) {
+#ifdef INET
+   if (!(grp->il_vflag & INP_IPV6))
+   continue;
+#endif
if (grp->il_lport == lport) {
idx = 0;
int pkt_hash = INP_PCBLBGROUP_PKTHASH(
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r337709 - head/sys/net

2018-08-14 Thread Andrew Gallatin
It could probably be MFCed if somebody could verify that it causes no 
harm in 11.


I have no way to test lagg/lacp on 11, so I did not mark it for MFC.

Drew

On 8/13/18 9:58 PM, Kubilay Kocak wrote:

On 14/08/2018 12:13 am, Andrew Gallatin wrote:

Author: gallatin
Date: Mon Aug 13 14:13:25 2018
New Revision: 337709
URL: 
https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_337709=DwICaQ=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=1cWDZxkgrPXh1V368in81GXeCx7nVoSbXY9khM0W2r0=psymcTlMReu-E0h72SEySpgbsxzW7KxTwtQDKS8ocBY=

Log:
   lagg: allow lacp to manage the link state
   
   Lacp needs to manage the link state itself. Unlike other

   lagg protocols, the ability of lacp to pass traffic
   depends not only on the lagg members having link, but also
   on the lacp protocol converging to a distributing state with the
   link partner.
   
   If we prematurely mark the link as up, then we will send a

   gratuitous arp (via arp_handle_ifllchange()) before the lacp
   interface is capable of passing traffic. When this happens,
   the gratuitous arp is lost, and our link partner may cache
   a stale mac address (eg, when the base mac address for the
   lagg bundle changes, due to a BIOS change re-ordering NIC
   unit numbers)


Hi Andrew

Can this be MFC'd?


   Reviewed by: jtl, hselasky
   Sponsored by: Netflix

Modified:
   head/sys/net/ieee8023ad_lacp.c
   head/sys/net/if_lagg.c

Modified: head/sys/net/ieee8023ad_lacp.c
==
--- head/sys/net/ieee8023ad_lacp.c  Mon Aug 13 13:58:45 2018
(r337708)
+++ head/sys/net/ieee8023ad_lacp.c  Mon Aug 13 14:13:25 2018
(r337709)
@@ -711,6 +711,8 @@ lacp_disable_distributing(struct lacp_port *lp)
}
  
  	lp->lp_state &= ~LACP_STATE_DISTRIBUTING;

+   if_link_state_change(sc->sc_ifp,
+   sc->sc_active ? LINK_STATE_UP : LINK_STATE_DOWN);
  }
  
  static void

@@ -745,6 +747,9 @@ lacp_enable_distributing(struct lacp_port *lp)
} else
/* try to become the active aggregator */
lacp_select_active_aggregator(lsc);
+
+   if_link_state_change(sc->sc_ifp,
+   sc->sc_active ? LINK_STATE_UP : LINK_STATE_DOWN);
  }
  
  static void


Modified: head/sys/net/if_lagg.c
==
--- head/sys/net/if_lagg.c  Mon Aug 13 13:58:45 2018(r337708)
+++ head/sys/net/if_lagg.c  Mon Aug 13 14:13:25 2018(r337709)
@@ -1737,6 +1737,10 @@ lagg_linkstate(struct lagg_softc *sc)
  
  	LAGG_XLOCK_ASSERT(sc);
  
+	/* LACP handles link state itself */

+   if (sc->sc_proto == LAGG_PROTO_LACP)
+   return;
+
/* Our link is considered up if at least one of our ports is active */
LAGG_RLOCK();
CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
___
svn-src-head@freebsd.org mailing list
https://urldefense.proofpoint.com/v2/url?u=https-3A__lists.freebsd.org_mailman_listinfo_svn-2Dsrc-2Dhead=DwICaQ=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=1cWDZxkgrPXh1V368in81GXeCx7nVoSbXY9khM0W2r0=SLnmQNpAX0j6HgJ5_yIcrQJAf9xCWtNqoEJ2qbOy7_E=
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"



___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r337709 - head/sys/net

2018-08-13 Thread Andrew Gallatin
Author: gallatin
Date: Mon Aug 13 14:13:25 2018
New Revision: 337709
URL: https://svnweb.freebsd.org/changeset/base/337709

Log:
  lagg: allow lacp to manage the link state
  
  Lacp needs to manage the link state itself. Unlike other
  lagg protocols, the ability of lacp to pass traffic
  depends not only on the lagg members having link, but also
  on the lacp protocol converging to a distributing state with the
  link partner.
  
  If we prematurely mark the link as up, then we will send a
  gratuitous arp (via arp_handle_ifllchange()) before the lacp
  interface is capable of passing traffic. When this happens,
  the gratuitous arp is lost, and our link partner may cache
  a stale mac address (eg, when the base mac address for the
  lagg bundle changes, due to a BIOS change re-ordering NIC
  unit numbers)
  
  Reviewed by: jtl, hselasky
  Sponsored by: Netflix

Modified:
  head/sys/net/ieee8023ad_lacp.c
  head/sys/net/if_lagg.c

Modified: head/sys/net/ieee8023ad_lacp.c
==
--- head/sys/net/ieee8023ad_lacp.c  Mon Aug 13 13:58:45 2018
(r337708)
+++ head/sys/net/ieee8023ad_lacp.c  Mon Aug 13 14:13:25 2018
(r337709)
@@ -711,6 +711,8 @@ lacp_disable_distributing(struct lacp_port *lp)
}
 
lp->lp_state &= ~LACP_STATE_DISTRIBUTING;
+   if_link_state_change(sc->sc_ifp,
+   sc->sc_active ? LINK_STATE_UP : LINK_STATE_DOWN);
 }
 
 static void
@@ -745,6 +747,9 @@ lacp_enable_distributing(struct lacp_port *lp)
} else
/* try to become the active aggregator */
lacp_select_active_aggregator(lsc);
+
+   if_link_state_change(sc->sc_ifp,
+   sc->sc_active ? LINK_STATE_UP : LINK_STATE_DOWN);
 }
 
 static void

Modified: head/sys/net/if_lagg.c
==
--- head/sys/net/if_lagg.c  Mon Aug 13 13:58:45 2018(r337708)
+++ head/sys/net/if_lagg.c  Mon Aug 13 14:13:25 2018(r337709)
@@ -1737,6 +1737,10 @@ lagg_linkstate(struct lagg_softc *sc)
 
LAGG_XLOCK_ASSERT(sc);
 
+   /* LACP handles link state itself */
+   if (sc->sc_proto == LAGG_PROTO_LACP)
+   return;
+
/* Our link is considered up if at least one of our ports is active */
LAGG_RLOCK();
CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r335916 - head/sys/conf

2018-07-07 Thread Andrew Gallatin

On 07/05/18 19:59, John Baldwin wrote:


You misunderstand.  /usr/local/sys/modules would hold module sources so that
they can be recompiled when building a kernel without having to rebuild the
package or reinstall the package.  Binary modules would continue to be
installed in /boot/modules.



This is very similar to the approach that many Linux distributions take
with DKMS.  The kernel sources for out-of-tree modules are kept around,
and every time a kernel is installed, its new header files are used to
re-compile the out-of-tree module.   Similarly, when you install a
package containing a kernel module, it is re-compiled and installed
for every installed kernel.

One thing that was tangentially brought up is that the ability
to compile out-of-tree modules requires keeping the kernel-headers
around.  So we may need to identify all the headers that a module might
need, and install them in /boot/$KERNEL/sys or some-such.  This would
be needed if, for example, we wanted to install a new Nvidia or Virtual
Box module and have it work for older installed kernel versions too
(eg, across ABI breaking changes in -current).

This would certainly make life easier for people running -current.
This system works quite well on Linux.  For comparison, I used an
Ubuntu-based desktop with Nvidia graphics at a previous employer,
and a FreeBSD-current desktop w/Nvidia graphics now. I've been left w/o
graphics accidentally much more often on FreeBSD than I ever
had been on Ubuntu, even when compiling my own kernels from git.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r336042 - head/sys/dev/cxgbe/common

2018-07-07 Thread Andrew Gallatin

On 07/06/18 15:33, Navdeep Parhar wrote:



Log:
   cxgbe(4): Assume that any unknown flash on the card is 4MB and has 64KB
   sectors, instead of refusing to attach to the card.
   


Thank you!

Drew

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r335967 - head/sys/dev/mxge

2018-07-07 Thread Andrew Gallatin

On 07/05/18 17:14, Rick Macklem wrote:

Andrew Gallatin wrote:
On 7/4/18 9:20 PM, Rodney W. Grimes wrote:
[stuff snipped]


It is using a magic constant twice, where one has a
derived value that is dependent on the value of the other.
That is bad and error prone and does not document that
one depends on the other.  Please fix this.  Or at least
make 65536 a #define so that it only needs changed one
place and clearly shows the interdependence of these
values.


To me, 65536 is one of the few cases where the magic number is
more meaningful than a name.  But fine, if you feel that
strongly about it, I'll change it for you.


Btw, in general, if_hw_tsomax and if_hw_tsomaxsegsize are not
related or the same value. It just happens that they both appear
to be related to 64K in this case. (I believe this is fairly common,
since the original Microsoft "standard" used 64K as a limit, since
it was stored in 16bits.)


Yes; exactly.


if_hw_tsomax is the maximum size of the entire TSO segment,
including MAC level headers (commonly 64K, due to Microsoft...
but could be larger if the hardware guys chose to do so).


Given that we do TSO like Linux, and not like MS (meaning
we express the size of the pre-segmented packet using
a 16-bit value in the IPv4/IPv6 header), supporting more
than 64K is not possible in FreeBSD, so I'm basically
saying "nerf this constraint".

MS windows does it better / different; they express the
size of the pre-segmented packet in packet metadata,
leaving ip->ip_len = 0.  This is better, since
then the pseudo hdr checksum in the template header can be
re-used (with the len added) for every segment by the NIC.
If you've ever seen a driver set ip->ip_len = 0, and re-calc
the pseudo-hdr checksum, that's why.   This is also why
MS LSOv2 can support TSO of packets larger than 64K, since they're
not constrained by the 16-bit value in the IP{4,6} header.
The value of TSO larger than 64K is questionable at best though.
Without pacing, you'd just get more packets dropped when
talking across the internet..


if_hw_tsomaxsegsize is the maximum size of contiguous memory
that a "chunk" of the TSO segment can be stored in for handling by
the driver's transmit side. Since higher


And this is what I object to.  TCP should not care about
this.  Drivers should use busdma, or otherwise be capable of
chopping large contig regions down to chunks that they can
handle.   If a driver can really only handle 2K, then it should
be having busdma give it an s/g list that is 2x as long, not having
TCP call m_dupcl() 2x as often on page-sized data generated by
sendfile (or more on non-x86 with larger pages).


level code such as NFS (and iSCSI, I think?) uses MCLBYTE clusters,
anything 2K or higher normally works the same.  Not sure about
sosend(), but I think it also copies the data into MCLBYTE clusters?
This would change if someday jumbo mbuf clusters become the norm.
(I tried changing the NFS code to use jumbo clusters, but it would
  result in fragmentation of the memory used for mbuf cluster allocation,
  so I never committed it.)



At least for sendfile(), vm pages are wrapped up and attached to
mbufs, so you have 4K (and potentially much more on non-x86).
Doesn't NFS do something similar when sending data, or do you copy
into clusters?

I have changes which I have not upstreamed yet which enhance mbufs to
carry TLS metadata & vector of physical addresses (which I call
unmapped mbufs) for sendfile and kernel TLS.  As part of that,
sosend (for kTLS) can allocate many pages and attach them to one mbuf.
The idea (for kTLS) is that you can keep an entire TLS record (with
framing information) in a single unmapped mbuf, which saves a
huge amount of CPU which would be lost to cache misses doing
pointer-chasing of really long mbuf chains (TLS hdrs and trailers
are generally 13 and 16 bytes).  The goal was to regain CPU
during Netflix's transition to https streaming.  However, it
is unintentionally quite helpful on i386, since it reduces
overhead from having to map/unmap sf_bufs. FWIW, these mbufs
have been in production at Netflix for over a year, and carry
a large fraction of the world's internet traffic :)



rick
ps: And I'll admit I don't find 65536 very magic;-)



:)

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r335973 - head/sys/dev/mxge

2018-07-04 Thread Andrew Gallatin
Author: gallatin
Date: Thu Jul  5 02:43:10 2018
New Revision: 335973
URL: https://svnweb.freebsd.org/changeset/base/335973

Log:
  mxge: replace 65536 with IP_MAXPACKET in tso settings.

Modified:
  head/sys/dev/mxge/if_mxge.c

Modified: head/sys/dev/mxge/if_mxge.c
==
--- head/sys/dev/mxge/if_mxge.c Thu Jul  5 02:08:57 2018(r335972)
+++ head/sys/dev/mxge/if_mxge.c Thu Jul  5 02:43:10 2018(r335973)
@@ -4984,9 +4984,9 @@ mxge_attach(device_t dev)
ifp->if_ioctl = mxge_ioctl;
ifp->if_start = mxge_start;
ifp->if_get_counter = mxge_get_counter;
-   ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+   ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + 
ETHER_VLAN_ENCAP_LEN);
ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
-   ifp->if_hw_tsomaxsegsize = 65536;
+   ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
/* Initialise the ifmedia structure */
ifmedia_init(&sc->media, 0, mxge_media_change,
 mxge_media_status);
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r335967 - head/sys/dev/mxge

2018-07-04 Thread Andrew Gallatin

On 7/4/18 9:20 PM, Rodney W. Grimes wrote:

On 07/04/18 15:46, Rodney W. Grimes wrote:

Author: gallatin
Date: Wed Jul  4 19:29:06 2018
New Revision: 335967
URL: 
https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_335967=DwICAg=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=2rIiw5AUJ2ishkBkygGMa_9kr0LJOaonX8ni3BF2BHk=MwCt6_IgNah0XklsYThsXFcwZD54Xl78TRlnFXJ4zWs=

Log:
mxge: choose appropriate values for hw tso

Modified:
head/sys/dev/mxge/if_mxge.c

Modified: head/sys/dev/mxge/if_mxge.c
==
--- head/sys/dev/mxge/if_mxge.c Wed Jul  4 18:54:44 2018(r335966)
+++ head/sys/dev/mxge/if_mxge.c Wed Jul  4 19:29:06 2018(r335967)
@@ -4984,6 +4984,9 @@ mxge_attach(device_t dev)
ifp->if_ioctl = mxge_ioctl;
ifp->if_start = mxge_start;
ifp->if_get_counter = mxge_get_counter;
+   ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);


Would not this be more accurate (need to reorder assigns):
ifp->if_hw_tsomax = ifp->if_hw_tsomaxsegsize - (ETHER_HDR_LEN + 
ETHER_VLAN_ENCAP_LEN);


+   ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
+   ifp->if_hw_tsomaxsegsize = 65536;



It seems simpler as-is to me.


It is using a magic constant twice, where one has a
derived value that is dependent on the value of the other.
That is bad and error prone and does not document that
one depends on the other.  Please fix this.  Or at least
make 65536 a #define so that it only needs changed one
place and clearly shows the interdependence of these
values.


To me, 65536 is one of the few cases where the magic number is
more meaningful than a name.  But fine, if you feel that
strongly about it, I'll change it for you.



___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r335967 - head/sys/dev/mxge

2018-07-04 Thread Andrew Gallatin

On 07/04/18 15:46, Rodney W. Grimes wrote:

Author: gallatin
Date: Wed Jul  4 19:29:06 2018
New Revision: 335967
URL: 
https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_335967=DwICAg=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=2rIiw5AUJ2ishkBkygGMa_9kr0LJOaonX8ni3BF2BHk=MwCt6_IgNah0XklsYThsXFcwZD54Xl78TRlnFXJ4zWs=

Log:
   mxge: choose appropriate values for hw tso

Modified:
   head/sys/dev/mxge/if_mxge.c

Modified: head/sys/dev/mxge/if_mxge.c
==
--- head/sys/dev/mxge/if_mxge.c Wed Jul  4 18:54:44 2018(r335966)
+++ head/sys/dev/mxge/if_mxge.c Wed Jul  4 19:29:06 2018(r335967)
@@ -4984,6 +4984,9 @@ mxge_attach(device_t dev)
ifp->if_ioctl = mxge_ioctl;
ifp->if_start = mxge_start;
ifp->if_get_counter = mxge_get_counter;
+   ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);


Would not this be more accurate (need to reorder assigns):
ifp->if_hw_tsomax = ifp->if_hw_tsomaxsegsize - (ETHER_HDR_LEN + 
ETHER_VLAN_ENCAP_LEN);


+   ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
+   ifp->if_hw_tsomaxsegsize = 65536;



It seems simpler as-is to me.  Looking around at other drivers, I see
at least one (cxgbe) which does the same thing.

After doing the grep, I'm more concerned with drivers which may
be setting their tsomaxsegsize  incorrectly to be too small and
hurting their performance by causing TCP to chop needlessly
at smaller boundaries which are already enforced by their
busdma tags.  PAGE_SIZE, which seems to be the common mistaken
size, won't hurt too much I suppose.  But the default of 2K
is probably not very good.

Drew

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r335967 - head/sys/dev/mxge

2018-07-04 Thread Andrew Gallatin
Author: gallatin
Date: Wed Jul  4 19:29:06 2018
New Revision: 335967
URL: https://svnweb.freebsd.org/changeset/base/335967

Log:
  mxge: choose appropriate values for hw tso

Modified:
  head/sys/dev/mxge/if_mxge.c

Modified: head/sys/dev/mxge/if_mxge.c
==
--- head/sys/dev/mxge/if_mxge.c Wed Jul  4 18:54:44 2018(r335966)
+++ head/sys/dev/mxge/if_mxge.c Wed Jul  4 19:29:06 2018(r335967)
@@ -4984,6 +4984,9 @@ mxge_attach(device_t dev)
ifp->if_ioctl = mxge_ioctl;
ifp->if_start = mxge_start;
ifp->if_get_counter = mxge_get_counter;
+   ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+   ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
+   ifp->if_hw_tsomaxsegsize = 65536;
/* Initialise the ifmedia structure */
ifmedia_init(&sc->media, 0, mxge_media_change,
 mxge_media_status);
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r335966 - head/sys/dev/mxge

2018-07-04 Thread Andrew Gallatin

On 07/04/18 14:54, Andrew Gallatin wrote:


   mxge: Add SIOCGI2C support for devices with SFP/XFP cages



Note that I do not have any XFP devices to test with, only SFP
and CX4.   If this causes problems for XFP devices, I can
restrict  SIOCGI2C support to just SFP if needed.

Drew

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r335966 - head/sys/dev/mxge

2018-07-04 Thread Andrew Gallatin
Author: gallatin
Date: Wed Jul  4 18:54:44 2018
New Revision: 335966
URL: https://svnweb.freebsd.org/changeset/base/335966

Log:
  mxge: Add SIOCGI2C support for devices with SFP/XFP cages

Modified:
  head/sys/dev/mxge/if_mxge.c

Modified: head/sys/dev/mxge/if_mxge.c
==
--- head/sys/dev/mxge/if_mxge.c Wed Jul  4 18:03:19 2018(r335965)
+++ head/sys/dev/mxge/if_mxge.c Wed Jul  4 18:54:44 2018(r335966)
@@ -4154,10 +4154,50 @@ mxge_media_status(struct ifnet *ifp, struct ifmediareq
 }
 
 static int
+mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
+{
+   mxge_cmd_t cmd;
+   uint32_t i2c_args;
+   int i, ms, err;
+
+
+   if (i2c->dev_addr != 0xA0 &&
+   i2c->dev_addr != 0xA2)
+   return (EINVAL);
+   if (i2c->len > sizeof(i2c->data))
+   return (EINVAL);
+
+   for (i = 0; i < i2c->len; i++) {
+   i2c_args = i2c->dev_addr << 0x8;
+   i2c_args |= i2c->offset + i;
+   cmd.data0 = 0;   /* just fetch 1 byte, not all 256 */
+   cmd.data1 = i2c_args;
+   err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
+
+   if (err != MXGEFW_CMD_OK)
+   return (EIO);
+   /* now we wait for the data to be cached */
+   cmd.data0 = i2c_args & 0xff;
+   err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
+   for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
+   cmd.data0 = i2c_args & 0xff;
+   err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
+   if (err == EBUSY)
+   DELAY(1000);
+   }
+   if (err != MXGEFW_CMD_OK)
+   return (EIO);
+   i2c->data[i] = cmd.data0;
+   }
+   return (0);
+}
+
+static int
 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
mxge_softc_t *sc = ifp->if_softc;
struct ifreq *ifr = (struct ifreq *)data;
+   struct ifi2creq i2c;
int err, mask;
 
err = 0;
@@ -4292,6 +4332,26 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t 
>media, command);
break;
 
+   case SIOCGI2C:
+   if (sc->connector != MXGE_XFP &&
+   sc->connector != MXGE_SFP) {
+   err = ENXIO;
+   break;
+   }
+   err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
+   if (err != 0)
+   break;
+   mtx_lock(&sc->driver_mtx);
+   if (sc->dying) {
+   mtx_unlock(&sc->driver_mtx);
+   return (EINVAL);
+   }
+   err = mxge_fetch_i2c(sc, &i2c);
+   mtx_unlock(&sc->driver_mtx);
+   if (err == 0)
+   err = copyout(&i2c, ifr->ifr_ifru.ifru_data,
+   sizeof(i2c));
+   break;
default:
err = ether_ioctl(ifp, command, data);
break;
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r335957 - head/sys/dev/mxge

2018-07-04 Thread Andrew Gallatin
Author: gallatin
Date: Wed Jul  4 14:25:38 2018
New Revision: 335957
URL: https://svnweb.freebsd.org/changeset/base/335957

Log:
  mxge: fix panic at module unload
  
  r333175 (multicast changes) exposed a bug where
  mxge was not checking to see if the driver was being
  unloaded while handling ioctls that touch hardware.
  As a result, now that in6m_disconnect() is run from
  an async gtaskq, it was busy-waiting in mxge_send_cmd()
  while the mcast list was destroyed.

Modified:
  head/sys/dev/mxge/if_mxge.c

Modified: head/sys/dev/mxge/if_mxge.c
==
--- head/sys/dev/mxge/if_mxge.c Wed Jul  4 14:20:19 2018(r335956)
+++ head/sys/dev/mxge/if_mxge.c Wed Jul  4 14:25:38 2018(r335957)
@@ -4193,6 +4193,10 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t 
case SIOCADDMULTI:
case SIOCDELMULTI:
mtx_lock(&sc->driver_mtx);
+   if (sc->dying) {
+   mtx_unlock(&sc->driver_mtx);
+   return (EINVAL);
+   }
mxge_set_multicast_list(sc);
mtx_unlock(&sc->driver_mtx);
break;
@@ -4278,6 +4282,10 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t 
 
case SIOCGIFMEDIA:
mtx_lock(&sc->driver_mtx);
+   if (sc->dying) {
+   mtx_unlock(&sc->driver_mtx);
+   return (EINVAL);
+   }
mxge_media_probe(sc);
mtx_unlock(&sc->driver_mtx);
err = ifmedia_ioctl(ifp, (struct ifreq *)data,
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r334143 - head/sys/dev/cxgbe

2018-05-24 Thread Andrew Gallatin

On 05/24/18 06:18, Navdeep Parhar wrote:


Log:
   cxgbe(4): Data path for rate-limited tx.
   
   This is hardware support for the SO_MAX_PACING_RATE sockopt (see

   setsockopt(2)), which is available in kernels built with "options
   RATELIMIT".
   
   Relnotes:	Yes

   Sponsored by:Chelsio Communications



Hurray!  Thanks so much for supporting this!

Drew

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r333793 - head/usr.sbin/pmcannotate

2018-05-18 Thread Andrew Gallatin
Author: gallatin
Date: Fri May 18 14:14:04 2018
New Revision: 333793
URL: https://svnweb.freebsd.org/changeset/base/333793

Log:
  Teach pmcannotate about $TMPDIR and _PATH_TMP
  
  Convert pmcannotate to using $TMPDIR and _PATH_TMP rather than hard
  coding /tmp for temporary files.  Pmcannotate sometimes needs quite a
  lot of space to store the output from objdump, and will fail in odd
  ways if that output is truncated due to lack of space in /tmp.
  
  Reviewed by:  jtl
  Sponsored by: Netflix

Modified:
  head/usr.sbin/pmcannotate/pmcannotate.c

Modified: head/usr.sbin/pmcannotate/pmcannotate.c
==
--- head/usr.sbin/pmcannotate/pmcannotate.c Fri May 18 13:49:12 2018
(r333792)
+++ head/usr.sbin/pmcannotate/pmcannotate.c Fri May 18 14:14:04 2018
(r333793)
@@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
 #include 
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -47,7 +48,7 @@ __FBSDID("$FreeBSD$");
 #defineFNBUFF  512
 #defineLNBUFF  512
 
-#defineTMPPATH "/tmp/pmcannotate.XXXXXX"
+#defineTMPNAME "pmcannotate.XXXXXX"
 
 #defineFATAL(ptr, x ...) do {  
\
fqueue_deleteall(); \
@@ -671,7 +672,8 @@ usage(const char *progname)
 int
 main(int argc, char *argv[])
 {
-   char buffer[LNBUFF], fname[FNBUFF], tbfl[] = TMPPATH, tofl[] = TMPPATH;
+   char buffer[LNBUFF], fname[FNBUFF];
+   char *tbfl, *tofl, *tmpdir;
char tmpf[MAXPATHLEN * 2 + 50];
float limit;
char *bin, *exec, *kfile, *ofile;
@@ -721,6 +723,17 @@ main(int argc, char *argv[])
exec);
 
bzero(tmpf, sizeof(tmpf));
+   tmpdir = getenv("TMPDIR");
+   if (tmpdir == NULL) {
+   asprintf(, "%s/%s", _PATH_TMP, TMPNAME);
+   asprintf(, "%s/%s", _PATH_TMP, TMPNAME);
+   } else {
+   asprintf(, "%s/%s", tmpdir, TMPNAME);
+   asprintf(, "%s/%s", tmpdir, TMPNAME);
+   }
+   if (tofl == NULL || tbfl == NULL)
+   FATAL(exec, "%s: Cannot create tempfile templates\n",
+   exec);
if (mkstemp(tofl) == -1)
FATAL(exec, "%s: Impossible to create the tmp file\n",
exec);
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r333703 - head/sys/vm

2018-05-17 Thread Andrew Gallatin

On 05/17/18 14:07, Mark Johnston wrote:

On Thu, May 17, 2018 at 10:07:34AM -0700, Conrad Meyer wrote:

On Wed, May 16, 2018 at 9:27 PM, Mark Johnston  wrote:

Author: markj
Date: Thu May 17 04:27:08 2018
New Revision: 333703
URL: 
https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_333703=DwIBAg=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=6lhtci2MYxtyrK5Ub70QC0DcEiQ77Ry2LTAb6cDtW5A=z0SOGvNGORjI-SySfy-aovuyFzy_K5CtCfbNeWbRGLA=

Log:
   Fix a race in vm_page_pagequeue_lockptr().

   The value of m->queue must be cached after comparing it with PQ_NONE,
   since it may be concurrently changing.

   Reported by:  glebius


What were the symptoms of this issue?  The test plan in the linked
phabricator revision says:

"Gleb reported seeing panics as a result of the use of a bogus index
into the pagequeue array, and also reported that this patch fixed the
panics."

So an attempt to lock pagequeues[PQ_NONE=255].pq_mutex, which is
either something later in the vm_domain object, or bogus memory?  One
of the mtx asserts trips?


I think it was "mtx_lock() of spin mutex"; I didn't get a lot of
details.

I failed to note in the commit message that this race was introduced in
r332974.



The most common stack was:


panic: mtx_lock() of spin mutex (null) @ 
/data/ocafirmware.alt/FreeBSD/sys/vm/vm_page.c:3344

cpuid = 4
time = 1526415167
KDB: stack backtrace:
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 
0xfe158af62380

vpanic() at vpanic+0x1a3/frame 0xfe158af623e0
doadump() at doadump/frame 0xfe158af62460
__mtx_lock_flags() at __mtx_lock_flags+0x11a/frame 0xfe158af624a0
vm_page_dequeue() at vm_page_dequeue+0x8a/frame 0xfe158af624e0
vm_page_alloc_domain_after() at vm_page_alloc_domain_after+0x2cb/frame 
0xfe158af62560

vm_page_grab_pages() at vm_page_grab_pages+0x274/frame 0xfe158af62610
vn_sendfile() at vn_sendfile+0x83a/frame 0xfe158af628e0
[Tue May 15 20:12:48 2018]sys_sendfile() at sys_sendfile+0x119/frame 
0xfe158af62980

amd64_syscall() at amd64_syscall+0x298/frame 0xfe158af62ab0
fast_syscall_common() at fast_syscall_common+0x101/frame 0xfe158af62ab0



I once saw one like this:


Fatal trap 9: general protection fault while in kernel mode
cpuid = 0; apic id = 00
instruction pointer = 0x20:0x8088bf74
stack pointer   = 0x28:0xfe55af7712e0
frame pointer   = 0x28:0xfe55af771330
code segment= base 0x0, limit 0xf, type 0x1b
= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags= interrupt enabled, resume, IOPL = 0
current process = 12 (irq446: mlx5_core0)
[Mon May 14 04:45:10 2018]trap number   = 9
panic: general protection fault
cpuid = 0
time = 1526273109
KDB: stack backtrace:
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 
0xfe55af770ff0

vpanic() at vpanic+0x1a3/frame 0xfe55af771050
panic() at panic+0x43/frame 0xfe55af7710b0
trap_fatal() at trap_fatal+0x35f/frame 0xfe55af771100
trap() at trap+0x6d/frame 0xfe55af771210
[Mon May 14 04:45:10 2018]calltrap() at calltrap+0x8/frame 
0xfe55af771210
--- trap 0x9, rip = 0x8088bf74, rsp = 0xfe55af7712e0, rbp = 
0xfe55af771330 ---
vm_pqbatch_submit_page() at vm_pqbatch_submit_page+0x144/frame 
0xfe55af771330

sendfile_free_page() at sendfile_free_page+0x10e/frame 0xfe55af771360
sendfile_free_mext_pg() at sendfile_free_mext_pg+0xb7/frame 
0xfe55af7713b0

mb_free_ext() at mb_free_ext+0x103/frame 0xfe55af7713e0
m_freem() at m_freem+0x48/frame 0xfe55af771400
tcp_do_segment() at tcp_do_segment+0x1647/frame 0xfe55af771500
tcp_input_with_port() at tcp_input_with_port+0xfcc/frame 0xfe55af771650
tcp_input() at tcp_input+0xb/frame 0xfe55af771660
[Mon May 14 04:45:10 2018]ip_input() at ip_input+0xe9/frame 
0xfe55af7716c0

netisr_dispatch_src() at netisr_dispatch_src+0xa8/frame 0xfe55af771710
ether_demux() at ether_demux+0x140/frame 0xfe55af771740
ether_nh_input() at ether_nh_input+0x32c/frame 0xfe55af7717a0
netisr_dispatch_src() at netisr_dispatch_src+0xa8/frame 0xfe55af7717f0
ether_input() at ether_input+0x26/frame 0xfe55af771810
tcp_lro_flush_all() at tcp_lro_flush_all+0xf2/frame 0xfe55af771850
mlx5e_rx_cq_comp() at mlx5e_rx_cq_comp+0x5e5/frame 0xfe55af771950
mlx5_cq_completion() at mlx5_cq_completion+0x73/frame 0xfe55af771990
<...>

Thanks again for fixing it so quickly!

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r333655 - head/sys/sys

2018-05-15 Thread Andrew Gallatin
Author: gallatin
Date: Tue May 15 23:55:38 2018
New Revision: 333655
URL: https://svnweb.freebsd.org/changeset/base/333655

Log:
  Unhook DEBUG_BUFRING from INVARIANTS
  
  Some of the DEBUG_BUFRING checks are racy, and can lead to
  spurious assertions when run under high load.  Unhook these
  from INVARIANTS until the author can fix or remove them.
  
  Reviewed by:  mmacy
  Sponsored by: Netflix

Modified:
  head/sys/sys/buf_ring.h

Modified: head/sys/sys/buf_ring.h
==
--- head/sys/sys/buf_ring.h Tue May 15 23:46:49 2018(r333654)
+++ head/sys/sys/buf_ring.h Tue May 15 23:55:38 2018(r333655)
@@ -34,10 +34,6 @@
 
 #include 
 
-#if defined(INVARIANTS) && !defined(DEBUG_BUFRING)
-#define DEBUG_BUFRING 1
-#endif
-
 #ifdef DEBUG_BUFRING
 #include 
 #include 
@@ -69,6 +65,12 @@ buf_ring_enqueue(struct buf_ring *br, void *buf)
uint32_t prod_head, prod_next, cons_tail;
 #ifdef DEBUG_BUFRING
int i;
+
+   /*
+* Note: It is possible to encounter an mbuf that was removed
+* via drbr_peek(), and then re-added via drbr_putback() and
+* trigger a spurious panic.
+*/
for (i = br->br_cons_head; i != br->br_prod_head;
 i = ((i + 1) & br->br_cons_mask))
if(br->br_ring[i] == buf)
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r333470 - in head: share/mk sys/conf

2018-05-10 Thread Andrew Gallatin

On 05/10/18 20:11, Ed Maste wrote:

On 10 May 2018 at 20:00, Andrew Gallatin <galla...@cs.duke.edu> wrote:


Unfortunately, it looks like this method will get blown away by an
installworld:


Ah. You can set WITH_LLD_IS_LD in /etc/src.conf and installworld will
install ld as a symlink to ld.lld,



Super! That's the answer that I was looking for, and what should
get me back to building kernels like it's 1999 :)


Thanks,

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r333470 - in head: share/mk sys/conf

2018-05-10 Thread Andrew Gallatin

On 05/10/18 19:14, Ed Maste wrote:

On 10 May 2018 at 18:24, Andrew Gallatin <galla...@cs.duke.edu> wrote:

Rather than erroring out, can we please just use the appropriate linker?


That's my goal, but it's a bit of an involved change and will take
some time to make sure we don't introduce new corner cases. I'm sorry
that I didn't catch this before the first ifunc use went in -- lld has
been the default bootstrap linker (via buildworld or kernel-toolchain)
since mid-Jan and this problem slipped my mind. I added the error in
the meantime to avoid the silently broken kernel case that you
unfortunately encountered.

The low-friction method of getting past this in the interim is to just
use ld.lld as the system linker:
# ln -fs ld.lld /usr/bin/ld
I'm just waiting on an update to the lang/ghc port and another exp-run
before that becomes the default.



Thanks!

Unfortunately, it looks like this method will get blown away by an 
installworld:



<7:57pm>thing1/gallatin:src>ls -li /usr/bin/ld*
12038400 lrwxr-xr-x  1 root  wheel15 May 10 19:21 /usr/bin/ld@ 
-> /usr/bin/ld.lld

32386537 -r-xr-xr-x  1 root  wheel   1911384 May 10 09:13 /usr/bin/ld.bfd*
32387059 -r-xr-xr-x  1 root  wheel  40449288 May 10 09:13 /usr/bin/ld.lld*
32386878 -r-xr-xr-x  1 root  wheel 19352 May 10 09:13 /usr/bin/ldd*
32387816 -r-xr-xr-x  1 root  wheel 26872 May 10 09:14 /usr/bin/ldd32*
<7:57pm>thing1/gallatin:src>sudo make -j32 installworld >& log
<7:58pm>thing1/gallatin:src>!ls
ls -li /usr/bin/ld*
32347218 -r-xr-xr-x  2 root  wheel   1911384 May 10 19:58 /usr/bin/ld*
32347218 -r-xr-xr-x  2 root  wheel   1911384 May 10 19:58 /usr/bin/ld.bfd*
32348085 -r-xr-xr-x  1 root  wheel  40449288 May 10 19:58 /usr/bin/ld.lld*
32347538 -r-xr-xr-x  1 root  wheel 19352 May 10 19:58 /usr/bin/ldd*
32348365 -r-xr-xr-x  1 root  wheel 26872 May 10 19:58 /usr/bin/ldd32*



Would it make sense to just set LD=ld.lld in my and root's .cshrc?

Thanks,

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r333470 - in head: share/mk sys/conf

2018-05-10 Thread Andrew Gallatin

On 05/10/18 16:10, Ed Maste wrote:

Author: emaste
Date: Thu May 10 20:10:02 2018
New Revision: 333470
URL: 
https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_333470=DwIDaQ=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=rex4ilVMckTDXNGV-XhKnQ02pSuAJ0JPojwMYmZ6d9U=OfKJ8mXeldmYLNTK2NE1g9kYsBPeucarY_F6p-A3e0g=

Log:
   Error out on attempt to link amd64 kernel with old binutils linker
   



I lost the better part of a day due to the issue of the build using the 
wrong linker.   Rather than erroring out, can we please just use the 
appropriate linker?  My workflow is that of the typical dinosaur:


config -g GENERIC
cd ../compile/GENERIC
make cleandepend && make depend && make -j64

Thanks,

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r333462 - head/sys/netinet6

2018-05-10 Thread Andrew Gallatin
Author: gallatin
Date: Thu May 10 16:19:41 2018
New Revision: 333462
URL: https://svnweb.freebsd.org/changeset/base/333462

Log:
  Fix a panic in the IPv6 multicast code.
  
  Use LIST_FOREACH_SAFE in in6m_disconnect() since we're
  deleting and freeing item from the membership list
  while traversing the list.
  
  Reviewed by:  mmacy
  Sponsored by: Netflix

Modified:
  head/sys/netinet6/in6_mcast.c

Modified: head/sys/netinet6/in6_mcast.c
==
--- head/sys/netinet6/in6_mcast.c   Thu May 10 15:01:43 2018
(r333461)
+++ head/sys/netinet6/in6_mcast.c   Thu May 10 16:19:41 2018
(r333462)
@@ -581,7 +581,7 @@ in6m_disconnect(struct in6_multi *inm)
struct ifnet *ifp;
struct ifaddr *ifa;
struct in6_ifaddr *ifa6;
-   struct in6_multi_mship *imm;
+   struct in6_multi_mship *imm, *imm_tmp;
struct ifmultiaddr *ifma, *ll_ifma;
 
ifp = inm->in6m_ifp;
@@ -607,7 +607,8 @@ in6m_disconnect(struct in6_multi *inm)
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa6 = (void *)ifa;
-   LIST_FOREACH(imm, &ifa6->ia6_memberships, i6mm_chain) {
+   LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships,
+   i6mm_chain, imm_tmp) {
if (inm == imm->i6mm_maddr) {
LIST_REMOVE(imm, i6mm_chain);
free(imm, M_IP6MADDR);
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r333457 - head/sys/kern

2018-05-10 Thread Andrew Gallatin

On 05/10/18 07:36, Ed Maste wrote:

Author: emaste
Date: Thu May 10 11:36:16 2018
New Revision: 333457
URL: 
https://urldefense.proofpoint.com/v2/url?u=https-3A__svnweb.freebsd.org_changeset_base_333457=DwIDaQ=imBPVzF25OnBgGmVOlcsiEgHoG1i6YHLR0Sj_gZ4adc=Ed-falealxPeqc22ehgAUCLh8zlZbibZLSMWJeZro4A=Y7TkuLso5vnwZN5ypgs4eLKVEdMOSRgvhZZz1iAMdyU=-shrydFGkcYwmYlaG3W1nMyk2hg7rbKzCPfHI8_6GYM=

Log:
   ANSIfy sys_generic.c

Modified:
   head/sys/kern/sys_generic.c

Modified: head/sys/kern/sys_generic.c
==
--- head/sys/kern/sys_generic.c Thu May 10 09:37:54 2018(r333456)
+++ head/sys/kern/sys_generic.c Thu May 10 11:36:16 2018(r333457)


<..>


@@ -532,11 +519,7 @@ sys_pwritev(struct thread *td, struct pwritev_args *ua
  }
  
  int

-kern_pwritev(td, fd, auio, offset)
-   struct thread *td;
-   struct uio *auio;
-   int fd;
-   off_t offset;
+kern_pwritev(struct thread *td, struct uio *auio, int fd, off_t offset)
  {
struct file *fp;
int error;



This breaks the kernel build:

/usr/src/sys/kern/sys_generic.c:522:1: error: conflicting types for 
'kern_pwritev'

kern_pwritev(struct thread *td, struct uio *auio, int fd, off_t offset)
^
/usr/src/sys/sys/syscallsubr.h:212:5: note: previous declaration is here
int kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t 
offset);

^
1 error generated.
*** [sys_generic.o] Error code 1


I think the problem was that the non-ansi args were enumerated in a 
different order than their type declarations.


Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r333459 - head/sys/kern

2018-05-10 Thread Andrew Gallatin
Author: gallatin
Date: Thu May 10 13:19:42 2018
New Revision: 333459
URL: https://svnweb.freebsd.org/changeset/base/333459

Log:
  Fix the build after r333457
  
  In r333457, the arguments to kern_pwritev() were accidentally
  re-ordered as part of ANSIfication, breaking the build.

Modified:
  head/sys/kern/sys_generic.c

Modified: head/sys/kern/sys_generic.c
==
--- head/sys/kern/sys_generic.c Thu May 10 12:25:01 2018(r333458)
+++ head/sys/kern/sys_generic.c Thu May 10 13:19:42 2018(r333459)
@@ -519,7 +519,7 @@ sys_pwritev(struct thread *td, struct pwritev_args *ua
 }
 
 int
-kern_pwritev(struct thread *td, struct uio *auio, int fd, off_t offset)
+kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
 {
struct file *fp;
int error;
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r333329 - head/sys/net

2018-05-07 Thread Andrew Gallatin
Author: gallatin
Date: Mon May  7 18:11:22 2018
New Revision: 333329
URL: https://svnweb.freebsd.org/changeset/base/333329

Log:
  Fix an off-by-one error when deciding to request a tx interrupt
  
  The canonical check for whether or not a ring is drainable is
  TXQ_AVAIL() > MAX_TX_DESC() + 2.  Use this same construct here,
  in order to avoid a potential off-by-one error where we might otherwise
  fail to request an interrupt.
  
  Reviewed by:  mmacy
  Sponsored by: Netflix

Modified:
  head/sys/net/iflib.c

Modified: head/sys/net/iflib.c
==
--- head/sys/net/iflib.c Mon May  7 17:37:07 2018(r333328)
+++ head/sys/net/iflib.c Mon May  7 18:11:22 2018(r333329)
@@ -3299,7 +3299,7 @@ defrag:
 */
txq->ift_rs_pending += nsegs + 1;
if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
-iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs - 1) <= 
MAX_TX_DESC(ctx)) {
+iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) 
+ 2) {
pi.ipi_flags |= IPI_TX_INTR;
txq->ift_rs_pending = 0;
}
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r333325 - head/sys/kern

2018-05-07 Thread Andrew Gallatin
Author: gallatin
Date: Mon May  7 15:24:03 2018
New Revision: 333325
URL: https://svnweb.freebsd.org/changeset/base/333325

Log:
  Boost thread priority while changing CPU frequency
  
  Boost the priority of user-space threads when they set
  their affinity to a core to adjust its frequency.   This avoids a situation
  where a CPU bound kernel thread with the same affinity is running on a
  down-clocked core, and will "block" powerd from up-clocking the core
  until the kernel thread yields.   This can lead to poor performance,
  and to things potentially getting stuck on Giant.
  
  Reviewed by:  kib (imp reviewed earlier version)
  Sponsored by: Netflix
  Differential Revision:https://reviews.freebsd.org/D15246

Modified:
  head/sys/kern/kern_cpu.c

Modified: head/sys/kern/kern_cpu.c
==
--- head/sys/kern/kern_cpu.c Mon May  7 15:07:28 2018(r333324)
+++ head/sys/kern/kern_cpu.c Mon May  7 15:24:03 2018(r333325)
@@ -245,6 +245,7 @@ cf_set_method(device_t dev, const struct cf_level *lev
struct cf_saved_freq *saved_freq, *curr_freq;
struct pcpu *pc;
int error, i;
+   u_char pri;
 
sc = device_get_softc(dev);
error = 0;
@@ -333,6 +334,8 @@ cf_set_method(device_t dev, const struct cf_level *lev
/* Bind to the target CPU before switching. */
pc = cpu_get_pcpu(set->dev);
thread_lock(curthread);
+   pri = curthread->td_priority;
+   sched_prio(curthread, PRI_MIN);
sched_bind(curthread, pc->pc_cpuid);
thread_unlock(curthread);
CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
@@ -340,6 +343,7 @@ cf_set_method(device_t dev, const struct cf_level *lev
error = CPUFREQ_DRV_SET(set->dev, set);
thread_lock(curthread);
sched_unbind(curthread);
+   sched_prio(curthread, pri);
thread_unlock(curthread);
if (error) {
goto out;
@@ -357,6 +361,8 @@ cf_set_method(device_t dev, const struct cf_level *lev
/* Bind to the target CPU before switching. */
pc = cpu_get_pcpu(set->dev);
thread_lock(curthread);
+   pri = curthread->td_priority;
+   sched_prio(curthread, PRI_MIN);
sched_bind(curthread, pc->pc_cpuid);
thread_unlock(curthread);
CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq,
@@ -364,6 +370,7 @@ cf_set_method(device_t dev, const struct cf_level *lev
error = CPUFREQ_DRV_SET(set->dev, set);
thread_lock(curthread);
sched_unbind(curthread);
+   sched_prio(curthread, pri);
thread_unlock(curthread);
if (error) {
/* XXX Back out any successful setting? */
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r333141 - head/sys/dev/cxgbe

2018-05-01 Thread Andrew Gallatin
Author: gallatin
Date: Tue May  1 15:33:21 2018
New Revision: 333141
URL: https://svnweb.freebsd.org/changeset/base/333141

Log:
  Optionally panic when cxgbe encounters a fatal error
  
  Sometimes it is better to panic than to leave a machine
  unreachable.
  
  Reviewed by:  np
  Sponsored by: Netflix

Modified:
  head/sys/dev/cxgbe/t4_main.c

Modified: head/sys/dev/cxgbe/t4_main.c
==
--- head/sys/dev/cxgbe/t4_main.cTue May  1 15:17:46 2018
(r333140)
+++ head/sys/dev/cxgbe/t4_main.cTue May  1 15:33:21 2018
(r333141)
@@ -469,6 +469,8 @@ TUNABLE_INT("hw.cxgbe.num_vis", _num_vis);
 static int pcie_relaxed_ordering = -1;
 TUNABLE_INT("hw.cxgbe.pcie_relaxed_ordering", &pcie_relaxed_ordering);
 
+static int t4_panic_on_fatal_err = 0;
+TUNABLE_INT("hw.cxgbe.panic_on_fatal_err", &t4_panic_on_fatal_err);
 
 #ifdef TCP_OFFLOAD
 /*
@@ -2222,6 +2224,8 @@ t4_fatal_err(struct adapter *sc)
t4_intr_disable(sc);
log(LOG_EMERG, "%s: encountered fatal error, adapter stopped.\n",
device_get_nameunit(sc->dev));
+   if (t4_panic_on_fatal_err)
+   panic("panic requested on fatal error");
 }
 
 void
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r333131 - head/sys/net

2018-04-30 Thread Andrew Gallatin
Author: gallatin
Date: Mon Apr 30 23:53:27 2018
New Revision: 333131
URL: https://svnweb.freebsd.org/changeset/base/333131

Log:
  Fix iflib_encap() EFBIG handling bugs
  
  1) Don't give up if m_collapse() fails.  Rather than giving up, try
  m_defrag() immediately.
  
  2) Fix a leak where, if the NIC driver rejected the defrag'ed chain
  as having too many segments, we would fail to free the chain.
  
  Reviewed by:  Matthew Macy  (this version of patch)
  Submitted by: Matthew Macy  (early version of leak fix)

Modified:
  head/sys/net/iflib.c

Modified: head/sys/net/iflib.c
==
--- head/sys/net/iflib.cMon Apr 30 23:05:57 2018(r333130)
+++ head/sys/net/iflib.cMon Apr 30 23:53:27 2018(r333131)
@@ -3244,8 +3244,12 @@ defrag:
switch (err) {
case EFBIG:
/* try collapse once and defrag once */
-   if (remap == 0)
+   if (remap == 0) {
m_head = m_collapse(*m_headp, M_NOWAIT, 
max_segs);
+   /* try defrag if collapsing fails */
+   if (m_head == NULL)
+   remap++;
+   }
if (remap == 1)
m_head = m_defrag(*m_headp, M_NOWAIT);
remap++;
@@ -3333,13 +3337,18 @@ defrag:
 */
txq->ift_pidx = pi.ipi_new_pidx;
txq->ift_npending += pi.ipi_ndescs;
-   } else if (__predict_false(err == EFBIG && remap < 2)) {
+   } else {
*m_headp = m_head = iflib_remove_mbuf(txq);
-   remap = 1;
-   txq->ift_txd_encap_efbig++;
-   goto defrag;
-   } else
+   if (err == EFBIG) {
+   txq->ift_txd_encap_efbig++;
+   if (remap < 2) {
+   remap = 1;
+   goto defrag;
+   }
+   }
DBG_COUNTER_INC(encap_txd_encap_fail);
+   goto defrag_failed;
+   }
return (err);
 
 defrag_failed:
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r332860 - head/sys/kern

2018-04-24 Thread Andrew Gallatin

On 04/24/18 13:24, Jonathan T. Looney wrote:
On Mon, Apr 23, 2018 at 6:04 PM, John Baldwin > wrote:

 >
 > I think this is actually a key question.  In my experience to date I 
have not

 > encountered a large number of post-panic assertion failures.  Given that
 > we already break all locks and disable assertions for locks I'd be 
curious
 > which assertions are actually failing.  My inclination given my 
experiences

 > to date would be to explicitly ignore those as we do for locking if it is
 > constrained set rather than blacklisting all of them.  However, I 
would be

 > most interested in seeing some examples of assertions that are failing.

The latest example (the one that prompted me to finally commit this) is 
in lockmgr_sunlock_try(): 'panic: Assertion (*xp & 
~LK_EXCLUSIVE_SPINNERS) == LK_SHARERS_LOCK(1) failed at 
/usr/src/sys/kern/kern_lock.c:541'


I don't see any obvious recent changes that would have caused this, so 
this is probably a case where a change to another file suddenly made us 
trip over this assert.


FWIW, that assertion has prevented me from getting a dump from an
INVARIANTS kernel for at least a year.

Drew
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r332653 - head/sys/dev/ixgbe

2018-04-17 Thread Andrew Gallatin
Author: gallatin
Date: Tue Apr 17 16:51:27 2018
New Revision: 332653
URL: https://svnweb.freebsd.org/changeset/base/332653

Log:
  Restore SIOCGI2C functionality to ixgbe
  
  When ixgbe was converted to iflib, it lost the SIOCGI2C support
  that allows ifconfig to print SFP state, optical light levels, etc.
  Restore this by plugging in to the ifdi_i2c_req iflib method.  Note
  that the sanity checking on dev_addr that used to be done in ixgbe is
  now done in iflib.
  
  Reviewed by:  erj, Matthew Macy 
  Sponsored by: Netflix

Modified:
  head/sys/dev/ixgbe/if_ix.c

Modified: head/sys/dev/ixgbe/if_ix.c
==
--- head/sys/dev/ixgbe/if_ix.c  Tue Apr 17 16:46:08 2018(r332652)
+++ head/sys/dev/ixgbe/if_ix.c  Tue Apr 17 16:51:27 2018(r332653)
@@ -137,7 +137,7 @@ static void ixgbe_if_timer(if_ctx_t ctx, uint16_t);
 static void ixgbe_if_update_admin_status(if_ctx_t ctx);
 static void ixgbe_if_vlan_register(if_ctx_t ctx, u16 vtag);
 static void ixgbe_if_vlan_unregister(if_ctx_t ctx, u16 vtag);
-
+static int  ixgbe_if_i2c_req(if_ctx_t ctx, struct ifi2creq *req);
 int ixgbe_intr(void *arg);
 
 /
@@ -270,6 +270,7 @@ static device_method_t ixgbe_if_methods[] = {
DEVMETHOD(ifdi_vlan_register, ixgbe_if_vlan_register),
DEVMETHOD(ifdi_vlan_unregister, ixgbe_if_vlan_unregister),
DEVMETHOD(ifdi_get_counter, ixgbe_if_get_counter),
+   DEVMETHOD(ifdi_i2c_req, ixgbe_if_i2c_req),
 #ifdef PCI_IOV
DEVMETHOD(ifdi_iov_init, ixgbe_if_iov_init),
DEVMETHOD(ifdi_iov_uninit, ixgbe_if_iov_uninit),
@@ -1232,6 +1233,25 @@ ixgbe_if_get_counter(if_ctx_t ctx, ift_counter cnt)
 } /* ixgbe_if_get_counter */
 
 /
+ * ixgbe_if_i2c_req
+ /
+static int
+ixgbe_if_i2c_req(if_ctx_t ctx, struct ifi2creq *req)
+{
+   struct adapter  *adapter = iflib_get_softc(ctx);
+   struct ixgbe_hw *hw = &adapter->hw;
+   int i;
+
+
+   if (hw->phy.ops.read_i2c_byte == NULL)
+   return (ENXIO);
+   for (i = 0; i < req->len; i++)
+   hw->phy.ops.read_i2c_byte(hw, req->offset + i,
+   req->dev_addr, &req->data[i]);
+   return (0);
+} /* ixgbe_if_i2c_req */
+
+/
  * ixgbe_add_media_types
  /
 static void
@@ -4547,4 +4567,3 @@ ixgbe_check_fan_failure(struct adapter *adapter, u32 r
if (reg & mask)
device_printf(adapter->dev, "\nCRITICAL: FAN FAILURE!! REPLACE 
IMMEDIATELY!!\n");
 } /* ixgbe_check_fan_failure */
-
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


  1   2   >