umb: delete old v4 address before adding the new one

2023-10-11 Thread Jonathan Matthew
Currently when a umb device gets a new IPv4 address, it just adds it to
the interface and tries to set the default route through it.  If there's
a different previous address, it stays there, and any routes using it also
remain in place, so umb can't set the new default route.  You see something
like this:

  umb0: unable to set IPv4 default route, error 17

and then you no longer have a working default route.

The diff below removes the old address before setting the new one.
SIOCDIFADDR with no details in the request will delete whatever v4 address
exists on the interface, and consequently remove any routes using it, so umb
will then be able to add the new address and set the new default route.

With this diff, I've had an ssh session inside a wg tunnel survive across 5
address changes on the umb interface that carries the wireguard traffic.
Without it, wireguard stops working after the first address change and my ssh
session goes with it.

ok?

Index: if_umb.c
===================================================================
RCS file: /cvs/src/sys/dev/usb/if_umb.c,v
retrieving revision 1.55
diff -u -p -r1.55 if_umb.c
--- if_umb.c	1 Sep 2023 20:24:29 -0000	1.55
+++ if_umb.c	11 Oct 2023 23:18:50 -0000
@@ -1815,6 +1815,14 @@ umb_add_inet_config(struct umb_softc *sc
int  rv;
 
memset(&ifra, 0, sizeof (ifra));
+   rv = in_ioctl(SIOCDIFADDR, (caddr_t)&ifra, ifp, 1);
+   if (rv != 0 && rv != EADDRNOTAVAIL) {
+   printf("%s: unable to delete IPv4 address, error %d\n",
+   DEVNAM(ifp->if_softc), rv);
+   return rv;
+   }
+
+   memset(&ifra, 0, sizeof (ifra));
sin = &ifra.ifra_addr;
sin->sin_family = AF_INET;
sin->sin_len = sizeof (*sin);



ypldap: stop flattening trees

2023-09-20 Thread Jonathan Matthew
ypldap currently packs all the user and group lines into contiguous blocks
of memory so it can move from one entry to the next by pointer arithmetic.
This doesn't make much sense because the entries are also in red black trees
(that's how it looks up the entry in the first place) and RB_NEXT() is not
slow.
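
For example, finding the entry after a given key becomes a tree lookup plus
RB_NEXT(), along these lines (a sketch using the types from ypldap.h, not
the exact code in the diff):

	struct userent key, *ue;

	key.ue_line = lastkey;	/* entries are keyed on the name field */
	ue = RB_FIND(user_name_tree, env->sc_user_names, &key);
	if (ue != NULL)
		ue = RB_NEXT(user_name_tree, env->sc_user_names, ue);
	/* ue is now the following entry, or NULL at the end */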

The one piece of the tree flattening code that seems worth keeping is
strdup()ing the netid lines so they don't take 1kB per user.

ok?

Index: entries.c
===================================================================
RCS file: /cvs/src/usr.sbin/ypldap/entries.c,v
retrieving revision 1.6
diff -u -p -u -p -r1.6 entries.c
--- entries.c	18 Jul 2023 13:06:33 -0000	1.6
+++ entries.c	20 Sep 2023 07:17:00 -0000
@@ -34,86 +34,6 @@
 #include 
 
 #include "ypldap.h"
-#include "log.h"
-
-void
-flatten_entries(struct env *env)
-{
-   size_t   len;
-   char*linep;
-   char*endp;
-   char*tmp;
-   struct userent  *ue;
-   struct groupent *ge;
-
-   log_debug("flattening trees");
-   /*
-* This takes all the line pointers in RB elements and
-* concatenates them in a single string, to be able to
-* implement next element lookup without tree traversal.
-*
-* An extra octet is alloced to make space for an additional NUL.
-*/
-   if ((linep = calloc(1, env->sc_user_line_len + 1)) == NULL) {
-   /*
-* XXX: try allocating a smaller chunk of memory
-*/
-   fatal("out of memory");
-   }
-   endp = linep;
-
-   RB_FOREACH(ue, user_name_tree, env->sc_user_names) {
-   /*
-* we convert the first nul back to a column,
-* copy the string and then convert it back to a nul.
-*/
-   ue->ue_line[strlen(ue->ue_line)] = ':';
-   log_debug("pushing line: %s", ue->ue_line);
-   len = strlen(ue->ue_line) + 1;
-   memcpy(endp, ue->ue_line, len);
-   endp[strcspn(endp, ":")] = '\0';
-   free(ue->ue_line);
-   ue->ue_line = endp;
-   endp += len;
-
-   /*
-* To save memory strdup(3) the netid_line which originally used
-* LINE_WIDTH bytes
-*/
-   tmp = ue->ue_netid_line;
-   ue->ue_netid_line = strdup(tmp);
-   if (ue->ue_netid_line == NULL) {
-   fatal("out of memory");
-   }
-   free(tmp);
-   }
-   env->sc_user_lines = linep;
-   log_debug("done pushing users");
-
-   if ((linep = calloc(1, env->sc_group_line_len + 1)) == NULL) {
-   /*
-* XXX: try allocating a smaller chunk of memory
-*/
-   fatal("out of memory");
-   }
-   endp = linep;
-   RB_FOREACH(ge, group_name_tree, env->sc_group_names) {
-   /*
-* we convert the first nul back to a column,
-* copy the string and then convert it back to a nul.
-*/
-   ge->ge_line[strlen(ge->ge_line)] = ':';
-   log_debug("pushing line: %s", ge->ge_line);
-   len = strlen(ge->ge_line) + 1;
-   memcpy(endp, ge->ge_line, len);
-   endp[strcspn(endp, ":")] = '\0';
-   free(ge->ge_line);
-   ge->ge_line = endp;
-   endp += len;
-   }
-   env->sc_group_lines = linep;
-   log_debug("done pushing groups");
-}
 
 int
 userent_name_cmp(struct userent *ue1, struct userent *ue2)
Index: yp.c
===================================================================
RCS file: /cvs/src/usr.sbin/ypldap/yp.c,v
retrieving revision 1.22
diff -u -p -u -p -r1.22 yp.c
--- yp.c	18 Jul 2023 13:06:33 -0000	1.22
+++ yp.c	20 Sep 2023 07:17:00 -0000
@@ -557,21 +557,25 @@ ypresp_key_val *
 ypproc_first_2_svc(ypreq_nokey *arg, struct svc_req *req)
 {
static struct ypresp_key_valres;
+   struct userent  *ue;
+   struct groupent *ge;
 
if (yp_valid_domain(arg->domain, (struct ypresp_val *)&res) == -1)
return (&res);
 
if (strcmp(arg->map, "passwd.byname") == 0 ||
strcmp(arg->map, "master.passwd.byname") == 0) {
-   if (env->sc_user_lines == NULL)
+   ue = RB_MIN(user_name_tree, env->sc_user_names);
+   if (ue == NULL)
return (NULL);
 
-   yp_make_keyval(&res, env->sc_user_lines, env->sc_user_lines);
+   yp_make_keyval(&res, ue->ue_line, ue->ue_line);
} else if (strcmp(arg->map, "group.byname") == 0) {
-   if (env->sc_group_lines == NULL)
+   ge = RB_MIN(group_name_tree, env->sc_group_names);
+   if (ge == NULL)
return (NULL);
 
-  

Re: Mellanox driver : add 100G_LR4 capability

2023-09-17 Thread Jonathan Matthew
On Fri, Sep 15, 2023 at 09:48:16AM +0200, Olivier Croquin wrote:
> Hi,
> 
> The media capability 100GBase_LR4 is not listed in the mcx driver.
> 
> Could you please take a look at this short patch?  I found the value of 23
> in the Linux mlx driver.

Thanks, I've committed it.

> Is this enough to say that QSFP28 100GBase_LR are supported with the mcx
> driver ?

As much as anything else, sure.



Re: JH7110 PCIe device tree binding update

2023-08-31 Thread Jonathan Matthew
On Wed, Aug 30, 2023 at 01:19:42PM +0800, Kevin Lo wrote:
> On Tue, Aug 29, 2023 at 09:15:41PM +0200, Mark Kettenis wrote:
> > 
> > > Date: Tue, 29 Aug 2023 11:58:23 +0200
> > > From: Mark Kettenis 
> > > 
> > > Upstreaming of the JH7110 PCIe device tree bindings isn't finished
> > > yet, but it seems some progress has been made and things have been
> > > reviewed by some of the key people involved:
> > > 
> > >   https://patchwork.kernel.org/project/linux-pci/list/?series=779297
> > > 
> > > Here is a diff that adjusts the driver to the current state of things
> > > such that we can use the latest device tree from:
> > > 
> > >   https://github.com/starfive-tech/linux/tree/JH7110_VisionFive2_upstream
> > > 
> > > to continue development.  The idea is to support the preliminary
> > > bindings a little bit longer such that folks can update their device
> > > trees.  Will probably drop support for the preliminary bindings in a
> > > few weeks.
> > > 
> > > ok?
> > 
> > patrick@ pointed out that the dv_unit check won't work properly if the
> > first PCIe controller is disabled.  So here is a diff that checks the
> > device address instead like we do for dwqe(4).
> > 
> > ok?
> 
> ok kevlo@
> 
> Tested on my VisionFive 2 v1.3b with the device tree from:
> 
> https://raw.githubusercontent.com/starfive-tech/linux/JH7110_VisionFive2_upstream/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2-v1.3b.dts
> 
> It works fine, the NVMe is detected.
> 
> BTW, I noticed that the memory statistics seem to be incorrect.
> The VisionFive 2 is equipped with 8GB RAM.
> 
> OpenBSD 7.3-current (GENERIC.MP) #0: Wed Aug 30 11:52:03 CST 2023
> kevlo@vf2:/usr/src/sys/arch/riscv64/compile/GENERIC.MP
> real mem  = 4294967296 (4096MB)
> ^^^
> avail mem = 8110370816 (7734MB)
> ^^^

riscv64 calculates physmem (the 'real mem' number) by adding up the
ranges in the /memory device tree node, but uses the EFI memory map to
set up UVM, which is where 'avail mem' comes from.

Should riscv64 be more like arm64 here and calculate physmem by adding
up the memreg segments, which are built from the EFI memory map if available,
and the /memory node if not?
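
For the record, that would look something like this (a sketch; names
borrowed from arm64's machdep.c from memory, so treat them as illustrative):

	/* sum the memreg segments instead of the /memory node */
	physmem = 0;
	for (i = 0; i < nmemreg; i++)
		physmem += atop(memreg[i].size);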



Re: JH7110 PCIe device tree binding update

2023-08-30 Thread Jonathan Matthew
On Wed, Aug 30, 2023 at 01:19:42PM +0800, Kevin Lo wrote:
> On Tue, Aug 29, 2023 at 09:15:41PM +0200, Mark Kettenis wrote:
> > 
> > > Date: Tue, 29 Aug 2023 11:58:23 +0200
> > > From: Mark Kettenis 
> > > 
> > > Upstreaming of the JH7110 PCIe device tree bindings isn't finished
> > > yet, but it seems some progress has been made and things have been
> > > reviewed by some of the key people involved:
> > > 
> > >   https://patchwork.kernel.org/project/linux-pci/list/?series=779297
> > > 
> > > Here is a diff that adjusts the driver to the current state of things
> > > such that we can use the latest device tree from:
> > > 
> > >   https://github.com/starfive-tech/linux/tree/JH7110_VisionFive2_upstream
> > > 
> > > to continue development.  The idea is to support the preliminary
> > > bindings a little bit longer such that folks can update their device
> > > trees.  Will probably drop support for the preliminary bindings in a
> > > few weeks.
> > > 
> > > ok?
> > 
> > patrick@ pointed out that the dv_unit check won't work properly if the
> > first PCIe controller is disabled.  So here is a diff that checks the
> > device address instead like we do for dwqe(4).
> > 
> > ok?
> 
> ok kevlo@
> 
> Tested on my VisionFive 2 v1.3b with the device tree from:
> 
> https://raw.githubusercontent.com/starfive-tech/linux/JH7110_VisionFive2_upstream/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2-v1.3b.dts
> 
> It works fine, the NVMe is detected.

Also works on VisionFive 2 v1.3b with the older device tree here, ok jmatthew@



Re: all platforms: separate cpu_initclocks() from cpu_startclock()

2023-08-19 Thread Jonathan Matthew
On Sat, Aug 19, 2023 at 01:44:47PM -0500, Scott Cheloha wrote:
> On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote:
> > This is the next patch in the clock interrupt reorganization series.
> > 
> > Before we continue breaking up the hardclock(9) we need to detour into
> > the MD code.
> > 
> > This patch divides the "initialization" parts of cpu_initclocks() from
> > the "start the clock interrupt" parts.  Seprating the two parts leaves
> > initclocks() an opportunity to prepare the primary CPU for clock
> > interrupt dispatch in a machine-independent manner before actually
> > pulling the trigger.  It's nearly impossible to do any MI setup during
> > initclocks() because cpu_initclocks() does everything in one go: both
> > initialization and kickoff are done when cpu_initclocks() returns.
> > 
> > Many platforms have a "cpu_startclock()" function, so this patch takes
> > that de facto standard and makes it a rule: cpu_startclock() is now
> > required.  It is prototyped in sys/systm.h and every platform must
> > implement it.
> > 
> > The revised initclocks() sequence is then:
> > 
> > 1. Call cpu_initclocks().  At minimum, cpu_initclocks() ensures
> >hz, stathz, and profhz are initialized.  All the machine
> >independent setup in step (2) (currently) depends upon
> >these machine-dependent values.
> > 
> > 2. Compute intervals using hz, stathz, and profhz.
> > 
> >In a later step I will move the full contents of clockintr_init()
> >up into initclocks() and get rid of clockintr_init() entirely.
> > 
> > 3. Call cpu_startclock().  At minimum, cpu_startclock() starts the
> >clock interrupt dispatch cycle on the primary CPU.
> > 
> > I have compiled/booted this patch on amd64 (lapic path), arm64, i386
> > (lapic path), macppc, octeon, and sparc64 (sun4v).
> > 
> > I am looking for compile/boot tests on alpha, armv7, hppa, landisk,
> > luna88k, powerpc64, and riscv64.  I think armv7 is the tricky one
> > here.  Everything else is relatively straightforward, though I may
> > have missed a few stray variables here or there.
> > 
> > Test results?  Ok?
> 
> Here is an updated patch that removes several MD prototypes for
> cpu_startclock() that I missed the first time through.
> 
> I went back and tested these again:
> 
> - amd64 (lapic)
> - arm64
> - i386 (lapic)
> - powerpc/macppc
> - mips64/octeon (loongson should be fine)
> - sparc64 (sys_tick; tick/stick should be fine)
> 
> arm/armv7 and riscv64 were tested under the previous version, but I
> would appreciate a second compile-test to make sure the header changes
> in the updated patch did not break the build (CC phessler@, jsg@).

Still builds on riscv64 and armv7.
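
For anyone skimming, the MI-side shape this ends up with is roughly the
following (a sketch, not the literal patch):

	void
	initclocks(void)
	{
		/* 1. MD init: ensures hz, stathz and profhz are set */
		cpu_initclocks();

		/*
		 * 2. MI setup: compute intervals from hz, stathz, profhz
		 * (the clockintr_init() contents eventually move up here).
		 */

		/* 3. MD kickoff: start dispatch on the primary CPU */
		cpu_startclock();
	}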



Re: all platforms: separate cpu_initclocks() from cpu_startclock()

2023-08-14 Thread Jonathan Matthew
On Mon, Aug 14, 2023 at 06:24:14PM +1000, Jonathan Matthew wrote:
> On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote:
> > This is the next patch in the clock interrupt reorganization series.
> > 
> > Before we continue breaking up the hardclock(9) we need to detour into
> > the MD code.
> > 
> > This patch divides the "initialization" parts of cpu_initclocks() from
> > the "start the clock interrupt" parts.  Seprating the two parts leaves
> > initclocks() an opportunity to prepare the primary CPU for clock
> > interrupt dispatch in a machine-independent manner before actually
> > pulling the trigger.  It's nearly impossible to do any MI setup during
> > initclocks() because cpu_initclocks() does everything in one go: both
> > initialization and kickoff are done when cpu_initclocks() returns.
> > 
> > Many platforms have a "cpu_startclock()" function, so this patch takes
> > that de facto standard and makes it a rule: cpu_startclock() is now
> > required.  It is prototyped in sys/systm.h and every platform must
> > implement it.
> > 
> > The revised initclocks() sequence is then:
> > 
> > 1. Call cpu_initclocks().  At minimum, cpu_initclocks() ensures
> >hz, stathz, and profhz are initialized.  All the machine
> >independent setup in step (2) (currently) depends upon
> >these machine-dependent values.
> > 
> > 2. Compute intervals using hz, stathz, and profhz.
> > 
> >In a later step I will move the full contents of clockintr_init()
> >up into initclocks() and get rid of clockintr_init() entirely.
> > 
> > 3. Call cpu_startclock().  At minimum, cpu_startclock() starts the
> >clock interrupt dispatch cycle on the primary CPU.
> > 
> > I have compiled/booted this patch on amd64 (lapic path), arm64, i386
> > (lapic path), macppc, octeon, and sparc64 (sun4v).
> > 
> > I am looking for compile/boot tests on alpha, armv7, hppa, landisk,
> > luna88k, powerpc64, and riscv64.  I think armv7 is the tricky one
> > here.  Everything else is relatively straightforward, though I may
> > have missed a few stray variables here or there.
> > 
> > Test results?  Ok?
> 
> Compiles on armv7 and boots on an Allwinner A20 machine using agtimer(4).
> I don't think I have any armv7 systems using other timer devices.
> 

Also compiles and boots on riscv64 (visionfive 2).



Re: all platforms: separate cpu_initclocks() from cpu_startclock()

2023-08-14 Thread Jonathan Matthew
On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote:
> This is the next patch in the clock interrupt reorganization series.
> 
> Before we continue breaking up the hardclock(9) we need to detour into
> the MD code.
> 
> This patch divides the "initialization" parts of cpu_initclocks() from
> the "start the clock interrupt" parts.  Seprating the two parts leaves
> initclocks() an opportunity to prepare the primary CPU for clock
> interrupt dispatch in a machine-independent manner before actually
> pulling the trigger.  It's nearly impossible to do any MI setup during
> initclocks() because cpu_initclocks() does everything in one go: both
> initialization and kickoff are done when cpu_initclocks() returns.
> 
> Many platforms have a "cpu_startclock()" function, so this patch takes
> that de facto standard and makes it a rule: cpu_startclock() is now
> required.  It is prototyped in sys/systm.h and every platform must
> implement it.
> 
> The revised initclocks() sequence is then:
> 
> 1. Call cpu_initclocks().  At minimum, cpu_initclocks() ensures
>hz, stathz, and profhz are initialized.  All the machine
>independent setup in step (2) (currently) depends upon
>these machine-dependent values.
> 
> 2. Compute intervals using hz, stathz, and profhz.
> 
>In a later step I will move the full contents of clockintr_init()
>up into initclocks() and get rid of clockintr_init() entirely.
> 
> 3. Call cpu_startclock().  At minimum, cpu_startclock() starts the
>clock interrupt dispatch cycle on the primary CPU.
> 
> I have compiled/booted this patch on amd64 (lapic path), arm64, i386
> (lapic path), macppc, octeon, and sparc64 (sun4v).
> 
> I am looking for compile/boot tests on alpha, armv7, hppa, landisk,
> luna88k, powerpc64, and riscv64.  I think armv7 is the tricky one
> here.  Everything else is relatively straightforward, though I may
> have missed a few stray variables here or there.
> 
> Test results?  Ok?

Compiles on armv7 and boots on an Allwinner A20 machine using agtimer(4).
I don't think I have any armv7 systems using other timer devices.



ix(4) shouldn't crash on memory allocation failure

2023-07-07 Thread Jonathan Matthew
One of the problems described here:
https://www.mail-archive.com/tech@openbsd.org/msg71790.html
amounts to ix(4) not checking that it allocated a dma map before trying to
free it.

ok?


Index: if_ix.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.197
diff -u -p -r1.197 if_ix.c
--- if_ix.c	1 Jun 2023 09:05:33 -0000	1.197
+++ if_ix.c	7 Jul 2023 09:22:30 -0000
@@ -3094,8 +3094,11 @@ ixgbe_free_receive_buffers(struct rx_rin
m_freem(rxbuf->buf);
rxbuf->buf = NULL;
}
-   bus_dmamap_destroy(rxr->rxdma.dma_tag, rxbuf->map);
-   rxbuf->map = NULL;
+   if (rxbuf->map != NULL) {
+   bus_dmamap_destroy(rxr->rxdma.dma_tag,
+   rxbuf->map);
+   rxbuf->map = NULL;
+   }
}
free(rxr->rx_buffers, M_DEVBUF,
sc->num_rx_desc * sizeof(struct ixgbe_rx_buf));



use if_register in dwge(4)

2023-07-05 Thread Jonathan Matthew
Like dwqe(4), dwge(4) should also register its instances for lookup by ofw node
or phandle.

ok?


Index: if_dwge.c
===================================================================
RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v
retrieving revision 1.17
diff -u -p -r1.17 if_dwge.c
--- if_dwge.c	5 Jul 2023 18:48:49 -0000	1.17
+++ if_dwge.c	5 Jul 2023 19:04:18 -0000
@@ -267,6 +267,8 @@ struct dwge_softc {
bus_dma_tag_t   sc_dmat;
void*sc_ih;
 
+   struct if_devicesc_ifd;
+
struct arpcom   sc_ac;
 #define sc_lladdr  sc_ac.ac_enaddr
struct mii_data sc_mii;
@@ -634,6 +636,10 @@ dwge_attach(struct device *parent, struc
dwge_intr, sc, sc->sc_dev.dv_xname);
if (sc->sc_ih == NULL)
printf("%s: can't establish interrupt\n", sc->sc_dev.dv_xname);
+
+   sc->sc_ifd.if_node = faa->fa_node;
+   sc->sc_ifd.if_ifp = ifp;
+   if_register(&sc->sc_ifd);
 }
 
 void



Re: dwge(4) fixed-link support

2023-07-05 Thread Jonathan Matthew
On Wed, Jul 05, 2023 at 01:13:34PM +0200, Mark Kettenis wrote:
> > Date: Wed, 5 Jul 2023 12:46:36 +0300
> > From: Jonathan Matthew 
> > 
> > On the Banana Pi R1 (aka Lamobo R1), the dwge interface on the soc is
> > connected to a broadcom switch chip.  It looks like this in the device
> > tree:
> > 
> > &gmac {
> > pinctrl-names = "default";
> > pinctrl-0 = <&gmac_rgmii_pins>;
> > phy-mode = "rgmii";
> > phy-supply = <®_gmac_3v3>;
> > status = "okay";
> > 
> > fixed-link {
> > speed = <1000>;
> > full-duplex;
> > };
> > 
> > mdio {
> > ...
> > }
> > };
> > 
> > This diff makes the fixed-link part work, setting the interface's link
> > state to up and the media type to IFM_1000_T|IFM_FDX instead of trying to
> > attach a phy.  After setting the media type, we need to call mii_statchg()
> > to configure the MAC appropriately.
> > 
> > ok?
> 
> Is there a reason why you structured this differently than how this is
> done for dwqe(4)?

I did this quite a while ago, so I'm not completely sure, but I think what
happened is that I based it on an earlier version of the dwqe diff.  Here's
a new version that looks a lot more like how dwqe does it:


Index: if_dwge.c
===================================================================
RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v
retrieving revision 1.16
diff -u -p -r1.16 if_dwge.c
--- if_dwge.c	25 Jun 2023 22:36:09 -0000	1.16
+++ if_dwge.c	5 Jul 2023 13:03:47 -0000
@@ -271,6 +271,7 @@ struct dwge_softc {
 #define sc_lladdr  sc_ac.ac_enaddr
struct mii_data sc_mii;
 #define sc_media   sc_mii.mii_media
+   uint64_tsc_fixed_media;
int sc_link;
int sc_phyloc;
int sc_force_thresh_dma_mode;
@@ -386,7 +387,7 @@ dwge_attach(struct device *parent, struc
 {
struct dwge_softc *sc = (void *)self;
struct fdt_attach_args *faa = aux;
-   struct ifnet *ifp;
+   struct ifnet *ifp = &sc->sc_ac.ac_if;
uint32_t phy, phy_supply;
uint32_t axi_config;
uint32_t mode, pbl;
@@ -457,6 +458,30 @@ dwge_attach(struct device *parent, struc
/* Reset PHY */
dwge_reset_phy(sc);
 
+   node = OF_getnodebyname(faa->fa_node, "fixed-link");
+   if (node) {
+   ifp->if_baudrate = IF_Mbps(OF_getpropint(node, "speed", 0));
+
+   switch (OF_getpropint(node, "speed", 0)) {
+   case 1000:
+   sc->sc_fixed_media = IFM_ETHER | IFM_1000_T;
+   break;
+   case 100:
+   sc->sc_fixed_media = IFM_ETHER | IFM_100_TX;
+   break;
+   default:
+   sc->sc_fixed_media = IFM_ETHER | IFM_AUTO;
+   break;
+   }
+   
+   if (OF_getpropbool(node, "full-duplex")) {
+   ifp->if_link_state = LINK_STATE_FULL_DUPLEX;
+   sc->sc_fixed_media |= IFM_FDX;
+   } else {
+   ifp->if_link_state = LINK_STATE_UP;
+   }
+   }
+
sc->sc_clk = clock_get_frequency(faa->fa_node, "stmmaceth");
if (sc->sc_clk > 25000)
sc->sc_clk = GMAC_GMII_ADDR_CR_DIV_124;
@@ -479,7 +504,6 @@ dwge_attach(struct device *parent, struc
timeout_set(&sc->sc_tick, dwge_tick, sc);
timeout_set(&sc->sc_rxto, dwge_rxtick, sc);
 
-   ifp = &sc->sc_ac.ac_if;
ifp->if_softc = sc;
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_xflags = IFXF_MPSAFE;
@@ -576,14 +600,23 @@ dwge_attach(struct device *parent, struc
dwge_write(sc, GMAC_AXI_BUS_MODE, mode);
}
 
-   mii_attach(self, &sc->sc_mii, 0xffffffff, sc->sc_phyloc,
-   (sc->sc_phyloc == MII_PHY_ANY) ? 0 : MII_OFFSET_ANY, 0);
-   if (LIST_FIRST(&sc->sc_mii.mii_phys) == NULL) {
-   printf("%s: no PHY found!\n", sc->sc_dev.dv_xname);
-   ifmedia_add(&sc->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL);
-   ifmedia_set(&sc->sc_media, IFM_ETHER|IFM_MANUAL);
-   } else
-   ifmedia_set(&sc->sc_media, IFM_ETHER|IFM_AUTO);
+   if (sc->sc_fixed_media == 0) {
+   mii_attach(self, &sc->sc_mii, 0xffffffff, sc->sc_phyloc,
+   (sc->sc_phyloc == MII_PHY_ANY) ? 0 

dwge(4) fixed-link support

2023-07-05 Thread Jonathan Matthew
On the Banana Pi R1 (aka Lamobo R1), the dwge interface on the soc is
connected to a broadcom switch chip.  It looks like this in the device
tree:

&gmac {
pinctrl-names = "default";
pinctrl-0 = <&gmac_rgmii_pins>;
phy-mode = "rgmii";
phy-supply = <®_gmac_3v3>;
status = "okay";

fixed-link {
speed = <1000>;
full-duplex;
};

mdio {
...
}
};

This diff makes the fixed-link part work, setting the interface's link
state to up and the media type to IFM_1000_T|IFM_FDX instead of trying to
attach a phy.  After setting the media type, we need to call mii_statchg()
to configure the MAC appropriately.

ok?


Index: if_dwge.c
===================================================================
RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v
retrieving revision 1.16
diff -u -p -r1.16 if_dwge.c
--- if_dwge.c	25 Jun 2023 22:36:09 -0000	1.16
+++ if_dwge.c	5 Jul 2023 09:16:41 -0000
@@ -271,6 +271,7 @@ struct dwge_softc {
 #define sc_lladdr  sc_ac.ac_enaddr
struct mii_data sc_mii;
 #define sc_media   sc_mii.mii_media
+   uint64_tsc_fixed_media;
int sc_link;
int sc_phyloc;
int sc_force_thresh_dma_mode;
@@ -386,7 +387,7 @@ dwge_attach(struct device *parent, struc
 {
struct dwge_softc *sc = (void *)self;
struct fdt_attach_args *faa = aux;
-   struct ifnet *ifp;
+   struct ifnet *ifp = &sc->sc_ac.ac_if;
uint32_t phy, phy_supply;
uint32_t axi_config;
uint32_t mode, pbl;
@@ -403,16 +404,6 @@ dwge_attach(struct device *parent, struc
}
sc->sc_dmat = faa->fa_dmat;
 
-   /* Lookup PHY. */
-   phy = OF_getpropint(faa->fa_node, "phy", 0);
-   if (phy == 0)
-   phy = OF_getpropint(faa->fa_node, "phy-handle", 0);
-   node = OF_getnodebyphandle(phy);
-   if (node)
-   sc->sc_phyloc = OF_getpropint(node, "reg", MII_PHY_ANY);
-   else
-   sc->sc_phyloc = MII_PHY_ANY;
-
pinctrl_byname(faa->fa_node, "default");
 
/* Enable clocks. */
@@ -449,13 +440,48 @@ dwge_attach(struct device *parent, struc
if (OF_is_compatible(faa->fa_node, "starfive,jh7100-gmac"))
sc->sc_defrag = 1;
 
-   /* Power up PHY. */
-   phy_supply = OF_getpropint(faa->fa_node, "phy-supply", 0);
-   if (phy_supply)
-   regulator_enable(phy_supply);
+   node = OF_getnodebyname(faa->fa_node, "fixed-link");
+   if (node == 0) {
+   /* Lookup PHY. */
+   phy = OF_getpropint(faa->fa_node, "phy", 0);
+   if (phy == 0)
+   phy = OF_getpropint(faa->fa_node, "phy-handle", 0);
+   node = OF_getnodebyphandle(phy);
+   if (node)
+   sc->sc_phyloc = OF_getpropint(node, "reg", MII_PHY_ANY);
+   else
+   sc->sc_phyloc = MII_PHY_ANY;
+
+   /* Power up PHY. */
+   phy_supply = OF_getpropint(faa->fa_node, "phy-supply", 0);
+   if (phy_supply)
+   regulator_enable(phy_supply);
 
-   /* Reset PHY */
-   dwge_reset_phy(sc);
+   /* Reset PHY */
+   dwge_reset_phy(sc);
+   } else {
+   ifp->if_baudrate = IF_Mbps(OF_getpropint(node,
+   "speed", 0));
+
+   switch (OF_getpropint(node, "speed", 0)) {
+   case 1000:
+   sc->sc_fixed_media = IFM_ETHER | IFM_1000_T;
+   break;
+   case 100:
+   sc->sc_fixed_media = IFM_ETHER | IFM_100_TX;
+   break;
+   default:
+   sc->sc_fixed_media = IFM_ETHER | IFM_AUTO;
+   break;
+   }
+
+   if (OF_getpropbool(node, "full-duplex")) {
+   ifp->if_link_state = LINK_STATE_FULL_DUPLEX;
+   sc->sc_fixed_media |= IFM_FDX;
+   } else {
+   ifp->if_link_state = LINK_STATE_UP;
+   }
+   }
 
sc->sc_clk = clock_get_frequency(faa->fa_node, "stmmaceth");
if (sc->sc_clk > 25000)
@@ -479,7 +505,6 @@ dwge_attach(struct device *parent, struc
timeout_set(&sc->sc_tick, dwge_tick, sc);
timeout_set(&sc->sc_rxto, dwge_rxtick, sc);
 
-   ifp = &sc->sc_ac.ac_if;
ifp->if_softc = sc;
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_xflags = IFXF_MPSAFE;
@@ -576,14 +601,21 @@ dwge_attach(struct device *parent, struc
dwge_write(sc, GMAC_AXI_BUS_MODE, mode);
}
 
-   mii_attach(self, &sc->sc_mii, 0xffffffff, sc->sc_phyloc,
-   (sc->sc_phyloc == MII_PHY_ANY) ? 0 : MII_OFFSET_ANY, 0);
-   if (LIST_FIRST(&sc->sc_mii.

Re: cksum remove redundant code

2023-07-04 Thread Jonathan Matthew
ok jmatthew@

On Tue, Jul 04, 2023 at 12:20:32PM +0300, Alexander Bluhm wrote:
> anyone?
> 
> On Fri, May 26, 2023 at 06:44:25PM +0200, Alexander Bluhm wrote:
> > Hi,
> > 
> > in_ifcap_cksum() checks ifp == NULL
> > in_hdr_cksum_out() sets ip_sum = 0
> > in_proto_cksum_out() and in6_proto_cksum_out() always write
> > th_sum if M_TCP_CSUM_OUT is set and proto is IPPROTO_TCP.
> > 
> > ok?
> > 
> > bluhm
> > 
> > Index: netinet/ip_output.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
> > retrieving revision 1.388
> > diff -u -p -r1.388 ip_output.c
> > --- netinet/ip_output.c	22 May 2023 16:08:34 -0000	1.388
> > +++ netinet/ip_output.c	26 May 2023 11:55:49 -0000
> > @@ -1801,7 +1801,7 @@ in_hdr_cksum_out(struct mbuf *m, struct 
> > struct ip *ip = mtod(m, struct ip *);
> >  
> > ip->ip_sum = 0;
> > -   if (ifp && in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) {
> > +   if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) {
> > SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT);
> > } else {
> > ipstat_inc(ips_outswcsum);
> > Index: netinet/tcp_output.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
> > retrieving revision 1.138
> > diff -u -p -r1.138 tcp_output.c
> > --- netinet/tcp_output.c	15 May 2023 16:34:56 -0000	1.138
> > +++ netinet/tcp_output.c	26 May 2023 15:19:12 -0000
> > @@ -1295,7 +1295,6 @@ tcp_chopper(struct mbuf *m0, struct mbuf
> >  
> > /* copy and adjust IP header, calculate checksum */
> > SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> > -   mhth->th_sum = 0;
> > if (ip) {
> > struct ip *mhip;
> >  
> > @@ -1328,10 +1327,8 @@ tcp_chopper(struct mbuf *m0, struct mbuf
> > }
> > /* adjust IP header, calculate checksum */
> > SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> > -   th->th_sum = 0;
> > if (ip) {
> > ip->ip_len = htons(m0->m_pkthdr.len);
> > -   ip->ip_sum = 0;
> > in_hdr_cksum_out(m0, ifp);
> > in_proto_cksum_out(m0, ifp);
> > }
> 



bge(4) kstats

2023-07-02 Thread Jonathan Matthew
This adds kstats for the hardware counters available in bge(4) devices, BCM5705
and newer.  The main complication is that some of the counters are already used
in bge_stats_update_regs() as part of a hardware bug workaround, some are
affected by hardware bugs themselves, and some are read to update interface
counters.
I decided to leave that as-is as much as possible.

The main changes to bge_stats_update_regs() are to always read the outgoing
ucast/mcast/bcast packet counters (instead of just when we're working around the
RDMA bug) and to accumulate any counters read into the kstat buffer, so
bge_kstat_read() doesn't have to touch them.  All the hardware counters reset on
read, so avoiding double handling keeps things simple.  This means
bge_stats_update_regs() also has to be called with bge_kstat_mtx held, so to
decrease the number of '#if NKSTAT > 0' the mutex is compiled in even in kernels
without kstat.
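
The read side then does the same fold under the mutex, per counter,
something like this (a sketch; 'reg' stands in for the per-counter register
offset and kvs for the kstat_kv array in the kstat data):

	mtx_enter(&sc->bge_kstat_mtx);
	/* counters clear on read, so each read is a delta to add */
	kstat_kv_u64(&kvs[bge_stat_collisions]) += CSR_READ_4(sc, reg);
	mtx_leave(&sc->bge_kstat_mtx);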

On a lightly used machine that sees a lot of multicast and broadcast due to
being near Windows desktops, the stats look like this:

ok?

bge0:0:bge-stats:0
      out octets: 738725 bytes
      collisions: 0
        xon sent: 0
       xoff sent: 0
     xmit errors: 0
     coll frames: 0 packets
 multicoll frame: 0 packets
   deferred xmit: 0
     excess coll: 0
       late coll: 0
  out ucast pkts: 1495 packets
  out mcast pkts: 0 packets
  out bcast pkts: 5 packets
       in octets: 10192782 bytes
       fragments: 0
   in ucast pkts: 1736 packets
   in mcast pkts: 27251 packets
   in bcast pkts: 42984 packets
      FCS errors: 0
    align errors: 0
        xon rcvd: 0
       xoff rcvd: 0
 ctrlframes rcvd: 0
    xoff entered: 0
 too long frames: 0
         jabbers: 0
  too short pkts: 0
     DMA RQ full: 0
   DMA HPRQ full: 0
  SDC queue full: 0
    sendprod set: 0
   stats updated: 0
            irqs: 0
    avoided irqs: 0
   tx thresh hit: 0
        filtdrop: 0
    DMA WRQ full: 0
  DMA HPWRQ full: 0
      out of BDs: 10
     if in drops: 0
    if in errors: 0
   rx thresh hit: 0



Index: if_bge.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_bge.c,v
retrieving revision 1.400
diff -u -p -u -p -r1.400 if_bge.c
--- if_bge.c	18 Jan 2023 23:31:37 -0000	1.400
+++ if_bge.c	3 Jul 2023 06:09:42 -0000
@@ -74,6 +74,7 @@
 
 #include "bpfilter.h"
 #include "vlan.h"
+#include "kstat.h"
 
 #include 
 #include 
@@ -85,6 +86,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -203,6 +205,58 @@ void bge_ape_unlock(struct bge_softc *, 
 void bge_ape_send_event(struct bge_softc *, uint32_t);
 void bge_ape_driver_state_change(struct bge_softc *, int);
 
+#if NKSTAT > 0
+void bge_kstat_attach(struct bge_softc *);
+
+enum {
+   bge_stat_out_octets = 0,
+   bge_stat_collisions,
+   bge_stat_xon_sent,
+   bge_stat_xoff_sent,
+   bge_stat_xmit_errors,
+   bge_stat_coll_frames,
+   bge_stat_multicoll_frames,
+   bge_stat_deferred_xmit,
+   bge_stat_excess_coll,
+   bge_stat_late_coll,
+   bge_stat_out_ucast_pkt,
+   bge_stat_out_mcast_pkt,
+   bge_stat_out_bcast_pkt,
+   bge_stat_in_octets,
+   bge_stat_fragments,
+   bge_stat_in_ucast_pkt,
+   bge_stat_in_mcast_pkt,
+   bge_stat_in_bcast_pkt,
+   bge_stat_fcs_errors,
+   bge_stat_align_errors,
+   bge_stat_xon_rcvd,
+   bge_stat_xoff_rcvd,
+   bge_stat_ctrl_frame_rcvd,
+   bge_stat_xoff_entered,
+   bge_stat_too_long_frames,
+   bge_stat_jabbers,
+   bge_stat_too_short_pkts,
+
+   bge_stat_dma_rq_full,
+   bge_stat_dma_hprq_full,
+   bge_stat_sdc_queue_full,
+   bge_stat_nic_sendprod_set,
+   bge_stat_status_updated,
+   bge_stat_irqs,
+   bge_stat_avoided_irqs,
+   bge_stat_tx_thresh_hit,
+
+   bge_stat_filtdrop,
+   bge_stat_dma_wrq_full,
+   bge_stat_dma_hpwrq_full,
+   bge_stat_out_of_bds,
+   bge_stat_if_in_drops,
+   bge_stat_if_in_errors,
+   bge_stat_rx_thresh_hit,
+};
+
+#endif
+
 #ifdef BGE_DEBUG
 #define DPRINTF(x) do { if (bgedebug) printf x; } while (0)
 #define DPRINTFN(n,x)  do { if (bgedebug >= (n)) printf x; } while (0)
@@ -2993,6 +3047,12 @@ bge_attach(struct device *parent, struct
else
sc->bge_return_ring_cnt = BGE_RETURN_RING_CNT_5705;
 
+   mtx_init(&sc->bge_kstat_mtx, IPL_SOFTCLOCK);
+#if NKSTAT > 0
+   if (BGE_IS_5705_PLUS(sc))
+   bge_kstat_attach(sc);
+#endif
+
/* Set up ifnet structure */
ifp = &sc->arpcom.ac_if;
ifp->if_softc = sc;
@@ -3767,9 +3827,11 @@ bge_tick(void *xsc)
 
s = splnet();
 
-   if (BGE_IS_5705_PLUS(sc))
+   if (BGE_IS_5705_PLUS(sc)) {
+   mtx_enter(&sc->bge_kstat_mtx);
bge_stats_update_regs(sc);
-   else
+   mtx_leave(&sc->bge_kstat_mtx);
+   } else
bge_stats_update(sc);
 
if (sc->bge_flags & BGE_FIBER_TBI) {
@@ -37

dwge(4) kstats

2023-06-16 Thread Jonathan Matthew
This adds kstats for hardware counters available in (some) dwge devices.
If the counters are not present, as in Allwinner A20 devices among others,
they just read 0.  On an RK3399 device, they do exist, and they yield
something like this:

rp64$ kstat dwge0::dwge-stats:
dwge0:0:dwge-stats:0
 tx octets total: 6152530 bytes
 tx frames total: 7897 packets
    tx underflow: 0 packets
  tx carrier err: 0 packets
  tx good octets: 6152530 bytes
  tx good frames: 7897 packets
 rx frames total: 7685 packets
 rx octets total: 960726 bytes
  rx good octets: 960726 bytes
   rx good mcast: 0 packets
   rx crc errors: 0 packets
   rx len errors: 0 packets
     rx fifo err: 0 packets

The counters are all 32 bit, so to avoid overflow, we set them to reset on
read, accumulate the values read into 64 bit counters, and read all the
counters when an MMC interrupt occurs, which happens when one or more
counters reach 0x800.  Writing all ones to GMAC_MMC_RX_INT_MSK and
GMAC_MMC_TX_INT_MSK masks the MMC interrupts, so the diff removes those.
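
The read function then just walks a counter table and folds each
self-clearing register into its 64-bit value, along these lines (a sketch
of the pattern, with kvs being the kstat_kv array in ks_data, not the full
dwge_kstat_read()):

	/* each counter clears on read, so every read is a delta */
	for (i = 0; i < nitems(dwge_counters); i++)
		kstat_kv_u64(&kvs[i]) += dwge_read(sc, dwge_counters[i].c_reg);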

ok?


Index: if_dwge.c
===================================================================
RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v
retrieving revision 1.15
diff -u -p -r1.15 if_dwge.c
--- if_dwge.c	26 Feb 2023 13:28:12 -0000	1.15
+++ if_dwge.c	16 Jun 2023 06:50:58 -0000
@@ -21,6 +21,7 @@
  */
 
 #include "bpfilter.h"
+#include "kstat.h"
 
 #include 
 #include 
@@ -54,6 +55,10 @@
 #include 
 #endif
 
+#if NKSTAT > 0
+#include 
+#endif
+
 #include 
 #include 
 
@@ -97,8 +102,24 @@
 #define  GMAC_INT_MASK_RIM (1 << 0)
 #define GMAC_MAC_ADDR0_HI  0x0040
 #define GMAC_MAC_ADDR0_LO  0x0044
+#define GMAC_MAC_MMC_CTRL  0x0100
+#define  GMAC_MAC_MMC_CTRL_ROR (1 << 2)
+#define  GMAC_MAC_MMC_CTRL_CR  (1 << 0)
 #define GMAC_MMC_RX_INT_MSK0x010c
 #define GMAC_MMC_TX_INT_MSK0x0110
+#define GMAC_MMC_TXOCTETCNT_GB 0x0114
+#define GMAC_MMC_TXFRMCNT_GB   0x0118
+#define GMAC_MMC_TXUNDFLWERR   0x0148
+#define GMAC_MMC_TXCARERR  0x0160
+#define GMAC_MMC_TXOCTETCNT_G  0x0164
+#define GMAC_MMC_TXFRMCNT_G0x0168
+#define GMAC_MMC_RXFRMCNT_GB   0x0180
+#define GMAC_MMC_RXOCTETCNT_GB 0x0184
+#define GMAC_MMC_RXOCTETCNT_G  0x0188
+#define GMAC_MMC_RXMCFRMCNT_G  0x0190
+#define GMAC_MMC_RXCRCERR  0x0194
+#define GMAC_MMC_RXLENERR  0x01c8
+#define GMAC_MMC_RXFIFOOVRFLW  0x01d4
 #define GMAC_MMC_IPC_INT_MSK   0x0200
 #define GMAC_BUS_MODE  0x1000
 #define  GMAC_BUS_MODE_8XPBL   (1 << 24)
@@ -113,6 +134,7 @@
 #define GMAC_RX_DESC_LIST_ADDR 0x100c
 #define GMAC_TX_DESC_LIST_ADDR 0x1010
 #define GMAC_STATUS0x1014
+#define  GMAC_STATUS_MMC   (1 << 27)
 #define  GMAC_STATUS_RI(1 << 6)
 #define  GMAC_STATUS_TU(1 << 2)
 #define  GMAC_STATUS_TI(1 << 0)
@@ -277,6 +299,11 @@ struct dwge_softc {
uint32_tsc_clk_sel_125;
uint32_tsc_clk_sel_25;
uint32_tsc_clk_sel_2_5;
+
+#if NKSTAT > 0
+   struct mutexsc_kstat_mtx;
+   struct kstat*sc_kstat;
+#endif
 };
 
 #define DEVNAME(_s)((_s)->sc_dev.dv_xname)
@@ -334,6 +361,11 @@ void   dwge_dmamem_free(struct dwge_softc 
 struct mbuf *dwge_alloc_mbuf(struct dwge_softc *, bus_dmamap_t);
 void   dwge_fill_rx_ring(struct dwge_softc *);
 
+#if NKSTAT > 0
+intdwge_kstat_read(struct kstat *);
+void   dwge_kstat_attach(struct dwge_softc *);
+#endif
+
 int
 dwge_match(struct device *parent, void *cfdata, void *aux)
 {
@@ -555,13 +587,14 @@ dwge_attach(struct device *parent, struc
 
if_attach(ifp);
ether_ifattach(ifp);
+#if NKSTAT > 0
+   dwge_kstat_attach(sc);
+#endif
 
/* Disable interrupts. */
dwge_write(sc, GMAC_INT_ENA, 0);
dwge_write(sc, GMAC_INT_MASK,
GMAC_INT_MASK_LPIIM | GMAC_INT_MASK_PIM | GMAC_INT_MASK_RIM);
-   dwge_write(sc, GMAC_MMC_RX_INT_MSK, 0xffffffff);
-   dwge_write(sc, GMAC_MMC_TX_INT_MSK, 0xffffffff);
 dwge_write(sc, GMAC_MMC_IPC_INT_MSK, 0xffffffff);
 
sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE,
@@ -921,6 +954,14 @@ dwge_intr(void *arg)
reg & GMAC_STATUS_TU)
dwge_tx_proc(sc);
 
+#if NKSTAT > 0
+   if (reg & GMAC_STATUS_MMC) {
+   mtx_enter(&sc->sc_kstat_mtx);
+   dwge_kstat_read(sc->sc_kstat);
+   mtx_leave(&sc->sc_kstat_mtx);
+   }
+#endif
+
return (1);
 }
 
@@ -1660,3 +1701,77 @@ dwge_mii_statchg_rockchip(struct device 
 
regmap_write_4(rm, sc->sc_clk_sel, gmac_clk_sel);
 }
+
+#if NKSTAT > 0
+
+struct dwge_counter {
+   const char  *c_name;
+   enum kstat_kv_unit  c_unit;
+   uint32_tc_reg;
+};
+
+const struct dwge_counter dwge_counters[] = {
+   { "tx octets total", KSTAT_KV_U_BYTES, GMAC_MMC_TXOCTETCNT_GB },
+   { "tx frames total", KSTAT_KV_U_PACKET

ypldap: try servers until one succeeds

2023-05-18 Thread Jonathan Matthew
We sometimes run into situations where one of the three servers a ypldap
can talk to will accept a TCP connection but won't do TLS properly, or won't
perform LDAP searches.  ypldap currently only tries servers until one accepts
the connection, so when this happens, it is less successful at updating than
it could be.

The diff below adjusts the ldap update code so it tries servers until it
either successfully queries one or it runs out of addresses to try.
If a server breaks after returning partial results, the ldap process will
still send what it got to the main process.  If the ldap process then gets
full results from another server, those will overwrite the partial results,
and if it doesn't, the main process will discard the partial results when it
gets a 'trash update' message from the ldap process.

While here, the diff also adds the server address to log messages about
servers not working, so it's easier to figure out what's going wrong.

ok?

Index: ldapclient.c
===================================================================
RCS file: /cvs/src/usr.sbin/ypldap/ldapclient.c,v
retrieving revision 1.46
diff -u -p -r1.46 ldapclient.c
--- ldapclient.c	13 Oct 2022 04:55:33 -0000	1.46
+++ ldapclient.c	3 Feb 2023 03:58:17 -0000
@@ -53,50 +53,10 @@ int client_build_req(struct idm *, struc
int, int);
 intclient_search_idm(struct env *, struct idm *, struct aldap *,
char **, char *, int, int, enum imsg_type);
-intclient_try_idm(struct env *, struct idm *);
+intclient_try_idm(struct env *, struct idm *, struct ypldap_addr *);
 void   client_addr_init(struct idm *);
 intclient_addr_free(struct idm *);
 
-struct aldap   *client_aldap_open(struct ypldap_addr_list *);
-
-/*
- * dummy wrapper to provide aldap_init with its fd's.
- */
-struct aldap *
-client_aldap_open(struct ypldap_addr_list *addr)
-{
-   int  fd = -1;
-   struct ypldap_addr  *p;
-   struct aldap*al;
-
-   TAILQ_FOREACH(p, addr, next) {
-   char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
-   struct sockaddr *sa = (struct sockaddr *)&p->ss;
-
-   if (getnameinfo(sa, SA_LEN(sa), hbuf, sizeof(hbuf), sbuf,
-   sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV))
-   errx(1, "could not get numeric hostname");
-
-   if ((fd = socket(sa->sa_family, SOCK_STREAM, 0)) == -1)
-   return NULL;
-
-   if (connect(fd, sa, SA_LEN(sa)) == 0)
-   break;
-
-   log_warn("connect to %s port %s failed", hbuf, sbuf);
-   close(fd);
-   fd = -1;
-   }
-
-   if (fd == -1)
-   return NULL;
-
-   al = aldap_init(fd);
-   if (al == NULL)
-   close(fd);
-   return al;
-}
-
 void
 client_addr_init(struct idm *idm)
 {
@@ -241,8 +201,12 @@ client_dispatch_dns(int fd, short events
}
 
TAILQ_FOREACH(idm, &env->sc_idms, idm_entry) {
-   if (client_try_idm(env, idm) == -1)
-   idm->idm_state = STATE_LDAP_FAIL;
+   TAILQ_FOREACH(h, &idm->idm_addr, next) {
+   if (client_try_idm(env, idm, h) == -1)
+   idm->idm_state = STATE_LDAP_FAIL;
+   else
+   break;
+   }
 
if (idm->idm_state < STATE_LDAP_DONE)
wait_cnt++;
@@ -585,17 +549,36 @@ fail:
 }
 
 int
-client_try_idm(struct env *env, struct idm *idm)
+client_try_idm(struct env *env, struct idm *idm, struct ypldap_addr *addr)
 {
const char  *where;
+   char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
char*attrs[ATTR_MAX+1];
+   int  fd = -1;
int  i, j;
+   struct sockaddr *sa = (struct sockaddr *)&addr->ss;
struct aldap_message*m;
struct aldap*al;
 
+   if (getnameinfo(sa, SA_LEN(sa), hbuf, sizeof(hbuf), sbuf,
+   sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV))
+   errx(1, "could not get numeric hostname");
+
where = "connect";
-   if ((al = client_aldap_open(&idm->idm_addr)) == NULL)
+   if ((fd = socket(sa->sa_family, SOCK_STREAM, 0)) == -1)
+   return (-1);
+
+   if (connect(fd, sa, SA_LEN(sa)) != 0) {
+   log_warn("connect to %s port %s failed", hbuf, sbuf);
+   close(fd);
+   return (-1);
+   }
+
+   al = aldap_init(fd);
+   if (al == NULL) {
+   close(fd);
return (-1);
+   }
 
if (idm->idm_flags & F_STARTTLS) {
log_debug("requesting starttls");
@@ -625,8 +608,8 @@ client_try_idm(struct env *env, struct i
if (aldap_tls(al, idm->idm_tls_config, idm->i

mvsw phy-mode support

2023-04-08 Thread Jonathan Matthew
On the Turris Omnia, the host can't transmit over the interface linked to
the switch unless mvsw applies phy-mode settings to the port on its side,
specifically the rgmii delay settings.

ok?


Index: mvsw.c
===================================================================
RCS file: /cvs/src/sys/dev/fdt/mvsw.c,v
retrieving revision 1.5
diff -u -p -r1.5 mvsw.c
--- mvsw.c	6 Apr 2022 18:59:28 -0000	1.5
+++ mvsw.c	9 Apr 2023 04:19:21 -0000
@@ -52,6 +52,10 @@
 #define MVSW_PORT(x)   (0x10 + (x))
 #define MVSW_G20x1c
 
+#define MVSW_PORT_MAC_CTL  0x01
+#define  MVSW_PORT_MAC_CTL_RGMII_RXID  0x8000
+#define  MVSW_PORT_MAC_CTL_RGMII_TXID  0x4000
+#define  MVSW_PORT_MAC_CTL_RGMII_MASK  0xc000
 #define MVSW_PORT_SWITCHID 0x03
 #define  MVSW_PORT_SWITCHID_PROD_MASK  0xfff0
 #define  MVSW_PORT_SWITCHID_PROD_88E6141 0x3400
@@ -70,6 +74,17 @@
 /* XXX #include  */
 #define MDIO_MMD_PHYXS 4
 
+const struct {
+const char  *name;
+uint16_tmac_ctl;
+} mvsw_phy_modes[] = {
+{ "rgmii",  0 },
+{ "rgmii-id",   MVSW_PORT_MAC_CTL_RGMII_TXID |
+MVSW_PORT_MAC_CTL_RGMII_RXID },
+{ "rgmii-rxid", MVSW_PORT_MAC_CTL_RGMII_RXID },
+{ "rgmii-txid", MVSW_PORT_MAC_CTL_RGMII_TXID }
+};
+
 struct mvsw_softc {
struct device   sc_dev;
 
@@ -310,12 +325,27 @@ mvsw_serdes_write(struct mvsw_softc *sc,
 void
 mvsw_port_enable(struct mvsw_softc *sc, int node)
 {
+   char phy_mode[16] = { 0 };
uint16_t val;
-   int port;
+   int port, i;
 
port = OF_getpropint(node, "reg", -1);
if (port == -1)
return;
+
+   OF_getprop(node, "phy-mode", phy_mode, sizeof(phy_mode));
+   for (i = 0; i < nitems(mvsw_phy_modes); i++) {
+   if (strcmp(phy_mode, mvsw_phy_modes[i].name) == 0) {
+   val = mvsw_smi_read(sc, MVSW_PORT(port),
+   MVSW_PORT_MAC_CTL);
+   val &= ~MVSW_PORT_MAC_CTL_RGMII_MASK;
+   val |= mvsw_phy_modes[i].mac_ctl;
+   mvsw_smi_write(sc, MVSW_PORT(port),
+   MVSW_PORT_MAC_CTL, val);
+
+   break;
+   }
+   }
 
/* Enable port. */
val = mvsw_smi_read(sc, MVSW_PORT(port), MVSW_PORT_CTRL);



ypldap: reduce imsg traffic

2023-03-26 Thread Jonathan Matthew
On systems where we pull in around 100k users from ldap, ypldap uses a
fair bit of memory (over 300MB peak) moving data from the ldapclient process
to the main process.

The ldapclient process sends each user and group record to the parent process
in instances of struct idm_req, which includes a 1kB buffer for the user/group
details.  It currently sends the full struct through imsg, but only sending
the used portion of the 1kB buffer reduces peak memory usage to around 100MB,
and it turns out it's pretty easy, as in the diff below.

ok?


Index: ldapclient.c
===================================================================
RCS file: /cvs/src/usr.sbin/ypldap/ldapclient.c,v
retrieving revision 1.46
diff -u -p -r1.46 ldapclient.c
--- ldapclient.c	13 Oct 2022 04:55:33 -0000	1.46
+++ ldapclient.c	27 Mar 2023 04:19:53 -0000
@@ -567,7 +567,8 @@ client_search_idm(struct env *env, struc
 
if (client_build_req(idm, &ir, m, min_attr, max_attr) 
== 0)
imsg_compose_event(env->sc_iev, type, 0, 0, -1,
-   &ir, sizeof(ir));
+   &ir, sizeof(ir.ir_key) +
+   strlen(ir.ir_line) + 1);
 
aldap_freemsg(m);   
}
Index: ypldap.c
===================================================================
RCS file: /cvs/src/usr.sbin/ypldap/ypldap.c,v
retrieving revision 1.23
diff -u -p -r1.23 ypldap.c
--- ypldap.c	22 Aug 2022 08:02:02 -0000	1.23
+++ ypldap.c	27 Mar 2023 04:19:53 -0000
@@ -392,7 +392,7 @@ main_dispatch_client(int fd, short event
if (env->update_trashed)
break;
 
-   (void)memcpy(&ir, imsg.data, sizeof(ir));
+   (void)memcpy(&ir, imsg.data, n - IMSG_HEADER_SIZE);
if ((ue = calloc(1, sizeof(*ue))) == NULL ||
(ue->ue_line = strdup(ir.ir_line)) == NULL) {
/*
@@ -418,7 +418,7 @@ main_dispatch_client(int fd, short event
if (env->update_trashed)
break;
 
-   (void)memcpy(&ir, imsg.data, sizeof(ir));
+   (void)memcpy(&ir, imsg.data, n - IMSG_HEADER_SIZE);
if ((ge = calloc(1, sizeof(*ge))) == NULL ||
(ge->ge_line = strdup(ir.ir_line)) == NULL) {
/*



Re: vmm: mask WAITPKG cpuid feature to hide TPAUSE

2023-01-08 Thread Jonathan Matthew
On Sun, Jan 08, 2023 at 03:09:44PM -0500, Dave Voutila wrote:
> 
> Philip Guenther  writes:
> 
> > On Sat, Jan 7, 2023 at 11:04 AM Dave Voutila  wrote:
> >
> >  Bringing this to tech@ to increase my chance of someone testing my
> >  diff.
> >
> >  As reported in this thread on misc@ [1], I believe newer Intel hardware
> >  may be experiencing issues hosting Linux guests under vmm/vmd. It looks
> >  like there are some newer instructions Intel added (TPAUSE specifically)
> >  that also involve some new MSR(s).
> >
> >  I don't have 12th gen Intel hardware to test this on (I think that's
> >  Alder Lake). I'd like to mask this feature from vmm guests since it's
> >  related to an MSR we don't yet pass through or emulate and has to do
> >  with the TSC (which has it's own challenges in vmm).
> >
> >  For someone testing, you should be able to grab an Alpine Linux iso
> >  (-virt flavor) and boot it with vmd with the diff. (Without it should
> >  "hang" and spike CPU or just die.) Also check that WAITPKG shows up in
> >  your dmesg on the cpu feature output.
> >
> > This seems like it'll obviously work, but I guess it seems to me that this
> > "opt-out" approach is generally unsafe/unstable and vmd should consider
> > actively switching to "opt-in" on all these CPUID feature bits.  I mean,
> > what bits are defined in the SEFF first-leaf EDX that _do_ work with vmd?
> >
> 
> Great point (I think you mean ECX). Here's an updated diff that flips it
> to a whitelist so Intel/AMD don't burn me with these new bits in the
> future. This better?

I tried this out on this cpu:
cpu0: 12th Gen Intel(R) Core(TM) i7-12700, 4789.57 MHz, 06-97-02

and it works as advertised, alpine-virt-3.17.0-x86_64.iso boots up to
the login prompt, which it doesn't do without the diff.

> 
> 
> diff refs/heads/master refs/heads/vmm-tsleep
> commit - bfce157fda90a812e1a99aa179a4c42f12ebfa24
> commit + 5b434c89250e1901340c11c8f9c380dc18d0ae91
> blob - 001a437045be145322be30288c1f47d63fb07634
> blob + 0bd908e273a1c0e6324e1bc9f8c8ca921555c86f
> --- sys/arch/amd64/amd64/identcpu.c
> +++ sys/arch/amd64/amd64/identcpu.c
> @@ -208,6 +208,7 @@ const struct {
>   { SEFF0ECX_AVX512VBMI,  "AVX512VBMI" },
>   { SEFF0ECX_UMIP,"UMIP" },
>   { SEFF0ECX_PKU, "PKU" },
> + { SEFF0ECX_WAITPKG, "WAITPKG" },
>  }, cpu_seff0_edxfeatures[] = {
>   { SEFF0EDX_AVX512_4FNNIW, "AVX512FNNIW" },
>   { SEFF0EDX_AVX512_4FMAPS, "AVX512FMAPS" },
> blob - cbde6cf9b02fc882a8ed17aa6adb5c43249e0302
> blob + b26bd32e2d9ea7386b1f58960dea40b787d6a341
> --- sys/arch/amd64/include/specialreg.h
> +++ sys/arch/amd64/include/specialreg.h
> @@ -201,6 +201,7 @@
>  #define SEFF0ECX_AVX512VBMI  0x0002 /* AVX-512 vector bit inst */
>  #define SEFF0ECX_UMIP0x0004 /* UMIP support */
>  #define SEFF0ECX_PKU 0x0008 /* Page prot keys for user mode */
> +#define SEFF0ECX_WAITPKG 0x0010 /* UMONITOR/UMWAIT/TPAUSE insns */
>  /* SEFF EDX bits */
>  #define SEFF0EDX_AVX512_4FNNIW   0x0004 /* AVX-512 neural network insns */
>  #define SEFF0EDX_AVX512_4FMAPS   0x0008 /* AVX-512 mult accum single prec */
> blob - 6b4802abf4b508495cdbc961bd799d3fa83b9c36
> blob + 032444b05e19d7fbec96a0d11b5b340f668c0917
> --- sys/arch/amd64/include/vmmvar.h
> +++ sys/arch/amd64/include/vmmvar.h
> @@ -672,8 +672,10 @@ struct vm_mprotect_ept_params {
>  SEFF0EBX_AVX512IFMA | SEFF0EBX_AVX512PF | \
>  SEFF0EBX_AVX512ER | SEFF0EBX_AVX512CD | \
>  SEFF0EBX_AVX512BW | SEFF0EBX_AVX512VL)
> -#define VMM_SEFF0ECX_MASK ~(SEFF0ECX_AVX512VBMI)
> 
> +/* ECX mask contains the bits to include */
> +#define VMM_SEFF0ECX_MASK (SEFF0ECX_PREFETCHWT1 | SEFF0ECX_UMIP | SEFF0ECX_PKU)
> +
>  /* EDX mask contains the bits to include */
>  #define VMM_SEFF0EDX_MASK (SEFF0EDX_MD_CLEAR)
> 
> blob - 310208ac4cdb262aaedfa9b78d869fd5911607b2
> blob + ccf1164fd658a69dc383e1602ae0ce1f269de4e4
> --- sys/arch/i386/i386/machdep.c
> +++ sys/arch/i386/i386/machdep.c
> @@ -1038,6 +1038,7 @@ const struct cpu_cpuid_feature cpu_seff0_ecxfeatures[]
>   { SEFF0ECX_UMIP,"UMIP" },
>   { SEFF0ECX_AVX512VBMI,  "AVX512VBMI" },
>   { SEFF0ECX_PKU, "PKU" },
> + { SEFF0ECX_WAITPKG, "WAITPKG" },
>  };
> 
>  const struct cpu_cpuid_feature cpu_seff0_edxfeatures[] = {
> blob - 392b4ff412e2dd3c4c48ed6c9c84aa2358721c6a
> blob + 7ce77ca3fdc6bd1a51571dd0b5dbf5afc311a138
> --- sys/arch/i386/include/specialreg.h
> +++ sys/arch/i386/include/specialreg.h
> @@ -190,6 +190,7 @@
>  #define SEFF0ECX_AVX512VBMI  0x0002 /* AVX-512 vector bit inst */
>  #define SEFF0ECX_UMIP0x0004 /* UMIP support */
>  #define SEFF0ECX_PKU 0x0008 /* Page prot keys for user mode */
> +#define SEFF0ECX_WAITPKG 0x0010 /* UMONITOR/UMWAIT/TPAUSE insns */
>  /* SEFF EDX bits */
>  #define SEFF0EDX_AVX512_4FNNIW   0x0004 /* AVX-512 neural network insns */
>  #define SEFF0EDX_AVX512_4FMAPS   

Re: Fix evcount_percpu() after evcount_init_percpu() (plus bits for mips64)

2022-12-04 Thread Jonathan Matthew
On Sun, Dec 04, 2022 at 02:31:41PM +, Visa Hankala wrote:
> Do not re-insert the event counter to evcount_list in evcount_percpu().
> Otherwise the list becomes corrupt when evcount_percpu() is called
> after evcount_init_percpu().
> 
> OK?

clearly I never managed to test that path.  oops.
ok jmatthew@

> 
> As an extra, use percpu counters with mips64 clock and ipi interrupts.
> 
> Index: kern/subr_evcount.c
> ===================================================================
> RCS file: src/sys/kern/subr_evcount.c,v
> retrieving revision 1.14
> diff -u -p -r1.14 subr_evcount.c
> --- kern/subr_evcount.c	10 Nov 2022 07:05:41 -0000	1.14
> +++ kern/subr_evcount.c	4 Dec 2022 14:17:59 -0000
> @@ -56,7 +56,6 @@ evcount_percpu(struct evcount *ec)
>   TAILQ_INSERT_TAIL(&evcount_percpu_init_list, ec, next);
>   } else {
>   ec->ec_percpu = counters_alloc(1);
> - TAILQ_INSERT_TAIL(&evcount_list, ec, next);
>   }
>  }
>  
> Index: arch/mips64/mips64/clock.c
> ===================================================================
> RCS file: src/sys/arch/mips64/mips64/clock.c,v
> retrieving revision 1.48
> diff -u -p -r1.48 clock.c
> --- arch/mips64/mips64/clock.c	19 Nov 2022 16:23:48 -0000	1.48
> +++ arch/mips64/mips64/clock.c	4 Dec 2022 14:17:58 -0000
> @@ -37,7 +37,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>  #include 
>  #include 
> @@ -100,6 +99,7 @@ clockattach(struct device *parent, struc
>*/
>   set_intr(INTPRI_CLOCK, CR_INT_5, cp0_int5);
>   evcount_attach(&cp0_clock_count, "clock", &cp0_clock_irq);
> + evcount_percpu(&cp0_clock_count);
>  
>   /* try to avoid getting clock interrupts early */
>   cp0_set_compare(cp0_get_count() - 1);
> @@ -121,7 +121,7 @@ cp0_int5(uint32_t mask, struct trapframe
>   struct cpu_info *ci = curcpu();
>   int s;
>  
> - atomic_inc_long((unsigned long *)&cp0_clock_count.ec_count);
> + evcount_inc(&cp0_clock_count);
>  
>   cp0_set_compare(cp0_get_count() - 1);   /* clear INT5 */
>  
> Index: arch/mips64/mips64/ipifuncs.c
> ===================================================================
> RCS file: src/sys/arch/mips64/mips64/ipifuncs.c,v
> retrieving revision 1.25
> diff -u -p -r1.25 ipifuncs.c
> --- arch/mips64/mips64/ipifuncs.c	10 Apr 2022 13:23:14 -0000	1.25
> +++ arch/mips64/mips64/ipifuncs.c	4 Dec 2022 14:17:58 -0000
> @@ -84,6 +84,7 @@ mips64_ipi_init(void)
>   if (!cpuid) {
>   mtx_init(&smp_rv_mtx, IPL_HIGH);
>   evcount_attach(&ipi_count, "ipi", &ipi_irq);
> + evcount_percpu(&ipi_count);
>   }
>  
>   hw_ipi_intr_clear(cpuid);
> @@ -113,8 +114,7 @@ mips64_ipi_intr(void *arg)
>   for (bit = 0; bit < MIPS64_NIPIS; bit++) {
>   if (pending_ipis & (1UL << bit)) {
>   (*ipifuncs[bit])();
> - atomic_inc_long(
> - (unsigned long *)&ipi_count.ec_count);
> + evcount_inc(&ipi_count);
>   }
>   }
>   }
> 



acpimadt: ignore OEM-reserved apic structures

2022-11-21 Thread Jonathan Matthew
On a Dell R6515, acpimadt(4) prints this 512 times during boot:

 acpimadt0: unknown apic structure type 80

Previous generations of machines had a few of these, and they were easy
enough to ignore, but 512 is a bit excessive.

On further inspection, it seems types 0x80 through 0xFF are reserved for
OEM specific uses, which we're never going to be able to work with, so
complaining about it seems pointless.  If we encounter a non-OEM type we
don't know about, we should still report that though.

ok?


Index: acpimadt.c
===================================================================
RCS file: /cvs/src/sys/dev/acpi/acpimadt.c,v
retrieving revision 1.38
diff -u -p -r1.38 acpimadt.c
--- acpimadt.c	6 Apr 2022 18:59:27 -0000	1.38
+++ acpimadt.c	22 Nov 2022 03:58:00 -0000
@@ -418,8 +418,11 @@ acpimadt_attach(struct device *parent, s
break;
 
default:
-   printf("%s: unknown apic structure type %x\n",
-   self->dv_xname, entry->madt_lapic.apic_type);
+   if (entry->madt_lapic.apic_type < ACPI_MADT_OEM_RSVD) {
+   printf("%s: unknown apic structure type %x\n",
+   self->dv_xname,
+   entry->madt_lapic.apic_type);
+   }
}
 
addr += entry->madt_lapic.length;
Index: acpireg.h
===================================================================
RCS file: /cvs/src/sys/dev/acpi/acpireg.h,v
retrieving revision 1.58
diff -u -p -r1.58 acpireg.h
--- acpireg.h	9 Jan 2022 05:42:37 -0000	1.58
+++ acpireg.h	22 Nov 2022 03:58:01 -0000
@@ -352,6 +352,8 @@ struct acpi_madt_x2apic_nmi {
uint8_t reserved[3];
 } __packed;
 
+#define ACPI_MADT_OEM_RSVD 128
+
 union acpi_madt_entry {
struct acpi_madt_lapic  madt_lapic;
struct acpi_madt_ioapic madt_ioapic;



ypldap TLS by default

2022-10-13 Thread Jonathan Matthew
While working on ypconnect(2), Theo suggested that ypldap(8) should
not default to plaintext LDAP connections, since the data it's dealing with
is pretty important to the security of the system.  Here's a straightforward
diff implementing that, defaulting to what was previously called 'tls'
(STARTTLS on port 389), and adding a 'notls' option for plaintext.

ok?  other opinions on what this option should be called?

Index: parse.y
===================================================================
RCS file: /cvs/src/usr.sbin/ypldap/parse.y,v
retrieving revision 1.36
diff -u -p -u -p -r1.36 parse.y
--- parse.y	13 Oct 2022 04:55:33 -0000	1.36
+++ parse.y	13 Oct 2022 08:13:21 -0000
@@ -107,7 +107,7 @@ typedef struct {
%token SERVER FILTER ATTRIBUTE BASEDN BINDDN GROUPDN BINDCRED MAPS CHANGE DOMAIN PROVIDE
 %token USER GROUP TO EXPIRE HOME SHELL GECOS UID GID INTERVAL
 %token PASSWD NAME FIXED LIST GROUPNAME GROUPPASSWD GROUPGID MAP
-%token INCLUDE DIRECTORY CLASS PORT ERROR GROUPMEMBERS LDAPS TLS CAFILE
+%token INCLUDE DIRECTORY CLASS PORT ERROR GROUPMEMBERS LDAPS TLS NOTLS CAFILE
 %token BIND LOCAL PORTMAP BINDEXT CERTFILE KEYFILE
 %token   STRING
 %token   NUMBER
@@ -366,9 +366,10 @@ diropt : BINDDN STRING 
{
}
;
 
-ssl: /* empty */   { $$ = 0; }
+ssl: /* empty */   { $$ = F_STARTTLS; }
| LDAPS { $$ = F_SSL; }
| TLS   { $$ = F_STARTTLS; }
+   | NOTLS { $$ = 0; }
;
 
 directory  : DIRECTORY STRING port ssl {
@@ -556,6 +557,7 @@ lookup(char *s)
{ "map",MAP },
{ "maps",   MAPS },
{ "name",   NAME },
+   { "notls",  NOTLS },
{ "passwd", PASSWD },
{ "port",   PORT },
{ "portmap",PORTMAP },
Index: ypldap.conf.5
===================================================================
RCS file: /cvs/src/usr.sbin/ypldap/ypldap.conf.5,v
retrieving revision 1.28
diff -u -p -u -p -r1.28 ypldap.conf.5
--- ypldap.conf.5	13 Oct 2022 04:55:33 -0000	1.28
+++ ypldap.conf.5	13 Oct 2022 08:13:21 -0000
@@ -119,15 +119,19 @@ directory are used to construct YP map e
 .Bl -tag -width Ds
 .It Ic directory Ar hostname Oo Ic port Ar port Oc Oo tls Oc Brq ...
 Defines a directory by hostname and optionally port number.
-If the
+The
 .Ar tls
-argument is not specified, no transport-level security will be used.
+argument specifies the transport-level security used for the connection.
 Valid options are:
 .Bl -tag -width Ds
 .It Ic tls
-Use STARTTLS to negotiate TLS, by default on port 389.
+Use STARTTLS to negotiate TLS on port 389 unless an alternate port is
+specified.
+This is the default.
 .It Ic ldaps
-Connect with TLS enabled, by default on port 636.
+Connect with TLS enabled on port 636 unless an alternate port is specified.
+.It Ic notls
+Connect with no transport-level security.
 .El
 .El
 .Pp



Re: memory barrier in counters_zero

2022-09-25 Thread Jonathan Matthew
On Sat, Sep 17, 2022 at 04:28:15PM +0200, Alexander Bluhm wrote:
> Hi,
> 
> Inspired by Taylor's talk at EuroBSDCon I think a memory barrier
> in counters_zero() is missing.  Reading uses two consumer barriers,
> so writing should also have two.

Will slides or notes from this talk be available at some point?
I went looking but didn't find anything.

> 
> Following code would have no barrier between writing generation
> number and writing counters.
> 
> counters_leave();
> counters_zero();
> 
> counters_leave() writes to generation number at the end, so
> counters_zero() needs a barrier at the start.
> 
> ok?

This seems reasonable, and I don't see a reason not to add the barrier
here after release.

counters_zero() is currently unused, so it's a bit hard to reason about,
but I think using it sensibly would involve a memory barrier between the
call to counters_zero() and any other updates, either through a lock or a 
timeout/interrupt style barrier.
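
As a minimal sketch of what I mean (stats_mtx and nctrs are hypothetical
names), a caller could serialize the zeroing with a lock that writers
also take:

        mtx_enter(&stats_mtx);
        counters_zero(cm, nctrs);
        mtx_leave(&stats_mtx);

The unlock/lock pair then orders the zeroing against later updates, much
as the membar_producer() here does for the generation number.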

> 
> bluhm
> 
> Index: kern/subr_percpu.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/subr_percpu.c,v
> retrieving revision 1.9
> diff -u -p -r1.9 subr_percpu.c
> --- kern/subr_percpu.c10 Mar 2021 10:21:47 -  1.9
> +++ kern/subr_percpu.c17 Sep 2022 14:17:34 -
> @@ -213,6 +213,7 @@ counters_zero(struct cpumem *cm, unsigne
>   unsigned int i;
>  
>   counters = cpumem_first(&cmi, cm);
> + membar_producer();
>   do {
>   for (i = 0; i < n; i++)
>   counters[i] = 0;
> 



ypldap client cert authentication

2022-09-19 Thread Jonathan Matthew
This adds client certificate authentication to ypldap(8).  libtls makes the
actual certificate part of this straightforward (I would still like it
reviewed, though), but there are some LDAP complications.

Depending on your LDAP server and how you connect to it (LDAPS on port 636
or LDAP+TLS on port 389), a client presenting a certificate might
automatically be bound as the subject of the certificate, or it might not.
If it's not, the client can do an LDAP bind operation using the SASL
EXTERNAL mechanism to bind as the cert subject, and it can optionally specify
an identity, which means the bind will fail if the cert subject doesn't match
that identity.  If the client didn't present a certificate, the bind will
also fail (one would hope).

For reference, with Active Directory, SASL EXTERNAL bind is required when
using LDAP+TLS, but not when using LDAPS, and the client identity can be
specified in the form of "dn:" followed by the expected cert subject DN.
OpenLDAP doesn't seem to do automatic bind at all, so SASL EXTERNAL would
always be required there, and it doesn't appear to support specifying the
expected identity with the bind.

The diff adds 'certfile' and 'keyfile' config directives for specifying
the certificate to use, and a 'bindext' directive for enabling SASL EXTERNAL
bind, optionally including the identity string.  SASL EXTERNAL bind doesn't
get enabled implicitly when you configure a client cert, because ypldap
can't tell if it's required or supported by the server.  It's also not an
error to enable SASL EXTERNAL bind without a client cert, since you could be
connecting through stunnel or something.

To configure this in ypldap.conf, you'd do something like this:

directory "ldap.example.com" tls {
bindext "dn:CN=ypldap,OU,Accounts,DC=example,DC=com"
certfile "/etc/ssl/ypldap-cert.pem"
keyfile "/etc/ssl/private/ypldap-key.pem"

...
}

ok?

Index: aldap.c
===
RCS file: /cvs/src/usr.sbin/ypldap/aldap.c,v
retrieving revision 1.48
diff -u -p -r1.48 aldap.c
--- aldap.c 31 Mar 2022 09:06:55 -  1.48
+++ aldap.c 19 Sep 2022 11:47:13 -
@@ -220,6 +220,40 @@ fail:
 }
 
 int
+aldap_bind_sasl_external(struct aldap *ldap, char *bindid)
+{
+   struct ber_element *root = NULL, *elm;
+
+   if ((root = ober_add_sequence(NULL)) == NULL)
+   goto fail;
+
+   elm = ober_printf_elements(root, "d{tds{ts", ++ldap->msgid,
+   BER_CLASS_APP, LDAP_REQ_BIND, VERSION, "",
+   BER_CLASS_CONTEXT, LDAP_AUTH_SASL, LDAP_SASL_MECH_EXTERNAL);
+   if (bindid == NULL)
+   elm = ober_add_null(elm);
+   else
+   elm = ober_add_string(elm, bindid);
+
+   if (elm == NULL)
+   goto fail;
+
+   LDAP_DEBUG("aldap_bind_sasl_external", root);
+
+   if (aldap_send(ldap, root) == -1) {
+   root = NULL;
+   goto fail;
+   }
+   return (ldap->msgid);
+fail:
+   if (root != NULL)
+   ober_free_elements(root);
+
+   ldap->err = ALDAP_ERR_OPERATION_FAILED;
+   return (-1);
+}
+
+int
 aldap_unbind(struct aldap *ldap)
 {
struct ber_element *root = NULL, *elm;
Index: aldap.h
===
RCS file: /cvs/src/usr.sbin/ypldap/aldap.h,v
retrieving revision 1.14
diff -u -p -r1.14 aldap.h
--- aldap.h 11 May 2019 17:46:02 -  1.14
+++ aldap.h 19 Sep 2022 11:47:13 -
@@ -32,6 +32,8 @@
 #define LDAP_PAGED_OID "1.2.840.113556.1.4.319"
 #define LDAP_STARTTLS_OID  "1.3.6.1.4.1.1466.20037"
 
+#define LDAP_SASL_MECH_EXTERNAL "EXTERNAL"
+
 struct aldap {
 #define ALDAP_ERR_SUCCESS  0
 #define ALDAP_ERR_PARSER_ERROR 1
@@ -137,6 +139,7 @@ enum deref_aliases {
 
 enum authentication_choice {
LDAP_AUTH_SIMPLE= 0,
+   LDAP_AUTH_SASL  = 3,
 };
 
 enum scope {
@@ -222,6 +225,7 @@ void aldap_freemsg(struct aldap_messa
 int aldap_req_starttls(struct aldap *);
 
 int aldap_bind(struct aldap *, char *, char *);
+int aldap_bind_sasl_external(struct aldap *, char *);
 int aldap_unbind(struct aldap *);
int aldap_search(struct aldap *, char *, enum scope, char *, char **, int, int, int, struct aldap_page_control *);
 int aldap_get_errno(struct aldap *, const char **);
Index: ldapclient.c
===
RCS file: /cvs/src/usr.sbin/ypldap/ldapclient.c,v
retrieving revision 1.45
diff -u -p -r1.45 ldapclient.c
--- ldapclient.c22 Aug 2022 10:10:59 -  1.45
+++ ldapclient.c19 Sep 2022 11:47:13 -
@@ -635,7 +635,11 @@ client_try_idm(struct env *env, struct i
int rc;
 
where = "binding";
-   if (aldap_bind(al, idm->idm_binddn, idm->idm_bindcred) == -1)
+   if (idm->idm_bindext != 

Re: ure(4): add support for RTL8156B

2022-03-31 Thread Jonathan Matthew
On Thu, Mar 31, 2022 at 09:41:09PM +0800, Kevin Lo wrote:
> Hi,
>
> This diff adds preliminary support for RTL8156B to ure(4) and
> bug fixes for RTL8153/RTL8156.
> 
> Tested:
> ure0 at uhub0 port 12 configuration 1 interface 0 "Realtek USB 10/100/1G/2.5G LAN" rev 3.20/31.00 addr 3
> ure0: RTL8156B (0x7410), address 00:e0:4c:xx:xx:xx

Works OK here:

ure0 at uhub0 port 2 configuration 1 interface 0 "Realtek USB 10/100 LAN" rev 2.10/20.00 addr 2
ure0: RTL8152 (0x4c00), address 00:e0:4c:xx:xx:xx
rlphy0 at ure0 phy 0: RTL8201E 10/100 PHY, rev. 2

Regarding this part:

> @@ -1914,7 +2026,7 @@ ure_rxeof(struct usbd_xfer *xfer, void *
>   total_len -= roundup(pktlen, URE_RX_BUF_ALIGN);
>   buf += sizeof(rxhdr);
>  
> - m = m_devget(buf, pktlen, ETHER_ALIGN);
> + m = m_devget(buf, pktlen - ETHER_CRC_LEN, ETHER_ALIGN);
>   if (m == NULL) {
>   DPRINTF(("unable to allocate mbuf for next packet\n"));
>   ifp->if_ierrors++;

We tried this earlier (r1.22 of if_ure.c) and had to back it out because it
didn't work on some devices.  Have we worked out what the problem was there?



Re: fix very small ntpd leak

2022-03-23 Thread Jonathan Matthew
On Wed, Mar 23, 2022 at 04:59:06PM +0100, Otto Moerbeek wrote:
> On Wed, Mar 23, 2022 at 09:09:01PM +1000, Jonathan Matthew wrote:
> 
> > We noticed that the ntpd engine process was getting a bit big on some boxes
> > that we'd accidentally cut off from the ntp servers (routing is hard).
> > Reading through the code, I noticed the 'query' member of struct ntp_peer
> > is never freed, which seems to account for the leak.
> > 
> > If you have a server pool in ntpd.conf and it resolves, but ntpd is unable
> > to talk to the servers, it will re-resolve periodically, freeing the
> > old list of peers and creating new ones.
> > 
> > To show how slow the leak is, here's the leak report from MALLOC_OPTIONS=D
> > after running for about two hours with four servers from two pools.
> > 
> > without diff:
> >  
> > Leak report
> >              f     sum      #    avg
> >            0x0    9392    128     73
> >  0x889878b920b     512      1    512
> >  0x889878bc8e1    4096      4   1024
> >  0x889878bd065     128      2     64
> >  0x88bc91f0b4b   18280      1  18280
> >  0x88bc926a9ed   65536      1  65536
> >  
> >  
> > with diff:
> >  
> > Leak report
> >              f     sum      #    avg
> >            0x0    6064     16    379
> >  0xbee1253320b     512      1    512
> >  0xbf0265f4b4b   18280      1  18280
> >  0xbf02666e9ed   65536      1  65536
> > 
> > ok?
> > 
> > Index: ntp.c
> > ===
> > RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v
> > retrieving revision 1.168
> > diff -u -p -r1.168 ntp.c
> > --- ntp.c   24 Oct 2021 21:24:19 -  1.168
> > +++ ntp.c   23 Mar 2022 10:43:59 -
> > @@ -686,6 +686,7 @@ void
> >  peer_remove(struct ntp_peer *p)
> >  {
> > TAILQ_REMOVE(&conf->ntp_peers, p, entry);
> > +   free(p->query);
> > free(p);
> > peer_cnt--;
> >  }
> > 
> 
> This is a bug that dlg reported last week. Serendipity or not? :-)

We found it together looking at systems we run at work, so not really.

> 
> This is my diff that uses an approach I like a little bit better.

I agree.  I wasn't sure if there was a reason the query was allocated
separately, so I went with the more straightforward diff to start with.
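
Roughly, the difference between the two approaches (other members
elided):

        /* before: allocated separately, which peer_remove() forgot to free */
        struct ntp_peer {
                /* ... */
                struct ntp_query        *query;
        };

        /* Otto's version: embedded, so it is freed along with the peer */
        struct ntp_peer {
                /* ... */
                struct ntp_query         query;
        };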

> 
>   -Otto
> 
> Index: client.c
> ===
> RCS file: /cvs/src/usr.sbin/ntpd/client.c,v
> retrieving revision 1.116
> diff -u -p -r1.116 client.c
> --- client.c  21 Apr 2021 09:38:11 -  1.116
> +++ client.c  21 Mar 2022 07:31:54 -
> @@ -51,10 +51,9 @@ set_deadline(struct ntp_peer *p, time_t 
>  int
>  client_peer_init(struct ntp_peer *p)
>  {
> - if ((p->query = calloc(1, sizeof(struct ntp_query))) == NULL)
> - fatal("client_peer_init calloc");
> - p->query->fd = -1;
> - p->query->msg.status = MODE_CLIENT | (NTP_VERSION << 3);
> + p->query.fd = -1;
> + p->query.msg.status = MODE_CLIENT | (NTP_VERSION << 3);
> + p->query.xmttime = 0;
>   p->state = STATE_NONE;
>   p->shift = 0;
>   p->trustlevel = TRUSTLEVEL_PATHETIC;
> @@ -91,7 +90,7 @@ client_addr_init(struct ntp_peer *p)
>   }
>   }
>  
> - p->query->fd = -1;
> + p->query.fd = -1;
>   set_next(p, 0);
>  
>   return (0);
> @@ -100,9 +99,9 @@ client_addr_init(struct ntp_peer *p)
>  int
>  client_nextaddr(struct ntp_peer *p)
>  {
> - if (p->query->fd != -1) {
> - close(p->query->fd);
> - p->query->fd = -1;
> + if (p->query.fd != -1) {
> + close(p->query.fd);
> + p->query.fd = -1;
>   }
>  
>   if (p->state == STATE_DNS_INPROGRESS)
> @@ -148,26 +147,26 @@ client_query(struct ntp_peer *p)
>   if (p->state < STATE_DNS_DONE || p->addr == NULL)
>   return (-1);
>  
> - if (p->query->fd == -1) {
> + if (p->query.fd == -1) {
>   struct sockaddr *sa = (struct sockaddr *)&p->addr->ss;
>   struct sockaddr *qa4 = (struct sockaddr *)&p->query_addr4;
>   struct sockaddr *qa6 = (struct sockaddr *)&p->query_addr6;
>  
> - if ((p->query->fd = socket(p->addr->ss.ss_family, SOCK_DGRAM,
> + if ((p->query.fd = socket(p->addr->ss.ss_family, SOCK_DGRAM,
&

fix very small ntpd leak

2022-03-23 Thread Jonathan Matthew
We noticed that the ntpd engine process was getting a bit big on some boxes
that we'd accidentally cut off from the ntp servers (routing is hard).
Reading through the code, I noticed the 'query' member of struct ntp_peer
is never freed, which seems to account for the leak.

If you have a server pool in ntpd.conf and it resolves, but ntpd is unable
to talk to the servers, it will re-resolve periodically, freeing the old list
of peers and creating new ones.

To show how slow the leak is, here's the leak report from MALLOC_OPTIONS=D
after running for about two hours with four servers from two pools.

without diff:
 
Leak report
             f     sum      #    avg
           0x0    9392    128     73
 0x889878b920b     512      1    512
 0x889878bc8e1    4096      4   1024
 0x889878bd065     128      2     64
 0x88bc91f0b4b   18280      1  18280
 0x88bc926a9ed   65536      1  65536
 
 
with diff:
 
Leak report
             f     sum      #    avg
           0x0    6064     16    379
 0xbee1253320b     512      1    512
 0xbf0265f4b4b   18280      1  18280
 0xbf02666e9ed   65536      1  65536

ok?

Index: ntp.c
===
RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v
retrieving revision 1.168
diff -u -p -r1.168 ntp.c
--- ntp.c   24 Oct 2021 21:24:19 -  1.168
+++ ntp.c   23 Mar 2022 10:43:59 -
@@ -686,6 +686,7 @@ void
 peer_remove(struct ntp_peer *p)
 {
TAILQ_REMOVE(&conf->ntp_peers, p, entry);
+   free(p->query);
free(p);
peer_cnt--;
 }



Re: ping icmp ident collisions

2022-02-20 Thread Jonathan Matthew
On Fri, Feb 18, 2022 at 04:03:28PM +0100, Florian Obser wrote:
> On 2022-02-18 12:17 +10, Jonathan Matthew  wrote:
> > The only thing ping uses to determine whether a received icmp echo
> > reply packet is a response to one of its requests is the 16 bit icmp
> > ident field.  If you ping enough stuff at the same time, eventually
> > you'll have two concurrent pings using the same ident, and they will
> > both see each other's replies.  Since we do tricky MAC stuff on the
> > ping payload, this results in signature mismatches that look like this:
> >
> > PING 172.23.94.210 (172.23.94.210): 56 data bytes
> > 64 bytes from 172.23.94.210: icmp_seq=0 ttl=253 time=0.820 ms
> > 64 bytes from 172.23.94.210: icmp_seq=1 ttl=253 time=0.419 ms
> > 64 bytes from 172.23.94.210: icmp_seq=2 ttl=253 time=0.369 ms
> > signature mismatch!
> > 64 bytes from 172.23.94.210: icmp_seq=3 ttl=253 time=0.273 ms
> >
> > --- 172.23.94.210 ping statistics ---
> > 4 packets transmitted, 5 packets received, -- somebody's duplicating packets!
> > round-trip min/avg/max/std-dev = 0.273/0.376/0.820/0.265 ms
> >
> > ping is counting the packet with the signature mismatch as a reply it
> > received, and it prints a misleading message about duplicated packets
> > because it got more replies than the number of requests it sent.
> >
> > I think it would be more helpful not to count signature mismatch
> > packets as replies.  If you're actually getting corrupted replies,
> > I'd say that's more like packet loss than normal operation.  If
> > you're getting extra replies due to ident collisions, this will
> > result in ping sending and receiving the expected number of packets.
> >
> > Printing the source address and sequence number on signature
> > mismatches would also help.  I would have figured this out much
> > quicker had ping told me the mismatch packets were from a completely
> > different source.  For example:
> >
> > PING 172.23.94.210 (172.23.94.210): 56 data bytes
> > 64 bytes from 172.23.94.210: icmp_seq=0 ttl=253 time=2.645 ms
> > 64 bytes from 172.23.94.210: icmp_seq=1 ttl=253 time=1.360 ms
> > 64 bytes from 172.23.94.210: icmp_seq=2 ttl=253 time=0.506 ms
> > 64 bytes from 172.23.94.210: icmp_seq=3 ttl=253 time=0.615 ms
> > signature mismatch from 10.138.79.45: icmp_seq=0
> > 64 bytes from 172.23.94.210: icmp_seq=4 ttl=253 time=0.431 ms
> >
> > --- 172.23.94.210 ping statistics ---
> > 5 packets transmitted, 5 packets received, 0.0% packet loss
> > round-trip min/avg/max/std-dev = 0.431/1.111/2.645/0.835 ms
> >
> > ok?
> 
> OK florian
> 
> I think we can go further and also check the from address in the echo
> reply case, like this.
> 
> If something on the path is so confused as to answer to our pings with
> the wrong source address I think it's tcpdump time...
> 
> Feel free to put this in at the same time if you agree.

I considered doing this, but I think I'd rather have ping print out
anything it sees with the same ident, as long as it doesn't get confused
and mess up its statistics.

> 
> diff --git sbin/ping/ping.c sbin/ping/ping.c
> index 6fa634bca3e..e47baa8912c 100644
> --- sbin/ping/ping.c
> +++ sbin/ping/ping.c
> @@ -181,6 +181,9 @@ char *hostname;
>  int ident;   /* random number to identify our packets */
>  int v6flag;  /* are we ping6? */
>  
> +struct sockaddr_in dst4;
> +struct sockaddr_in6 dst6;
> +
>  /* counters */
>  int64_t npackets;/* max packets to transmit */
>  int64_t nreceived;   /* # of packets we got back */
> @@ -243,8 +246,8 @@ main(int argc, char *argv[])
>   struct addrinfo hints, *res;
>   struct itimerval itimer;
>   struct sockaddr *from, *dst;
> - struct sockaddr_in from4, dst4;
> - struct sockaddr_in6 from6, dst6;
> + struct sockaddr_in from4;
> + struct sockaddr_in6 from6;
>   struct cmsghdr *scmsg = NULL;
>   struct in6_pktinfo *pktinfo = NULL;
>   struct icmp6_filter filt;
> @@ -1285,6 +1288,13 @@ pr_pack(u_char *buf, int cc, struct msghdr *mhdr)
>   }
>  
>   if (echo_reply) {
> + if (v6flag) {
> + if (memcmp(&dst6, from, sizeof(dst6)) != 0)
> + return; /* 'Twas not our ECHO */
> + } else {
> + if (memcmp(&dst4, from, sizeof(dst4)) != 0)
> + return; /* 'Twas not our

ping icmp ident collisions

2022-02-17 Thread Jonathan Matthew
The only thing ping uses to determine whether a received icmp echo reply
packet is a response to one of its requests is the 16 bit icmp ident
field.  If you ping enough stuff at the same time, eventually you'll have
two concurrent pings using the same ident, and they will both see each
other's replies.  Since we do tricky MAC stuff on the ping payload, this
results in signature mismatches that look like this:

PING 172.23.94.210 (172.23.94.210): 56 data bytes
64 bytes from 172.23.94.210: icmp_seq=0 ttl=253 time=0.820 ms
64 bytes from 172.23.94.210: icmp_seq=1 ttl=253 time=0.419 ms
64 bytes from 172.23.94.210: icmp_seq=2 ttl=253 time=0.369 ms
signature mismatch!
64 bytes from 172.23.94.210: icmp_seq=3 ttl=253 time=0.273 ms

--- 172.23.94.210 ping statistics ---
4 packets transmitted, 5 packets received, -- somebody's duplicating packets!
round-trip min/avg/max/std-dev = 0.273/0.376/0.820/0.265 ms

ping is counting the packet with the signature mismatch as a reply it
received, and it prints a misleading message about duplicated packets
because it got more replies than the number of requests it sent.

I think it would be more helpful not to count signature mismatch packets
as replies.  If you're actually getting corrupted replies, I'd say that's
more like packet loss than normal operation.  If you're getting extra
replies due to ident collisions, this will result in ping sending and
receiving the expected number of packets.

Printing the source address and sequence number on signature mismatches
would also help.  I would have figured this out much quicker had ping
told me the mismatch packets were from a completely different source.
For example:

PING 172.23.94.210 (172.23.94.210): 56 data bytes
64 bytes from 172.23.94.210: icmp_seq=0 ttl=253 time=2.645 ms
64 bytes from 172.23.94.210: icmp_seq=1 ttl=253 time=1.360 ms
64 bytes from 172.23.94.210: icmp_seq=2 ttl=253 time=0.506 ms
64 bytes from 172.23.94.210: icmp_seq=3 ttl=253 time=0.615 ms
signature mismatch from 10.138.79.45: icmp_seq=0
64 bytes from 172.23.94.210: icmp_seq=4 ttl=253 time=0.431 ms

--- 172.23.94.210 ping statistics ---
5 packets transmitted, 5 packets received, 0.0% packet loss
round-trip min/avg/max/std-dev = 0.431/1.111/2.645/0.835 ms

ok?


Index: ping.c
===
RCS file: /cvs/src/sbin/ping/ping.c,v
retrieving revision 1.245
diff -u -p -r1.245 ping.c
--- ping.c  12 Jul 2021 15:09:19 -  1.245
+++ ping.c  18 Feb 2022 01:52:22 -
@@ -1302,7 +1302,10 @@ pr_pack(u_char *buf, int cc, struct msgh
 
if (timingsafe_memcmp(mac, &payload.mac,
sizeof(mac)) != 0) {
-   printf("signature mismatch!\n");
+   printf("signature mismatch from %s: "
+   "icmp_seq=%u\n", pr_addr(from, fromlen),
+   ntohs(seq));
+   --nreceived;
return;
}
timinginfo=1;



mpsafe dwxe(4)

2022-01-03 Thread Jonathan Matthew
This is almost identical to the changes I made to dwge(4) recently, since
these drivers are very closely related.  Unfortunately the only machine I
have with dwxe(4) in it is armv7, so I can't test this properly, but it
does still work there.

Could someone with an arm64 allwinner board try this out more extensively?


Index: if_dwxe.c
===
RCS file: /cvs/src/sys/dev/fdt/if_dwxe.c,v
retrieving revision 1.19
diff -u -p -r1.19 if_dwxe.c
--- if_dwxe.c   24 Oct 2021 17:52:26 -  1.19
+++ if_dwxe.c   3 Jan 2022 11:21:19 -
@@ -275,6 +275,7 @@ struct dwxe_softc {
bus_space_tag_t sc_iot;
bus_space_handle_t  sc_ioh;
bus_dma_tag_t   sc_dmat;
+   void*sc_ih;
 
struct arpcom   sc_ac;
 #define sc_lladdr  sc_ac.ac_enaddr
@@ -287,7 +288,6 @@ struct dwxe_softc {
struct dwxe_buf *sc_txbuf;
struct dwxe_desc*sc_txdesc;
int sc_tx_prod;
-   int sc_tx_cnt;
int sc_tx_cons;
 
struct dwxe_dmamem  *sc_rxring;
@@ -322,7 +322,7 @@ uint32_t dwxe_read(struct dwxe_softc *, 
 void   dwxe_write(struct dwxe_softc *, bus_addr_t, uint32_t);
 
 int    dwxe_ioctl(struct ifnet *, u_long, caddr_t);
-void   dwxe_start(struct ifnet *);
+void   dwxe_start(struct ifqueue *);
 void   dwxe_watchdog(struct ifnet *);
 
 intdwxe_media_change(struct ifnet *);
@@ -345,7 +345,7 @@ void    dwxe_rx_proc(struct dwxe_softc *);
 void   dwxe_up(struct dwxe_softc *);
 void   dwxe_down(struct dwxe_softc *);
 void   dwxe_iff(struct dwxe_softc *);
-int    dwxe_encap(struct dwxe_softc *, struct mbuf *, int *);
+int    dwxe_encap(struct dwxe_softc *, struct mbuf *, int *, int *);
 
 void   dwxe_reset(struct dwxe_softc *);
 void   dwxe_stop_dma(struct dwxe_softc *);
@@ -431,8 +431,9 @@ dwxe_attach(struct device *parent, struc
ifp = &sc->sc_ac.ac_if;
ifp->if_softc = sc;
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+   ifp->if_xflags = IFXF_MPSAFE;
ifp->if_ioctl = dwxe_ioctl;
-   ifp->if_start = dwxe_start;
+   ifp->if_qstart = dwxe_start;
ifp->if_watchdog = dwxe_watchdog;
ifq_set_maxlen(&ifp->if_snd, DWXE_NTXDESC - 1);
bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ);
@@ -460,8 +461,10 @@ dwxe_attach(struct device *parent, struc
if_attach(ifp);
ether_ifattach(ifp);
 
-   fdt_intr_establish(faa->fa_node, IPL_NET, dwxe_intr, sc,
-   sc->sc_dev.dv_xname);
+   sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE,
+   dwxe_intr, sc, sc->sc_dev.dv_xname);
+   if (sc->sc_ih == NULL)
+   printf("%s: can't establish interrupt\n", sc->sc_dev.dv_xname);
 }
 
 void
@@ -584,11 +587,12 @@ dwxe_lladdr_write(struct dwxe_softc *sc)
 }
 
 void
-dwxe_start(struct ifnet *ifp)
+dwxe_start(struct ifqueue *ifq)
 {
+   struct ifnet *ifp = ifq->ifq_if;
struct dwxe_softc *sc = ifp->if_softc;
struct mbuf *m;
-   int error, idx;
+   int error, idx, left, used;
 
if (!(ifp->if_flags & IFF_RUNNING))
return;
@@ -600,27 +604,29 @@ dwxe_start(struct ifnet *ifp)
return;
 
idx = sc->sc_tx_prod;
-   while ((sc->sc_txdesc[idx].sd_status & DWXE_TX_DESC_CTL) == 0) {
-   m = ifq_deq_begin(&ifp->if_snd);
-   if (m == NULL)
+   left = sc->sc_tx_cons;
+   if (left <= idx)
+   left += DWXE_NTXDESC;
+   left -= idx;
+   used = 0;
+
+   for (;;) {
+   if (used + DWXE_NTXSEGS + 1 > left) {
+   ifq_set_oactive(ifq);
break;
+   }
 
-   error = dwxe_encap(sc, m, &idx);
-   if (error == ENOBUFS) {
-   ifq_deq_rollback(&ifp->if_snd, m);
-   ifq_set_oactive(&ifp->if_snd);
+   m = ifq_dequeue(ifq);
+   if (m == NULL)
break;
-   }
+
+   error = dwxe_encap(sc, m, &idx, &used);
if (error == EFBIG) {
-   ifq_deq_commit(&ifp->if_snd, m);
m_freem(m); /* give up: drop it */
ifp->if_oerrors++;
continue;
}
 
-   /* Now we are committed to transmit the packet. */
-   ifq_deq_commit(&ifp->if_snd, m);
-
 #if NBPFILTER > 0
if (ifp->if_bpf)
bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
@@ -632,6 +638,9 @@ dwxe_start(struct ifnet *ifp)
 
/* Set a timeout in case the chip goes out to lunch. */
ifp->if_timer = 5;
+
+   dwxe_write(sc, DWXE_TX_CTL1, dwxe_read(sc,
+           DWXE_TX_CTL1) | DWXE_TX_CTL1_TX_DMA_START);
}
 }
 

Re: fix ldapd bug when removing last attribute

2021-12-19 Thread Jonathan Matthew
On Sun, Dec 19, 2021 at 01:31:24PM +0100, Claudio Jeker wrote:
> In LDAP there is two ways to remove an attribute.
> One can remove an attribute by just naming the attribute but it is also
> possible to remove a specific attribute: value combo.
> 
> In ldapd the latter is broken if the last attribute is removed because
> the result of ldap_del_values() is an invalid encoding (empty sequence)
> and with that the modification fails because validate_entry() fails.
> The error is LDAP_INVALID_SYNTAX and I have noticed that in tools like
> shelldap multiple times but never really connected the dots until now.
> 
> This is the minimal way of solving this. If ldap_del_values()
> removes the last element use ldap_del_attribute() to remove the attribute
> but to make this work the ober_scanf_elements() format has to be relaxed
> since what we remove no longer parses with "{s(". Is this an acceptable
> solution?

I think so, ldapd doesn't seem to check the attribute values consistently
except where it actually has to look at them, so relaxing the check here
seems fine to me.
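
For reference, the two delete forms look roughly like this in LDIF (the
entry and attribute are made up):

        # remove the attribute with all its values
        dn: uid=jdoe,ou=people,dc=example,dc=com
        changetype: modify
        delete: mail

        # remove one specific value; deleting the last remaining value
        # this way is the case that used to fail
        dn: uid=jdoe,ou=people,dc=example,dc=com
        changetype: modify
        delete: mail
        mail: jdoe@example.com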

one note below, but ok jmatthew@

> 
> -- 
> :wq Claudio
> 
> Index: attributes.c
> ===
> RCS file: /cvs/src/usr.sbin/ldapd/attributes.c,v
> retrieving revision 1.6
> diff -u -p -r1.6 attributes.c
> --- attributes.c  24 Oct 2019 12:39:26 -  1.6
> +++ attributes.c  19 Dec 2021 12:12:48 -
> @@ -181,7 +181,7 @@ ldap_del_attribute(struct ber_element *e
>  
>   attr = entry->be_sub;
>   while (attr) {
> - if (ober_scanf_elements(attr, "{s(", &s) != 0) {
> + if (ober_scanf_elements(attr, "{s", &s) != 0) {
>   log_warnx("failed to parse attribute");
>   return -1;
>   }
> @@ -240,6 +240,9 @@ ldap_del_values(struct ber_element *elm,
>   prev = v;
>   }
>   }
> +
> + if (old_vals->be_sub == NULL)
> + return 1;
>  
>   return 0;
>  }
> Index: modify.c
> ===
> RCS file: /cvs/src/usr.sbin/ldapd/modify.c,v
> retrieving revision 1.23
> diff -u -p -r1.23 modify.c
> --- modify.c  24 Oct 2019 12:39:26 -  1.23
> +++ modify.c  19 Dec 2021 12:20:19 -
> @@ -334,7 +334,8 @@ ldap_modify(struct request *req)
>*/
>   if (vals->be_sub &&
>   vals->be_sub->be_type == BER_TYPE_OCTETSTRING) {
> - ldap_del_values(a, vals);
> + if (ldap_del_values(a, vals) == 1)
> + ldap_del_attribute(entry, attr);
>   } else {
>   ldap_del_attribute(entry, attr);
>   }
> Index: validate.c
> ===
> RCS file: /cvs/src/usr.sbin/ldapd/validate.c,v
> retrieving revision 1.12
> diff -u -p -r1.12 validate.c
> --- validate.c24 Oct 2019 12:39:26 -  1.12
> +++ validate.c19 Dec 2021 11:42:48 -
> @@ -313,6 +313,7 @@ validate_entry(const char *dn, struct be
>   objclass = objclass->be_next;   /* skip attribute description */
>   for (a = objclass->be_sub; a != NULL; a = a->be_next) {
>   if (ober_get_string(a, &s) != 0) {
> + log_debug("bad ObjectClass encoding");
>   rc = LDAP_INVALID_SYNTAX;
>   goto done;
>   }
> @@ -396,6 +397,7 @@ validate_entry(const char *dn, struct be
>*/
>   for (a = entry->be_sub; a != NULL; a = a->be_next) {
>   if (ober_scanf_elements(a, "{se{", &s, &vals) != 0) {
> + log_debug("bad attribue encoding");

misspelled 'attribute' here.

>   rc = LDAP_INVALID_SYNTAX;
>   goto done;
>   }
> 



fix ldapd unveil

2021-12-14 Thread Jonathan Matthew
ldapd currently can't reopen its database files, because it always passes
O_CREAT to open() when reopening (see ldapd_open_request()), which means it
needs the unveil 'c' flag.  This may have been missed when ldapd was unveiled
because 'ldapctl compact' was broken (see other diff).
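
A minimal sketch of the failure mode, with a hypothetical database name:

        if (unveil(datadir, "rw") == -1)        /* no 'c' flag */
                err(1, "unveil %s", datadir);

        /* the reopen path always passes O_CREAT, so this fails under
         * unveil even though the file already exists */
        fd = open("/var/db/ldap/users.db", O_RDWR | O_CREAT, 0600);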

ok?


Index: ldapd.c
===
RCS file: /cvs/src/usr.sbin/ldapd/ldapd.c,v
retrieving revision 1.29
diff -u -p -r1.29 ldapd.c
--- ldapd.c 14 Jul 2021 13:33:57 -  1.29
+++ ldapd.c 15 Dec 2021 03:42:04 -
@@ -243,7 +243,7 @@ main(int argc, char *argv[])
err(1, "unveil %s.db", _PATH_LOGIN_CONF);
if (unveil(_PATH_AUTHPROGDIR, "x") == -1)
err(1, "unveil %s", _PATH_AUTHPROGDIR);
-   if (unveil(datadir, "rw") == -1)
+   if (unveil(datadir, "rwc") == -1)
err(1, "unveil %s", datadir);
if (unveil(NULL, NULL) == -1)
err(1, "unveil");



fix ldapctl compact and index operations

2021-12-14 Thread Jonathan Matthew
r1.5 of ldapctl.c accidentally inverted the conditionals meant to skip
compacting or indexing namespaces with referrals.

ok?

Index: ldapctl.c
===
RCS file: /cvs/src/usr.sbin/ldapctl/ldapctl.c,v
retrieving revision 1.15
diff -u -p -u -p -r1.15 ldapctl.c
--- ldapctl.c   15 Jan 2021 18:57:04 -  1.15
+++ ldapctl.c   15 Dec 2021 03:29:36 -
@@ -128,8 +128,8 @@ compact_namespaces(const char *datadir)
struct namespace*ns;
 
TAILQ_FOREACH(ns, &conf->namespaces, next) {
-   if (SLIST_EMPTY(&ns->referrals))
-   continue;
+   if (!SLIST_EMPTY(&ns->referrals))
+   continue;
if (compact_namespace(ns, datadir) != 0)
return -1;
}
@@ -224,7 +224,7 @@ index_namespaces(const char *datadir)
struct namespace*ns;
 
TAILQ_FOREACH(ns, &conf->namespaces, next) {
-   if (SLIST_EMPTY(&ns->referrals))
+   if (!SLIST_EMPTY(&ns->referrals))
continue;
if (index_namespace(ns, datadir) != 0)
return -1;



mpsafe dwge(4)

2021-11-28 Thread Jonathan Matthew
This applies our normal strategies for making network drivers mpsafe, and
also writes to GMAC_TX_POLL_DEMAND once per call to dwge_start() rather than
once per packet, and returns rx slots once per interrupt rather than once
per packet.

I've tested this on a rockpro64, where it makes tcpbench etc. a bit faster.
I think I have an armv7 board with dwge(4) somewhere, but I haven't tested it
there yet.
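
The doorbell part of the change looks roughly like this (error handling
trimmed, register write value illustrative):

        idx = sc->sc_tx_prod;
        used = 0;

        for (;;) {
                m = ifq_dequeue(ifq);
                if (m == NULL)
                        break;
                if (dwge_encap(sc, m, &idx, &used) != 0) {
                        m_freem(m);     /* drop what we can't map */
                        continue;
                }
        }

        if (used > 0) {
                sc->sc_tx_prod = idx;
                /* one poll demand per batch instead of one per packet */
                dwge_write(sc, GMAC_TX_POLL_DEMAND, 0xffffffff);
        }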

ok?

Index: if_dwge.c
===
RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v
retrieving revision 1.12
diff -u -p -r1.12 if_dwge.c
--- if_dwge.c   24 Oct 2021 17:52:26 -  1.12
+++ if_dwge.c   28 Nov 2021 09:36:56 -
@@ -234,6 +234,7 @@ struct dwge_softc {
bus_space_tag_t sc_iot;
bus_space_handle_t  sc_ioh;
bus_dma_tag_t   sc_dmat;
+   void*sc_ih;
 
struct arpcom   sc_ac;
 #define sc_lladdr  sc_ac.ac_enaddr
@@ -247,7 +248,6 @@ struct dwge_softc {
struct dwge_buf *sc_txbuf;
struct dwge_desc*sc_txdesc;
int sc_tx_prod;
-   int sc_tx_cnt;
int sc_tx_cons;
 
struct dwge_dmamem  *sc_rxring;
@@ -289,7 +289,7 @@ uint32_t dwge_read(struct dwge_softc *, 
 void   dwge_write(struct dwge_softc *, bus_addr_t, uint32_t);
 
 int    dwge_ioctl(struct ifnet *, u_long, caddr_t);
-void   dwge_start(struct ifnet *);
+void   dwge_start(struct ifqueue *);
 void   dwge_watchdog(struct ifnet *);
 
 intdwge_media_change(struct ifnet *);
@@ -312,7 +312,7 @@ void    dwge_rx_proc(struct dwge_softc *);
 void   dwge_up(struct dwge_softc *);
 void   dwge_down(struct dwge_softc *);
 void   dwge_iff(struct dwge_softc *);
-int    dwge_encap(struct dwge_softc *, struct mbuf *, int *);
+int    dwge_encap(struct dwge_softc *, struct mbuf *, int *, int *);
 
 void   dwge_reset(struct dwge_softc *);
 void   dwge_stop_dma(struct dwge_softc *);
@@ -422,8 +422,9 @@ dwge_attach(struct device *parent, struc
ifp = &sc->sc_ac.ac_if;
ifp->if_softc = sc;
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+   ifp->if_xflags = IFXF_MPSAFE;
ifp->if_ioctl = dwge_ioctl;
-   ifp->if_start = dwge_start;
+   ifp->if_qstart = dwge_start;
ifp->if_watchdog = dwge_watchdog;
ifq_set_maxlen(&ifp->if_snd, DWGE_NTXDESC - 1);
bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ);
@@ -535,8 +536,10 @@ dwge_attach(struct device *parent, struc
dwge_write(sc, GMAC_MMC_TX_INT_MSK, 0x);
dwge_write(sc, GMAC_MMC_IPC_INT_MSK, 0x);
 
-   fdt_intr_establish(faa->fa_node, IPL_NET, dwge_intr, sc,
-   sc->sc_dev.dv_xname);
+   sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE,
+   dwge_intr, sc, sc->sc_dev.dv_xname);
+   if (sc->sc_ih == NULL)
+   printf("%s: can't establish interrupt\n", sc->sc_dev.dv_xname);
 }
 
 void
@@ -612,11 +615,12 @@ dwge_lladdr_write(struct dwge_softc *sc)
 }
 
 void
-dwge_start(struct ifnet *ifp)
+dwge_start(struct ifqueue *ifq)
 {
+   struct ifnet *ifp = ifq->ifq_if;
struct dwge_softc *sc = ifp->if_softc;
struct mbuf *m;
-   int error, idx;
+   int error, idx, left, used;
 
if (!(ifp->if_flags & IFF_RUNNING))
return;
@@ -628,27 +632,29 @@ dwge_start(struct ifnet *ifp)
return;
 
idx = sc->sc_tx_prod;
-   while ((sc->sc_txdesc[idx].sd_status & TDES0_OWN) == 0) {
-   m = ifq_deq_begin(&ifp->if_snd);
-   if (m == NULL)
+   left = sc->sc_tx_cons;
+   if (left <= idx)
+   left += DWGE_NTXDESC;
+   left -= idx;
+   used = 0;
+
+   for (;;) {
+   if (used + DWGE_NTXSEGS + 1 > left) {
+   ifq_set_oactive(ifq);
break;
+   }
 
-   error = dwge_encap(sc, m, &idx);
-   if (error == ENOBUFS) {
-   ifq_deq_rollback(&ifp->if_snd, m);
-   ifq_set_oactive(&ifp->if_snd);
+   m = ifq_dequeue(ifq);
+   if (m == NULL)
break;
-   }
+
+   error = dwge_encap(sc, m, &idx, &used);
if (error == EFBIG) {
-   ifq_deq_commit(&ifp->if_snd, m);
m_freem(m); /* give up: drop it */
ifp->if_oerrors++;
continue;
}
 
-   /* Now we are committed to transmit the packet. */
-   ifq_deq_commit(&ifp->if_snd, m);
-
 #if NBPFILTER > 0
if (ifp->if_bpf)
bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
@@ -660,6 +666,8 @@ dwge_start(struct ifnet *ifp)
 
/* Set a timeout in case the chip goes out to lunch. */
ifp->if_timer =

Re: ixl(4): add rx/tx checksum offloading

2021-11-08 Thread Jonathan Matthew
On Tue, Oct 26, 2021 at 02:52:10PM +0200, Jan Klemkow wrote:
> On Tue, Oct 26, 2021 at 05:17:55PM +1000, Jonathan Matthew wrote:
> > First of all, thanks for looking at this, I forgot we hadn't done offloads
> > for ixl(4) yet.
> 
> You're welcome.
> 
> > In the case of ixl(4), the driver has to tell the nic the length of
> > each of the packet headers, so it should also be tested with vlan
> > interfaces.
> > 
> > I think ixl_tx_setup_offload() needs to account for outgoing
> > vlan-tagged packets.
> 
> Yes, it should.  I just want to keep this diff small for now.  I plan to
> implement handling of vlan tags in a later diff.  The code just stops
> processing the offload and returns, if the stack tries to send out a
> vlan taged ethernet frame in the switch-statement at the beginning.
> 
> So, with vlan tags we just don't offload checksumming at the moment.

and it turns out vlan interfaces don't allow checksum offload unless the
parent interface does vlan tagging too, so this doesn't matter until
that's implemented.  I think I forget this every time.

> 
> I also tested this scenario.
> 
> > It currently assumes the ethernet header is ETHER_HDR_LEN bytes long,
> > which isn't always true.  See ixgbe_tx_ctx_setup()
> > (sys/dev/pci/if_ix.c) for an example of a driver that takes this into
> > account.
> 
> I already looked at this code and will adapt vlan tagging later, if this
> is OK for you?

It'd probably be simpler to do vlan tagging first, so checksum offload
could be done all at once, but since we're here already, ok jmatthew@

> 
> Thanks,
> Jan
> 
> > > Index: dev/pci/if_ixl.c
> > > ===
> > > RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v
> > > retrieving revision 1.75
> > > diff -u -p -r1.75 if_ixl.c
> > > --- dev/pci/if_ixl.c  23 Jul 2021 00:29:14 -  1.75
> > > +++ dev/pci/if_ixl.c  25 Oct 2021 15:11:46 -
> > > @@ -82,6 +82,10 @@
> > >  #endif
> > >  
> > >  #include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > >  #include 
> > >  
> > >  #include 
> > > @@ -1388,6 +1392,7 @@ static int  ixl_rxeof(struct ixl_softc *,
> > >  static void  ixl_rxfill(struct ixl_softc *, struct ixl_rx_ring *);
> > >  static void  ixl_rxrefill(void *);
> > >  static int   ixl_rxrinfo(struct ixl_softc *, struct if_rxrinfo *);
> > > +static void  ixl_rx_checksum(struct mbuf *, uint64_t);
> > >  
> > >  #if NKSTAT > 0
> > >  static void  ixl_kstat_attach(struct ixl_softc *);
> > > @@ -1942,9 +1947,9 @@ ixl_attach(struct device *parent, struct
> > >   ifp->if_capabilities = IFCAP_VLAN_MTU;
> > >  #if 0
> > >   ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
> > > - ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 |
> > > - IFCAP_CSUM_UDPv4;
> > >  #endif
> > > + ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 |
> > > + IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> > >  
> > >   ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
> > >  
> > > @@ -2772,6 +2777,69 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
> > >  }
> > >  
> > >  static void
> > > +ixl_tx_setup_offload(struct mbuf *mp, uint64_t *cmd)
> > > +{
> > > + uint64_t ip_hdr_len;
> > > + int  ipoff = ETHER_HDR_LEN;
> > > + uint8_t  ipproto;
> > > + struct ip   *ip;
> > > +#ifdef INET6
> > > + struct ip6_hdr  *ip6;
> > > +#endif
> > > + struct tcphdr   *th;
> > > + struct mbuf *m;
> > > +
> > > + switch (ntohs(mtod(mp, struct ether_header *)->ether_type)) {
> > > + case ETHERTYPE_IP:
> > > + if (mp->m_pkthdr.len < ETHER_HDR_LEN + sizeof(*ip))
> > > + return;
> > > + m = m_getptr(mp, ETHER_HDR_LEN, &ipoff);
> > > + KASSERT(m != NULL && m->m_len - ipoff >= sizeof(*ip));
> > > + ip = (struct ip *)(m->m_data + ipoff);
> > > +
> > > + if (mp->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
> > > + *cmd |= IXL_TX_DESC_CMD_IIPT_IPV4_CSUM;
> > > + else
> > > + *cmd |= IXL_TX_DESC_CMD_IIPT_IPV4;
> > > +
> > > + 

Re: ixl(4): add rx/tx checksum offloading

2021-10-26 Thread Jonathan Matthew
Hi Jan,

First of all, thanks for looking at this, I forgot we hadn't done offloads
for ixl(4) yet.

On Mon, Oct 25, 2021 at 05:27:28PM +0200, Jan Klemkow wrote:
> On Fri, Oct 22, 2021 at 03:39:01PM +0200, Hrvoje Popovski wrote:
> > On 22.10.2021. 13:39, Jan Klemkow wrote:
> > > That's because you only see these flags if the checksum offloading is
> > > enabled for "sending".  I'm still working/debugging on the sending side.
> > > Thus, I just sent a diff with the receiving part for now.
> > > 
> > > You can see if it's working for your card with the netstat(8) statistics.
> > > 
> > > # netstat -s | grep software-checksummed
> > > 
> > > These counters should not rise much on the receive side if you put some
> > > traffic over the interface.
> > 
> > Thank you for explanation...
> > 
> > I'm sending 8 tcp streams with iperf3 from some box to openbsd ixl box
> > and here are results:
> > 
> > without diff
> > smc24# netstat -s | grep software-checksummed
> > 5039250 input datagrams software-checksummed
> > 2592718 output datagrams software-checksummed
> > 2592709 packets software-checksummed
> > 5039250 packets software-checksummed
> > 0 input packets software-checksummed
> > 0 output packets software-checksummed
> > 
> > cca 6.12 Gbits/sec
> > 
> > 
> > 
> > with diff
> > smc24# netstat -s | grep software-checksummed
> > 0 input datagrams software-checksummed
> > 2956546 output datagrams software-checksummed
> > 2956537 packets software-checksummed
> > 0 packets software-checksummed
> > 0 input packets software-checksummed
> > 0 output packets software-checksummed
> > 
> > cca 6.70 Gbits/sec
> > 
> > are result like those expected?
> > 
> > is forwarding testing any good for checksum offload diffs?
> 
> Hi Hrvoje,
> 
> Thanks a lot for your big testing efforts!
> 
> In the case of forwarding, the forwarding box just checks the IPv4 header
> checksum and ignores the UDP/TCP header.  Your setup from one box to
> another is fine.
> 
> Here is a new diff, which also includes send checksum offloading.
> Thus, all software-checksummed numbers should stay low in both
> directions.
> 
> Could you test this diff with your ospf{6}d and NFS tests?
> If you see IPv4 fragments in the ospf and NFS traffic within tcpdump(8),
> your test should find the bugs pointed out by deraadt@ and claudio@.

In the case of ixl(4), the driver has to tell the nic the length of each of the
packet headers, so it should also be tested with vlan interfaces.

I think ixl_tx_setup_offload() needs to account for outgoing vlan-tagged
packets.  It currently assumes the ethernet header is ETHER_HDR_LEN bytes
long, which isn't always true.  See ixgbe_tx_ctx_setup()
(sys/dev/pci/if_ix.c) for an example of a driver that takes this into
account.
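
For reference, the vlan-aware part of that calculation in ix(4) boils
down to something like this (a from-memory sketch, not the exact code;
the m_len bounds checks are elided):

        struct ether_header *eh = mtod(mp, struct ether_header *);
        int ehdrlen;

        if (eh->ether_type == htons(ETHERTYPE_VLAN))
                ehdrlen = ETHER_HDR_LEN + EVL_ENCAPLEN;
        else
                ehdrlen = ETHER_HDR_LEN;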


> 
> You can provoke large NFS packets with the following options on your NFS
> mount point.
> 
> server:/export /mnt nfs ro,intr,-r65536,-w65536
> 
> Thanks,
> Jan
> 
> Index: dev/pci/if_ixl.c
> ===
> RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v
> retrieving revision 1.75
> diff -u -p -r1.75 if_ixl.c
> --- dev/pci/if_ixl.c  23 Jul 2021 00:29:14 -  1.75
> +++ dev/pci/if_ixl.c  25 Oct 2021 15:11:46 -
> @@ -82,6 +82,10 @@
>  #endif
>  
>  #include 
> +#include 
> +#include 
> +#include 
> +#include 
>  #include 
>  
>  #include 
> @@ -1388,6 +1392,7 @@ static int  ixl_rxeof(struct ixl_softc *,
>  static void  ixl_rxfill(struct ixl_softc *, struct ixl_rx_ring *);
>  static void  ixl_rxrefill(void *);
>  static int   ixl_rxrinfo(struct ixl_softc *, struct if_rxrinfo *);
> +static void  ixl_rx_checksum(struct mbuf *, uint64_t);
>  
>  #if NKSTAT > 0
>  static void  ixl_kstat_attach(struct ixl_softc *);
> @@ -1942,9 +1947,9 @@ ixl_attach(struct device *parent, struct
>   ifp->if_capabilities = IFCAP_VLAN_MTU;
>  #if 0
>   ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
> - ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 |
> - IFCAP_CSUM_UDPv4;
>  #endif
> + ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 |
> + IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
>  
>   ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
>  
> @@ -2772,6 +2777,69 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
>  }
>  
>  static void
> +ixl_tx_setup_offload(struct mbuf *mp, uint64_t *cmd)
> +{
> + uint64_t ip_hdr_len;
> + int  ipoff = ETHER_HDR_LEN;
> + uint8_t  ipproto;
> + struct ip   *ip;
> +#ifdef INET6
> + struct ip6_hdr  *ip6;
> +#endif
> + struct tcphdr   *th;
> + struct mbuf *m;
> +
> + switch (ntohs(mtod(mp, struct ether_header *)->ether_type)) {
> + case ETHERTYPE_IP:
> + if (mp->m_pkthdr.len < ETHER_HDR_LEN + sizeof(*ip))
> + return;
>

uaq(4): aquantia usb ethernet driver

2021-08-31 Thread Jonathan Matthew
Here's a driver for the Aquantia USB ethernet devices I just added
to usbdevs.  These are somewhat interesting because they theoretically
go up to 5GbE and support jumbo frames (not implemented yet).

While working on this I noticed that it doesn't receive 15-25% of the packets
it should, even at very low packet rates, when connected to ehci(4) controllers.
No such packet loss occurs with an xhci(4) controller.  I'm not sure if this
is a problem with our ehci driver or a poor hardware interaction.

ok?

Index: files.usb
===
RCS file: /cvs/src/sys/dev/usb/files.usb,v
retrieving revision 1.145
diff -u -p -u -r1.145 files.usb
--- files.usb   4 Feb 2021 16:25:39 -   1.145
+++ files.usb   31 Aug 2021 23:41:35 -
@@ -295,6 +295,10 @@ device ure: ether, ifnet, mii, ifmedia
 attach ure at uhub
 file   dev/usb/if_ure.cure
 
+# Aquantia AQC111
+device uaq: ether, ifnet, ifmedia
+attach uaq at uhub
+file   dev/usb/if_uaq.cuaq
 
 # Serial drivers
 # Modems
Index: if_uaq.c
===
RCS file: if_uaq.c
diff -N if_uaq.c
--- /dev/null   1 Jan 1970 00:00:00 -
+++ if_uaq.c31 Aug 2021 23:41:35 -
@@ -0,0 +1,1397 @@
+/* $OpenBSD$   */
+/*-
+ * Copyright (c) 2021 Jonathan Matthew 
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *notice, this list of conditions and the following disclaimer in the
+ *documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "bpfilter.h"
+#include "vlan.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+
+#if NBPFILTER > 0
+#include 
+#endif
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef UAQ_DEBUG
+#define DPRINTF(x) do { if (uaqdebug) printf x; } while (0)
+#define DPRINTFN(n,x)  do { if (uaqdebug >= (n)) printf x; } while (0)
+int    uaqdebug = 0;
+#else
+#define DPRINTF(x)
+#define DPRINTFN(n,x)
+#endif
+
+#define UAQ_ENDPT_RX   0
+#define UAQ_ENDPT_TX   1
+#define UAQ_ENDPT_INTR 2
+#define UAQ_ENDPT_MAX  3
+
+#define UAQ_TX_LIST_CNT1
+#define UAQ_RX_LIST_CNT1
+#define UAQ_TX_BUF_ALIGN   8
+#define UAQ_RX_BUF_ALIGN   8
+
+#define UAQ_TX_BUFSZ   16384
+#define UAQ_RX_BUFSZ   32768
+
+#define UAQ_CTL_READ   1
+#define UAQ_CTL_WRITE  2
+
+#define UAQ_MCAST_FILTER_SIZE  8
+
+/* control commands */
+#define UAQ_CMD_ACCESS_MAC 0x01
+#define UAQ_CMD_FLASH_PARAM0x20
+#define UAQ_CMD_PHY_POWER  0x31
+#define UAQ_CMD_WOL_CFG0x60
+#define UAQ_CMD_PHY_OPS0x61
+
+/* SFR registers */
+#define UAQ_SFR_GENERAL_STATUS 0x03
+#define UAQ_SFR_CHIP_STATUS0x05
+#define UAQ_SFR_RX_CTL 0x0B
+#define  UAQ_SFR_RX_CTL_STOP   0x
+#define  UAQ_SFR_RX_CTL_PRO0x0001
+#define  UAQ_SFR_RX_CTL_AMALL  0x0002
+#define  UAQ_SFR_RX_CTL_AB 0x0008
+#define  UAQ_SFR_RX_CTL_AM 0x0010
+#define  UAQ_SFR_RX_CTL_START  0x0080
+#define  UAQ_SFR_RX_CTL_IPE0x0200
+#define UAQ_SFR_IPG_0  0x0D
+#define UAQ_SFR_NODE_ID0x10
+#define UAQ_SFR_MCAST_FILTER   0x16
+#define UAQ_SFR_MEDIUM_STATUS_MODE 0x22
+#define  UAQ_SFR_MEDIUM_XGMIIMODE  0x0001
+#define  UAQ_SFR_MEDIUM_FULL_DUPLEX0x0002
+#define  UAQ_SFR_MEDIUM_RXFLOW_CTRLEN  0x0010
+#define  UAQ_SFR_MEDIUM_TXFLOW_CTRLEN  0x0020
+#define  UAQ_SFR_MEDIUM_JUMBO_EN   0x0040
+#define  UAQ_SFR_MEDIUM_RECEIVE_EN 0x0100
+#define UAQ_SFR_MONITOR_MODE   0x24
+#define  UAQ_SFR_MONITOR_MODE_EPHYRW   0x01
+#define  UAQ_SFR_MONITOR_MODE_RWLC 0x02
+#define  UAQ_SFR_MONITOR_MOD

Re: snmpd: allow sending traps with SNMPv3

2021-08-16 Thread Jonathan Matthew
On Tue, Aug 10, 2021 at 12:58:05PM +0200, Martijn van Duren wrote:
> On Mon, 2021-08-09 at 21:44 +0200, Martijn van Duren wrote:
> > On Tue, 2021-07-27 at 21:28 +0200, Martijn van Duren wrote:
> > > This diff allows sending traps in SNMPv3 messages.
> > > It defaults to the global seclevel, but it can be specified on a per
> > > rule basis.
> > > 
> > > Diff requires both previous setting engineid and ober_dup diff.
> > > 
> > > Tested with netsnmp's snmptrapd and my WIP diff.
> > > 
> > > The other 2 outstanding diffs are for receiving SNMPv3 traps.
> > > 
> > > OK?
> > > 
> > > martijn@
> > > 
> > Resending now that the engineid diff is in.
> > 
> > Still awaiting the commit of ober_dup diff[0].
> > 
> > OK once that one goes in?
> > 
> > Also, rereading the diff, splitting the trap receiver in two might be a
> > bit clutch. Once again invoking the manpage gurus.
> > 
> > martijn@
> > 
> > [0] https://marc.info/?l=openbsd-tech&m=162698527126249&w=2
> > 
> The listen on diff committed this morning broke this patch.
> Updated version

I think my only concern with this is that the config syntax changes
incompatibly, since you now have to specify 'snmpv2c' for v2c trap
receivers.  I can think of a few alternatives, but none of them are
great.  What you've done here seems to be the cleanest option both in
terms of what the config looks like and the code for processing it,
so if we're prepared to change the config syntax, I'm happy with it.



usb: don't pass USBD_EXCLUSIVE_USE to usbd_open_pipe_intr()

2021-08-07 Thread Jonathan Matthew
While working on a new driver, I noticed we have a few places where we
pass USBD_EXCLUSIVE_USE as the flags parameter to usbd_open_pipe_intr(),
which is wrong.

The interrupt pipe is always opened exclusively, and the flags parameter is
actually passed to usbd_setup_xfer(), where it means USBD_NO_COPY, so any data
written by the transfer is not copied to the buffer where the driver expects
it.

I don't have hardware supported by any of these drivers, but most of them
don't look at the transferred data, and in a couple of them, the interrupt
pipe code is #if 0'd out, so I think there is little chance this changes
anything.

ok?

Index: if_aue.c
===
RCS file: /cvs/src/sys/dev/usb/if_aue.c,v
retrieving revision 1.111
diff -u -p -u -p -r1.111 if_aue.c
--- if_aue.c31 Jul 2020 10:49:32 -  1.111
+++ if_aue.c8 Aug 2021 03:25:19 -
@@ -1355,7 +1355,7 @@ aue_openpipes(struct aue_softc *sc)
return (EIO);
}
err = usbd_open_pipe_intr(sc->aue_iface, sc->aue_ed[AUE_ENDPT_INTR],
-   USBD_EXCLUSIVE_USE, &sc->aue_ep[AUE_ENDPT_INTR], sc,
+   0, &sc->aue_ep[AUE_ENDPT_INTR], sc,
&sc->aue_cdata.aue_ibuf, AUE_INTR_PKTLEN, aue_intr,
AUE_INTR_INTERVAL);
if (err) {
Index: if_udav.c
===
RCS file: /cvs/src/sys/dev/usb/if_udav.c,v
retrieving revision 1.84
diff -u -p -u -p -r1.84 if_udav.c
--- if_udav.c   31 Jul 2020 10:49:32 -  1.84
+++ if_udav.c   8 Aug 2021 03:25:19 -
@@ -769,7 +769,7 @@ udav_openpipes(struct udav_softc *sc)
/* XXX: interrupt endpoint is not yet supported */
/* Open Interrupt pipe */
err = usbd_open_pipe_intr(sc->sc_ctl_iface, sc->sc_intrin_no,
- USBD_EXCLUSIVE_USE, &sc->sc_pipe_intr, sc,
+ 0, &sc->sc_pipe_intr, sc,
  &sc->sc_cdata.udav_ibuf, UDAV_INTR_PKGLEN,
  udav_intr, UDAV_INTR_INTERVAL);
if (err) {
Index: if_ugl.c
===
RCS file: /cvs/src/sys/dev/usb/if_ugl.c,v
retrieving revision 1.26
diff -u -p -u -p -r1.26 if_ugl.c
--- if_ugl.c31 Jul 2020 10:49:32 -  1.26
+++ if_ugl.c8 Aug 2021 03:25:20 -
@@ -681,7 +681,7 @@ ugl_openpipes(struct ugl_softc *sc)
return (EIO);
}
err = usbd_open_pipe_intr(sc->sc_iface, sc->sc_ed[UGL_ENDPT_INTR],
-   USBD_EXCLUSIVE_USE, &sc->sc_ep[UGL_ENDPT_INTR], sc,
+   0, &sc->sc_ep[UGL_ENDPT_INTR], sc,
sc->sc_ibuf, UGL_INTR_PKTLEN, ugl_intr,
UGL_INTR_INTERVAL);
if (err) {
Index: if_upl.c
===
RCS file: /cvs/src/sys/dev/usb/if_upl.c,v
retrieving revision 1.78
diff -u -p -u -p -r1.78 if_upl.c
--- if_upl.c31 Jul 2020 10:49:32 -  1.78
+++ if_upl.c8 Aug 2021 03:25:20 -
@@ -661,7 +661,7 @@ upl_openpipes(struct upl_softc *sc)
return (EIO);
}
err = usbd_open_pipe_intr(sc->sc_iface, sc->sc_ed[UPL_ENDPT_INTR],
-   USBD_EXCLUSIVE_USE, &sc->sc_ep[UPL_ENDPT_INTR], sc,
+   0, &sc->sc_ep[UPL_ENDPT_INTR], sc,
&sc->sc_ibuf, UPL_INTR_PKTLEN, upl_intr,
UPL_INTR_INTERVAL);
if (err) {
Index: if_url.c
===
RCS file: /cvs/src/sys/dev/usb/if_url.c,v
retrieving revision 1.88
diff -u -p -u -p -r1.88 if_url.c
--- if_url.c31 Jul 2020 10:49:33 -  1.88
+++ if_url.c8 Aug 2021 03:25:20 -
@@ -635,7 +635,7 @@ url_openpipes(struct url_softc *sc)
/* XXX: interrupt endpoint is not yet supported */
/* Open Interrupt pipe */
err = usbd_open_pipe_intr(sc->sc_ctl_iface, sc->sc_intrin_no,
- USBD_EXCLUSIVE_USE, &sc->sc_pipe_intr, sc,
+ 0, &sc->sc_pipe_intr, sc,
  &sc->sc_cdata.url_ibuf, URL_INTR_PKGLEN,
  url_intr, URL_INTR_INTERVAL);
if (err) {
Index: if_wi_usb.c
===
RCS file: /cvs/src/sys/dev/usb/if_wi_usb.c,v
retrieving revision 1.73
diff -u -p -u -p -r1.73 if_wi_usb.c
--- if_wi_usb.c 31 Jul 2020 10:49:33 -  1.73
+++ if_wi_usb.c 8 Aug 2021 03:25:21 -
@@ -1233,7 +1233,7 @@ wi_usb_open_pipes(struct wi_usb_softc *s
 
/* is this used? */
err = usbd_open_pipe_intr(sc->wi_usb_iface,
-   sc->wi_usb_ed[WI_USB_ENDPT_INTR], USBD_EXCLUSIVE_USE,
+   sc->wi_usb_ed[WI_USB_ENDPT_INTR], 0,
&sc->wi_usb_ep[WI_USB_ENDPT_INTR], sc, &sc->wi_usb_ibuf,
WI_USB_INTR_PKTLEN, wi_usb_intr, WI_USB_INTR_INTERVAL);
if (err) {



Re: libutil/ber: add ober_dup(3)

2021-08-01 Thread Jonathan Matthew
On Thu, Jul 22, 2021 at 10:19:59PM +0200, Martijn van Duren wrote:
> I'm currently working on adding SNMPv3 support to traps in snmpd(8).
> For sending traps we loop over sc_trapreceivers and can send each trap
> to 0 or more receivers.
> 
> I want to high-jack snmpe_response() to do the heavy lifting for doing
> the snmp/usm encoding, but this interface frees the varbindlist in
> snmp_msgfree(), which means I need to rebuild the varbindlist for every
> iteration. To keep this simple I suggest adding ober_dup, which
> duplicates a full ber_element chain.
> 
> Sending this prior to any of my snmpd(8) work, since it requires a
> library version bump.
> 
> OK?
> Any additional coordination needed for this diff?

ok jmatthew@
I don't think we need to worry about clashing with existing symbols in
ports, since we renamed ber_* to ober_* because no one was using it.
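
As a usage sketch of the new function (send_trap, receiver and varbind
are made-up names), the trap path can then do:

        struct ber_element *copy;

        if ((copy = ober_dup(varbind)) == NULL)
                fatal("ober_dup");
        /* the response path frees its copy; 'varbind' stays intact
         * for the next receiver */
        send_trap(receiver, copy);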

> 
> martijn@
> 
> Index: Symbols.map
> ===
> RCS file: /cvs/src/lib/libutil/Symbols.map,v
> retrieving revision 1.3
> diff -u -p -r1.3 Symbols.map
> --- Symbols.map   24 Oct 2019 12:39:26 -  1.3
> +++ Symbols.map   22 Jul 2021 20:18:35 -
> @@ -65,6 +65,7 @@
>   ober_add_set;
>   ober_add_string;
>   ober_calc_len;
> + ober_dup;
>   ober_free;
>   ober_free_element;
>   ober_free_elements;
> Index: ber.c
> ===
> RCS file: /cvs/src/lib/libutil/ber.c,v
> retrieving revision 1.21
> diff -u -p -r1.21 ber.c
> --- ber.c 22 Feb 2021 17:15:02 -  1.21
> +++ ber.c 22 Jul 2021 20:18:35 -
> @@ -926,6 +926,43 @@ ober_getpos(struct ber_element *elm)
>   return elm->be_offs;
>  }
>  
> +struct ber_element *
> +ober_dup(struct ber_element *orig)
> +{
> + struct ber_element *new;
> +
> + if ((new = malloc(sizeof(*new))) == NULL)
> + return NULL;
> + memcpy(new, orig, sizeof(*new));
> + new->be_next = NULL;
> + new->be_sub = NULL;
> +
> + if (orig->be_next != NULL) {
> + if ((new->be_next = ober_dup(orig->be_next)) == NULL)
> + goto fail;
> + }
> + if (orig->be_encoding == BER_TYPE_SEQUENCE ||
> + orig->be_encoding == BER_TYPE_SET) {
> + if (orig->be_sub != NULL) {
> + if ((new->be_sub = ober_dup(orig->be_sub)) == NULL)
> + goto fail;
> + }
> + } else if (orig->be_encoding == BER_TYPE_OCTETSTRING ||
> + orig->be_encoding == BER_TYPE_BITSTRING ||
> + orig->be_encoding == BER_TYPE_OBJECT) {
> + if (orig->be_val != NULL) {
> + if ((new->be_val = malloc(orig->be_len)) == NULL)
> + goto fail;
> + memcpy(new->be_val, orig->be_val, orig->be_len);
> + }
> + } else
> + new->be_numeric = orig->be_numeric;
> + return new;
> + fail:
> + ober_free_elements(new);
> + return NULL;
> +}
> +
>  void
>  ober_free_element(struct ber_element *root)
>  {
> Index: ber.h
> ===
> RCS file: /cvs/src/lib/libutil/ber.h,v
> retrieving revision 1.3
> diff -u -p -r1.3 ber.h
> --- ber.h 31 Dec 2019 10:34:14 -  1.3
> +++ ber.h 22 Jul 2021 20:18:35 -
> @@ -137,6 +137,7 @@ ssize_tober_write_elements(struct be
>  void  ober_set_readbuf(struct ber *, void *, size_t);
>  struct ber_element   *ober_read_elements(struct ber *, struct ber_element *);
>  off_t ober_getpos(struct ber_element *);
> +struct ber_element   *ober_dup(struct ber_element *);
>  void  ober_free_element(struct ber_element *);
>  void  ober_free_elements(struct ber_element *);
>  size_tober_calc_len(struct ber_element *);
> Index: ober_set_header.3
> ===
> RCS file: /cvs/src/lib/libutil/ober_set_header.3,v
> retrieving revision 1.2
> diff -u -p -r1.2 ober_set_header.3
> --- ober_set_header.3 12 Mar 2021 05:18:01 -  1.2
> +++ ober_set_header.3 22 Jul 2021 20:18:35 -
> @@ -23,6 +23,7 @@
>  .Nm ober_set_writecallback ,
>  .Nm ober_link_elements ,
>  .Nm ober_replace_elements ,
> +.Nm ober_dup ,
>  .Nm ober_unlink_elements ,
>  .Nm ober_free_element ,
>  .Nm ober_free_elements
> @@ -45,6 +46,8 @@
>  .Ft "void"
>  .Fn "ober_replace_elements" "struct ber_element *prev" "struct ber_element 
> *elm"
>  .Ft "struct ber_element *"
> +.Ft "struct ber_element *"
> +.Fn "ober_dup" "struct ber_element *orig"
>  .Fn "ober_unlink_elements" "struct ber_element *prev"
>  .Ft "void"
>  .Fn "ober_free_element" "struct ber_element *root"
> @@ -101,6 +104,9 @@ with
>  and frees any dynamically allocated storage associated with
>  .Fa prev .
>  .

Re: snmpd(8): set smi_applicatoin in usm_decrypt

2021-08-01 Thread Jonathan Matthew
On Thu, Jul 22, 2021 at 10:27:44PM +0200, Martijn van Duren wrote:
> Not an issue with read requests, but it will be for set requests if they
> contain snmp application elements such as timeticks.
> 
> Definitely needed for upcoming SNMPv3 trap support.
> 
> OK?

ok jmatthew@

> 
> martijn@
> 
> Index: usm.c
> ===
> RCS file: /cvs/src/usr.sbin/snmpd/usm.c,v
> retrieving revision 1.20
> diff -u -p -r1.20 usm.c
> --- usm.c 20 Jun 2021 19:55:48 -  1.20
> +++ usm.c 22 Jul 2021 20:27:01 -
> @@ -630,6 +630,7 @@ usm_decrypt(struct snmp_message *msg, st
>   return NULL;
>  
>   bzero(&ber, sizeof(ber));
> + ober_set_application(&ber, smi_application);
>   ober_set_readbuf(&ber, buf, scoped_pdu_len);
>   scoped_pdu = ober_read_elements(&ber, NULL);
>  
> 
> 



Re: snmpd(8): fix trapv2 on correct protocol detection

2021-08-01 Thread Jonathan Matthew
On Thu, Jul 22, 2021 at 10:31:53PM +0200, Martijn van Duren wrote:
> This typo snuck in when merging traphandler into snmpe.
> Not a big deal since it's there just for ASN1/SMI strictness, but it
> breaks when introducing SNMPv3 support.
> 
> OK?

ok jmatthew@

> 
> martijn@
> 
> Index: snmpe.c
> ===
> RCS file: /cvs/src/usr.sbin/snmpd/snmpe.c,v
> retrieving revision 1.72
> diff -u -p -r1.72 snmpe.c
> --- snmpe.c   20 Jun 2021 19:55:48 -  1.72
> +++ snmpe.c   22 Jul 2021 20:31:31 -
> @@ -381,7 +381,7 @@ badversion:
>   case SNMP_C_TRAPV2:
>   if (msg->sm_pdutype == SNMP_C_TRAPV2 &&
>   !(msg->sm_version == SNMP_V2 ||
> - msg->sm_version != SNMP_V3)) {
> + msg->sm_version == SNMP_V3)) {
>   msg->sm_errstr = "trapv2 request on !SNMPv2C or "
>   "!SNMPv3 message";
>   goto parsefail;
> 
> 



Re: snmpd(8): Allow setting engineid

2021-08-01 Thread Jonathan Matthew
On Tue, Jul 27, 2021 at 08:43:20PM +0200, Martijn van Duren wrote:
> Previous diff failed to set the initial bit when not defining engineid
> in the config.
> 
> On Fri, 2021-07-23 at 15:41 +0200, Martijn van Duren wrote:
> > This diff introduces setting the engineid for snmpd(8).
> > Although this diff might seem quite excessive at first glance, there's
> > a valid reason to do so.
> > 
> > The following things are in effect when sending an SNMPv3 trap:
> > - SNMP trap packets are unacknowledged; meaning that we don't get a
> >   response -, nor report message.
> > - SNMPv3 packets with a trap contain the engineid of the sender.
> > - The key used in auth and priv are derived from the password and the
> >   engineid.
> > - users are linked to an engineid
> > 
> > So if we're sending messages in SNMPv3 format we can't generate a random
> > engineid on each boot as we do now, or the trap receiver can't find the
> > correct user. Since I want to keep the default config as empty as
> > possible I've choosen to use the first 27 bytes (maximum length that
> > fits in the engineid) of the sha256 hash of the hostname(3). This should
> > give us the biggest confidence in having a consistent name that won't
> > clash with other agents. If someone has a better idea though, please
> > speak up now.

This seems reasonable to me.  Another option would be to generate a
random ID once and store it on disk, like the SOII key.  Seems like an
awkward thing to do when there's also a config file that the information
could be in, though, so I don't think this is really a good option.
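
For illustration, the hostname-derived default described above could
look roughly like this (a sketch only, not the actual snmpd code; the
function name is made up):

	#include <sys/types.h>
	#include <sha2.h>
	#include <limits.h>
	#include <string.h>
	#include <unistd.h>

	int
	engineid_from_hostname(u_int8_t *eid, size_t eidlen)
	{
		char host[HOST_NAME_MAX + 1];
		u_int8_t digest[SHA256_DIGEST_LENGTH];
		SHA2_CTX ctx;

		if (gethostname(host, sizeof(host)) == -1)
			return (-1);
		SHA256Init(&ctx);
		SHA256Update(&ctx, (const u_int8_t *)host, strlen(host));
		SHA256Final(digest, &ctx);
		/* 27 bytes is the maximum that fits in the engineid */
		memcpy(eid, digest, eidlen < 27 ? eidlen : 27);
		return (0);
	}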

> > 
> > As for allowing to set the engineid: When receiving a trap admins will
> > need to be able to specify the engineid of the remote agent, or there
> > will be problems with the key generation of that user.
> > Given this requirement it's a small step to allow the same yacc rules
> > to be used for setting the global engineid and gives a little more
> > control to the admin. The global engineid just happens to be more
> > convenient to implement first.
> > 
> > OK?

If no one has any better ideas for generating a default engine ID,
ok jmatthew@



Re: ix(4): fix Rx hash type

2021-07-26 Thread Jonathan Matthew
On Wed, Jul 14, 2021 at 01:46:37PM +0800, Kevin Lo wrote:
> Hi,
> 
> The diff below fixes Rx desc RSS type.  This matches what Linux and FreeBSD 
> do.
> ok?

ok jmatthew@

> 
> Index: sys/dev/pci/if_ix.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
> retrieving revision 1.178
> diff -u -p -u -p -r1.178 if_ix.c
> --- sys/dev/pci/if_ix.c   22 Dec 2020 23:25:37 -  1.178
> +++ sys/dev/pci/if_ix.c   14 Jul 2021 05:41:08 -
> @@ -3071,7 +3071,8 @@ ixgbe_rxeof(struct rx_ring *rxr)
>  
>   i = rxr->next_to_check;
>   while (if_rxr_inuse(&rxr->rx_ring) > 0) {
> - uint32_t hash, hashtype;
> + uint32_t hash;
> + uint16_t hashtype;
>  
>   bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
>   dsize * i, dsize, BUS_DMASYNC_POSTREAD);
> @@ -3101,7 +3102,8 @@ ixgbe_rxeof(struct rx_ring *rxr)
>   vtag = letoh16(rxdesc->wb.upper.vlan);
>   eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0);
>   hash = lemtoh32(&rxdesc->wb.lower.hi_dword.rss);
> - hashtype = lemtoh32(&rxdesc->wb.lower.lo_dword.data) &
> + hashtype =
> + lemtoh16(&rxdesc->wb.lower.lo_dword.hs_rss.pkt_info) &
>   IXGBE_RXDADV_RSSTYPE_MASK;
>  
>   if (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) {
> 



Re: ahci(4): Add support for JMicron JMB585 chipset

2021-07-24 Thread Jonathan Matthew
On Thu, Jul 22, 2021 at 10:45:17PM -0400, Ashton Fagg wrote:
> I have two devices here based on the JMicron JMB585 chipset. This diff
> adds the required pcidev IDs and disables native command queuing in
> the driver. FreeBSD does something similar for this device:
> 
> https://github.com/freebsd/freebsd-src/commit/16b766eed443043f4216d50e40ba283e74f992c2

Can you explain how you came to the conclusion that you'd need to
disable NCQ?  The FreeBSD commit you link to doesn't appear to do that
as they're not applying the AHCI_Q_NONCQ flag to these devices.
Does it not work with NCQ enabled?



Re: ix(4)/riscv64: Make ix(4) work when MSI-X interrupts aren't available

2021-07-20 Thread Jonathan Matthew
On Tue, Jul 20, 2021 at 02:21:39PM +0200, Mark Kettenis wrote:
> > Date: Tue, 20 Jul 2021 21:55:56 +1000
> > From: Jonathan Matthew 
> > 
> > On Mon, Jul 19, 2021 at 07:37:10PM -0400, Ashton Fagg wrote:
> > > I have an Intel 82599 10 gigabit ethernet card I wanted to get working
> > > on my SiFive Unmatched board.
> > > 
> > > I found the ix(4) driver has some weirdness around MSI-X
> > > interrupts. While the driver supports operating both with and without
> > > MSI-X support, it's hard-coded via a flag rather than dynamically checking
> > > if it's available. If the flag is set (which it always is right now),
> > > but MSI-X isn't available, the driver will throw an error and the device
> > > won't work:
> > > 
> > > ix0 at pci7 dev 0 function 0 "Intel 82599" rev 0x01ixgbe_allocate_msix: 
> > > pci_intr_map_msix vec 0 failed
> > > 
> > > The root cause is this call failing in if_ix.c:
> > > 
> > >   if (pci_intr_map_msix(pa, i, &ih)) {
> > >   printf("ixgbe_allocate_msix: "
> > >   "pci_intr_map_msix vec %d failed\n", i);
> > >   error = ENOMEM;
> > >   goto fail;
> > >   }
> > > 
> > > 
> > > Because in _pci_intr_map_msix (in sys/arch/riscv64/dev/pci_machdep.c):
> > > 
> > > if ((pa->pa_flags & PCI_FLAGS_MSI_ENABLED) == 0 ||
> > >   pci_get_capability(pc, tag, PCI_CAP_MSI, NULL, NULL) == 0)
> > >   return -1;
> > > 
> > > The PCI attach flags would not have PCI_FLAGS_MSI_ENABLED set.
> > > 
> > > The following diff remedies that by checking if PCI_FLAGS_MSI_ENABLED is
> > > actually set, rather than just trying and failing because the hard-coded
> > > flag says so. It also enables ix(4) in the kernel config for
> > > riscv64. Effectively, the driver will now only try to use MSI-X if the
> > > machine is advertising it to be available.
> > 
> > I'd rather not have to do this in every driver.  We otherwise check that 
> > flag
> > inside the pci interrupt functions rather than in the driver code, so we
> > should do so in pci_intr_msix_count() too, since that's what we call in
> > multi-queue nic drivers to decide whether to use MSI-X.  Drivers that only
> > want a single vector will just call pci_intr_map_msix() and fall back to MSI
> > or legacy interrupts if that fails.
> > 
> > I posted the alternate version of this diff to misc@ a few days ago,
> > which repeats the checks used to set PCI_FLAGS_MSI_ENABLED in
> > pci_intr_msix_count(), rather than passing in struct
> > pci_attach_args, in case we prefer to do it that way.
> 
> I don't really read misc@, so don't post your patches there.

Right, it was just there for testing.

> 
> > Mark, what do you think?
> 
> Yeah, making pci_intr_msix_count() should return 0 if MSIs are not
> supported.  A bit strange though to pass both pa and pa->pa_tag.  I'd
> change the function to only take pa as an argument.

Yes, on second look that makes sense.  Here's a better diff with that change,
and that also doesn't break arches without __HAVE_PCI_MSIX.  ok?

Index: if_bnxt.c
===
RCS file: /cvs/src/sys/dev/pci/if_bnxt.c,v
retrieving revision 1.32
diff -u -p -u -p -r1.32 if_bnxt.c
--- if_bnxt.c   24 Apr 2021 09:37:46 -  1.32
+++ if_bnxt.c   21 Jul 2021 03:24:44 -
@@ -537,7 +537,7 @@ bnxt_attach(struct device *parent, struc
sc->sc_flags |= BNXT_FLAG_MSIX;
intrstr = pci_intr_string(sc->sc_pc, ih);
 
-   nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag);
+   nmsix = pci_intr_msix_count(pa);
if (nmsix > 1) {
sc->sc_ih = pci_intr_establish(sc->sc_pc, ih,
IPL_NET | IPL_MPSAFE, bnxt_admin_intr, sc, 
DEVNAME(sc));
Index: if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.178
diff -u -p -u -p -r1.178 if_ix.c
--- if_ix.c 22 Dec 2020 23:25:37 -  1.178
+++ if_ix.c 21 Jul 2021 03:24:44 -
@@ -1783,7 +1783,7 @@ ixgbe_setup_msix(struct ix_softc *sc)
if (!ixgbe_enable_msix)
return;
 
-   nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag);
+   nmsix = pci_intr_msix_count(pa);
if (nmsix <= 1)
return;
 
Index: if_ixl.c
==

Re: ix(4)/riscv64: Make ix(4) work when MSI-X interrupts aren't available

2021-07-20 Thread Jonathan Matthew
On Mon, Jul 19, 2021 at 07:37:10PM -0400, Ashton Fagg wrote:
> I have an Intel 82599 10 gigabit ethernet card I wanted to get working
> on my SiFive Unmatched board.
> 
> I found the ix(4) driver has some weirdness around MSI-X
> interrupts. While the driver supports operating both with and without
> MSI-X support, it's hard-coded via a flag rather than dynamically checking
> if it's available. If the flag is set (which it always is right now),
> but MSI-X isn't available, the driver will throw an error and the device
> won't work:
> 
> ix0 at pci7 dev 0 function 0 "Intel 82599" rev 0x01ixgbe_allocate_msix: 
> pci_intr_map_msix vec 0 failed
> 
> The root cause is this call failing in if_ix.c:
> 
>   if (pci_intr_map_msix(pa, i, &ih)) {
>   printf("ixgbe_allocate_msix: "
>   "pci_intr_map_msix vec %d failed\n", i);
>   error = ENOMEM;
>   goto fail;
>   }
> 
> 
> Because in _pci_intr_map_msix (in sys/arch/riscv64/dev/pci_machdep.c):
> 
> if ((pa->pa_flags & PCI_FLAGS_MSI_ENABLED) == 0 ||
>   pci_get_capability(pc, tag, PCI_CAP_MSI, NULL, NULL) == 0)
>   return -1;
> 
> The PCI attach flags would not have PCI_FLAGS_MSI_ENABLED set.
> 
> The following diff remedies that by checking if PCI_FLAGS_MSI_ENABLED is
> actually set, rather than just trying and failing because the hard-coded
> flag says so. It also enables ix(4) in the kernel config for
> riscv64. Effectively, the driver will now only try to use MSI-X if the
> machine is advertising it to be available.

I'd rather not have to do this in every driver.  We otherwise check that flag
inside the pci interrupt functions rather than in the driver code, so we
should do so in pci_intr_msix_count() too, since that's what we call in
multi-queue nic drivers to decide whether to use MSI-X.  Drivers that only
want a single vector will just call pci_intr_map_msix() and fall back to MSI
or legacy interrupts if that fails.
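
For reference, that single-vector fallback pattern looks roughly like
this (a sketch, not part of the diff below):

	pci_intr_handle_t ih;

	if (pci_intr_map_msix(pa, 0, &ih) != 0 &&
	    pci_intr_map_msi(pa, &ih) != 0 &&
	    pci_intr_map(pa, &ih) != 0) {
		printf(": couldn't map interrupt\n");
		return;
	}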

I posted the alternate version of this diff to misc@ a few days ago, which
repeats the checks used to set PCI_FLAGS_MSI_ENABLED in pci_intr_msix_count(),
rather than passing in struct pci_attach_args, in case we prefer to do it that
way.

Mark, what do you think?


Index: if_bnxt.c
===
RCS file: /cvs/src/sys/dev/pci/if_bnxt.c,v
retrieving revision 1.32
diff -u -p -u -p -r1.32 if_bnxt.c
--- if_bnxt.c   24 Apr 2021 09:37:46 -  1.32
+++ if_bnxt.c   20 Jul 2021 11:23:22 -
@@ -537,7 +537,7 @@ bnxt_attach(struct device *parent, struc
sc->sc_flags |= BNXT_FLAG_MSIX;
intrstr = pci_intr_string(sc->sc_pc, ih);
 
-   nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag);
+   nmsix = pci_intr_msix_count(pa, pa->pa_tag);
if (nmsix > 1) {
sc->sc_ih = pci_intr_establish(sc->sc_pc, ih,
IPL_NET | IPL_MPSAFE, bnxt_admin_intr, sc, 
DEVNAME(sc));
Index: if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.178
diff -u -p -u -p -r1.178 if_ix.c
--- if_ix.c 22 Dec 2020 23:25:37 -  1.178
+++ if_ix.c 20 Jul 2021 11:23:22 -
@@ -1783,7 +1783,7 @@ ixgbe_setup_msix(struct ix_softc *sc)
if (!ixgbe_enable_msix)
return;
 
-   nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag);
+   nmsix = pci_intr_msix_count(pa, pa->pa_tag);
if (nmsix <= 1)
return;
 
Index: if_ixl.c
===
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.74
diff -u -p -u -p -r1.74 if_ixl.c
--- if_ixl.c26 Mar 2021 08:02:34 -  1.74
+++ if_ixl.c20 Jul 2021 11:23:22 -
@@ -1795,7 +1795,7 @@ ixl_attach(struct device *parent, struct
}
 
if (pci_intr_map_msix(pa, 0, &sc->sc_ih) == 0) {
-   int nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag);
+   int nmsix = pci_intr_msix_count(pa, pa->pa_tag);
if (nmsix > 1) { /* we used 1 (the 0th) for the adminq */
nmsix--;
 
Index: if_mcx.c
===
RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
retrieving revision 1.101
diff -u -p -u -p -r1.101 if_mcx.c
--- if_mcx.c2 Jun 2021 19:16:11 -   1.101
+++ if_mcx.c20 Jul 2021 11:23:22 -
@@ -2831,7 +2831,7 @@ mcx_attach(struct device *parent, struct
goto teardown;
}
 
-   msix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag);
+   msix = pci_intr_msix_count(pa, pa->pa_tag);
if (msix < 2) {
printf(": not enough msi-x vectors\n");
goto teardown;
Index: if_vmx.c
===
RCS file: /cvs/src/sys/dev

Re: uao references & uao_swap_off() cleanup

2021-06-23 Thread Jonathan Matthew
On Wed, Jun 23, 2021 at 09:37:10AM +0200, Martin Pieuchot wrote:
> On 16/06/21(Wed) 11:26, Martin Pieuchot wrote:
> > Diff below does two things:
> > 
> > - Use atomic operations for incrementing/decrementing references of
> >   anonymous objects.  This allows us to manipulate them without holding
> >   the KERNEL_LOCK().
> > 
> > - Rewrite the loop from uao_swap_off() to only keep a reference to the
> >   next item in the list.  This is imported from NetBSD and is necessary
> >   to introduce locking around uao_pagein().
> > 
> > ok?
> 
> Anyone?

uao_reference_locked() and uao_detach_locked() are prototyped in
uvm_extern.h, so they should be removed here too.

It doesn't look like uao_detach() is safe to call without the
kernel lock; it calls uao_dropswap() for each page, which calls
uao_set_swslot(), which includes a KERNEL_ASSERT_LOCKED().
Should we keep the KERNEL_ASSERT_LOCKED() in uao_detach()?

ok jmatthew@ otherwise

> 
> > 
> > Index: uvm/uvm_aobj.c
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
> > retrieving revision 1.98
> > diff -u -p -r1.98 uvm_aobj.c
> > --- uvm/uvm_aobj.c  16 Jun 2021 09:02:21 -  1.98
> > +++ uvm/uvm_aobj.c  16 Jun 2021 09:20:26 -
> > @@ -779,19 +779,11 @@ uao_init(void)
> >  void
> >  uao_reference(struct uvm_object *uobj)
> >  {
> > -   KERNEL_ASSERT_LOCKED();
> > -   uao_reference_locked(uobj);
> > -}
> > -
> > -void
> > -uao_reference_locked(struct uvm_object *uobj)
> > -{
> > -
> > /* Kernel object is persistent. */
> > if (UVM_OBJ_IS_KERN_OBJECT(uobj))
> > return;
> >  
> > -   uobj->uo_refs++;
> > +   atomic_inc_int(&uobj->uo_refs);
> >  }
> >  
> >  
> > @@ -801,34 +793,19 @@ uao_reference_locked(struct uvm_object *
> >  void
> >  uao_detach(struct uvm_object *uobj)
> >  {
> > -   KERNEL_ASSERT_LOCKED();
> > -   uao_detach_locked(uobj);
> > -}
> > -
> > -
> > -/*
> > - * uao_detach_locked: drop a reference to an aobj
> > - *
> > - * => aobj may freed upon return.
> > - */
> > -void
> > -uao_detach_locked(struct uvm_object *uobj)
> > -{
> > struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
> > struct vm_page *pg;
> >  
> > /*
> >  * Detaching from kernel_object is a NOP.
> >  */
> > -   if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
> > +   if (UVM_OBJ_IS_KERN_OBJECT(uobj))
> > return;
> > -   }
> >  
> > /*
> >  * Drop the reference.  If it was the last one, destroy the object.
> >  */
> > -   uobj->uo_refs--;
> > -   if (uobj->uo_refs) {
> > +   if (atomic_dec_int_nv(&uobj->uo_refs) > 0) {
> > return;
> > }
> >  
> > @@ -1265,68 +1242,54 @@ uao_dropswap(struct uvm_object *uobj, in
> >  boolean_t
> >  uao_swap_off(int startslot, int endslot)
> >  {
> > -   struct uvm_aobj *aobj, *nextaobj, *prevaobj = NULL;
> > +   struct uvm_aobj *aobj;
> >  
> > /*
> > -* Walk the list of all anonymous UVM objects.
> > +* Walk the list of all anonymous UVM objects.  Grab the first.
> >  */
> > mtx_enter(&uao_list_lock);
> > +   if ((aobj = LIST_FIRST(&uao_list)) == NULL) {
> > +   mtx_leave(&uao_list_lock);
> > +   return FALSE;
> > +   }
> > +   uao_reference(&aobj->u_obj);
> >  
> > -   for (aobj = LIST_FIRST(&uao_list);
> > -aobj != NULL;
> > -aobj = nextaobj) {
> > +   do {
> > +   struct uvm_aobj *nextaobj;
> > boolean_t rv;
> >  
> > /*
> > -* add a ref to the aobj so it doesn't disappear
> > -* while we're working.
> > -*/
> > -   uao_reference_locked(&aobj->u_obj);
> > -
> > -   /*
> > -* now it's safe to unlock the uao list.
> > -* note that lock interleaving is alright with IPL_NONE mutexes.
> > +* Prefetch the next object and immediately hold a reference
> > +* on it, so neither the current nor the next entry could
> > +* disappear while we are iterating.
> >  */
> > -   mtx_leave(&uao_list_lock);
> > -
> > -   if (prevaobj) {
> > -   uao_detach_locked(&prevaobj->u_obj);
> > -   prevaobj = NULL;
> > +   if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL) {
> > +   uao_reference(&nextaobj->u_obj);
> > }
> > +   mtx_leave(&uao_list_lock);
> >  
> > /*
> > -* page in any pages in the swslot range.
> > -* if there's an error, abort and return the error.
> > +* Page in all pages in the swap slot range.
> >  */
> > rv = uao_pagein(aobj, startslot, endslot);
> > +
> > +   /* Drop the reference of the current object. */
> > +   uao_detach(&aobj->u_obj);
> > if (rv) {
> > -   uao_detach_locked(&aobj->u_obj);
> > +   if (nextaobj) {
> > +   uao_detach(&nextaobj->u_obj);
> > +

Re: Reaper & amaps

2021-06-15 Thread Jonathan Matthew
On Mon, Jun 14, 2021 at 05:35:07PM +0200, Mark Kettenis wrote:
> > Date: Mon, 14 Jun 2021 11:50:24 +0200
> > From: Martin Pieuchot 
> > 
> > Now that operations on amaps are serialized using a per-map rwlock
> > the KERNEL_LOCK() shouldn't be necessary to call amap_unref().  The
> > diff below allows the reaper to do this operation before grabbing it.
> > 
> > I haven't seen any relevant contention on the reaper in my profilings,
> > so I don't expect any visible change related to this change.  However
> > this reflects the current state of locking in UVM and helps me shrink
> > my diff.
> > 
> > ok?
> 
> This means we no longer call uvm_pause() for these, but I believe the
> main reason for calling uvm_pause() is to prevent us from holding the
> kernel lock for too long.  So I think that's fine.
> 
> ok kettenis@

And I guess to allow something else to run if we're on a single
processor system, which I don't think is a huge concern.

ok jmatthew@

> 
> 
> > Index: uvm/uvm_map.c
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> > retrieving revision 1.275
> > diff -u -p -r1.275 uvm_map.c
> > --- uvm/uvm_map.c   22 May 2021 08:38:29 -  1.275
> > +++ uvm/uvm_map.c   14 Jun 2021 09:32:04 -
> > @@ -1571,10 +1571,16 @@ uvm_unmap_detach(struct uvm_map_deadq *d
> >  
> > TAILQ_FOREACH_SAFE(entry, deadq, dfree.deadq, tmp) {
> > /* Skip entries for which we have to grab the kernel lock. */
> > -   if (entry->aref.ar_amap || UVM_ET_ISSUBMAP(entry) ||
> > -   UVM_ET_ISOBJ(entry))
> > +   if (UVM_ET_ISSUBMAP(entry) || UVM_ET_ISOBJ(entry))
> > continue;
> >  
> > +   /* Drop reference to amap, if we've got one. */
> > +   if (entry->aref.ar_amap)
> > +   amap_unref(entry->aref.ar_amap,
> > +   entry->aref.ar_pageoff,
> > +   atop(entry->end - entry->start),
> > +   flags & AMAP_REFALL);
> > +
> > TAILQ_REMOVE(deadq, entry, dfree.deadq);
> > uvm_mapent_free(entry);
> > }
> > @@ -1586,12 +1592,6 @@ uvm_unmap_detach(struct uvm_map_deadq *d
> > while ((entry = TAILQ_FIRST(deadq)) != NULL) {
> > if (waitok)
> > uvm_pause();
> > -   /* Drop reference to amap, if we've got one. */
> > -   if (entry->aref.ar_amap)
> > -   amap_unref(entry->aref.ar_amap,
> > -   entry->aref.ar_pageoff,
> > -   atop(entry->end - entry->start),
> > -   flags & AMAP_REFALL);
> >  
> > /* Drop reference to our backing object, if we've got one. */
> > if (UVM_ET_ISSUBMAP(entry)) {
> > 
> > 
> 



Re: nvme(4): fix prpl sync length

2021-05-31 Thread Jonathan Matthew
On Tue, Jun 01, 2021 at 08:24:10AM +1000, David Gwynne wrote:
> 
> 
> > On 1 Jun 2021, at 04:17, Patrick Wildt  wrote:
> > 
> > Hi,
> > 
> > this call to sync the DMA mem wants to sync N - 1 prpl entries, as
> > the first segment is configured regularly, while the addresses for
> > the following segments (if there are more than 2) are in a special
> > DMA memory region.
> > 
> > The code currently subtracts a single byte instead of an entry.
> > This just means that it is syncing more than it should.
> 
> nice.
> 
> > ok?
> 
> ok.

ok by me too.
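
(To spell out the off-by-one: with 8-byte prpl entries and
dmap->dm_nsegs == 4, the old expression synced 8 * 4 - 1 = 31 bytes,
while the intended length is 8 * (4 - 1) = 24 bytes.)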

> 
> > 
> > Patrick
> > 
> > diff --git a/sys/dev/ic/nvme.c b/sys/dev/ic/nvme.c
> > index 62b8e40c626..6db25260ef0 100644
> > --- a/sys/dev/ic/nvme.c
> > +++ b/sys/dev/ic/nvme.c
> > @@ -629,7 +629,7 @@ nvme_scsi_io(struct scsi_xfer *xs, int dir)
> > bus_dmamap_sync(sc->sc_dmat,
> > NVME_DMA_MAP(sc->sc_ccb_prpls),
> > ccb->ccb_prpl_off,
> > -   sizeof(*ccb->ccb_prpl) * dmap->dm_nsegs - 1,
> > +   sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1),
> > BUS_DMASYNC_PREWRITE);
> > }
> > 
> > @@ -691,7 +691,7 @@ nvme_scsi_io_done(struct nvme_softc *sc, struct nvme_ccb *ccb,
> > bus_dmamap_sync(sc->sc_dmat,
> > NVME_DMA_MAP(sc->sc_ccb_prpls),
> > ccb->ccb_prpl_off,
> > -   sizeof(*ccb->ccb_prpl) * dmap->dm_nsegs - 1,
> > +   sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1),
> > BUS_DMASYNC_POSTWRITE);
> > }
> > 
> > 
> 



Re: mcx(4): sync only received length on RX

2021-05-31 Thread Jonathan Matthew
On Tue, Jun 01, 2021 at 08:20:43AM +1000, David Gwynne wrote:
> 
> 
> > On 1 Jun 2021, at 04:15, Patrick Wildt  wrote:
> > 
> > Hi,
> > 
> > mcx(4) seems to sync the whole mapsize on processing a received packet.
> > As far as I know, we usually only sync the actual size that we have
> > received.  Noticed this when doing bounce buffer tests, seeing that
> > it copied a lot more data than is necessary.
> > 
> > That's because the RX buffer size is the maximum supported MTU, which is
> > about 9500 bytes or so.  For small packets, or regular 1500 bytes,
> > this adds overhead.
> > 
> > This change should not change anything for ARM machines that have a
> > cache coherent PCIe bus or x86.
> > 
> > ok?
> 
> ok.

ok by me too.

> 
> > 
> > Patrick
> > 
> > diff --git a/sys/dev/pci/if_mcx.c b/sys/dev/pci/if_mcx.c
> > index 38437e54897..065855d46d3 100644
> > --- a/sys/dev/pci/if_mcx.c
> > +++ b/sys/dev/pci/if_mcx.c
> > @@ -6800,20 +6800,20 @@ mcx_process_rx(struct mcx_softc *sc, struct mcx_rx *rx,
> > {
> > struct mcx_slot *ms;
> > struct mbuf *m;
> > -   uint32_t flags;
> > +   uint32_t flags, len;
> > int slot;
> > 
> > +   len = bemtoh32(&cqe->cq_byte_cnt);
> > slot = betoh16(cqe->cq_wqe_count) % (1 << MCX_LOG_RQ_SIZE);
> > 
> > ms = &rx->rx_slots[slot];
> > -   bus_dmamap_sync(sc->sc_dmat, ms->ms_map, 0, ms->ms_map->dm_mapsize,
> > -   BUS_DMASYNC_POSTREAD);
> > +   bus_dmamap_sync(sc->sc_dmat, ms->ms_map, 0, len, BUS_DMASYNC_POSTREAD);
> > bus_dmamap_unload(sc->sc_dmat, ms->ms_map);
> > 
> > m = ms->ms_m;
> > ms->ms_m = NULL;
> > 
> > -   m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt);
> > +   m->m_pkthdr.len = m->m_len = len;
> > 
> > if (cqe->cq_rx_hash_type) {
> > m->m_pkthdr.ph_flowid = betoh32(cqe->cq_rx_hash);
> > 
> 



Re: [External] : arp mbuf queue

2021-04-25 Thread Jonathan Matthew
On Sun, Apr 25, 2021 at 09:44:16AM +0200, Alexandr Nedvedicky wrote:
> Hello,
> 
> I think this should go in as-is. Though I have one question/idea
> to share at the moment.
> 
> 
> > @@ -672,20 +666,18 @@ arpcache(struct ifnet *ifp, struct ether
> >  
> > la->la_asked = 0;
> > la->la_refreshed = 0;
> > -   while ((len = ml_len(&la->la_ml)) != 0) {
> > -   struct mbuf *mh;
> > +   while ((m = mq_dequeue(&la->la_mq)) != NULL) {
> > +   unsigned int len;
> >  
> > -   mh = ml_dequeue(&la->la_ml);
> > -   la_hold_total--;
> > +   atomic_dec_int(&la_hold_total);
> > +   len = mq_len(&la->la_mq);
> >  
> > -   ifp->if_output(ifp, mh, rt_key(rt), rt);
> > +   ifp->if_output(ifp, m, rt_key(rt), rt);
> >  
> > -   if (ml_len(&la->la_ml) == len) {
> > +   /* XXXSMP we discard if other CPU enqueues */
> > +   if (mq_len(&la->la_mq) > len) {
> > /* mbuf is back in queue. Discard. */
> > -   while ((mh = ml_dequeue(&la->la_ml)) != NULL) {
> > -   la_hold_total--;
> > -   m_freem(mh);
> > -   }
> > +   atomic_sub_int(&la_hold_total, mq_purge(&la->la_mq));
> > break;
> > }
> 
> would it make sense to have let's say
> 
>   mq_move2mlist(struct mbuf_queue *, struct mbuf_list *)

This already exists, it's called mq_delist()

> 
> This would allow as to move whole globally visible la->la_mq into
> into mbuf list, which will be a local variable. This way we won't
> need to jump on la->la_mq's mutex with every loop iteration.
> 
> If it makes sense, we can do it as a follow-up change.

We'd need some other way to do the 'mbuf is back in queue' detection,
but I agree this seems like a sensible thing to do.
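
Roughly what I'd expect that to look like (an untested sketch, with the
la_hold_total accounting left out):

	struct mbuf_list ml;
	struct mbuf *m;

	mq_delist(&la->la_mq, &ml);
	while ((m = ml_dequeue(&ml)) != NULL)
		ifp->if_output(ifp, m, rt_key(rt), rt);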

> 
> 
> thanks and
> regards
> sashan
> 



Re: rge(4): move tx/rx descriptors into their own structs

2021-03-29 Thread Jonathan Matthew
On Thu, Mar 25, 2021 at 05:21:38PM +0800, Kevin Lo wrote:
> Hi,
> 
> The diff below moves tx/rx descriptors into their own structs.
> This is a first step toward making rge work with multiple queues and 
> interrupts.
> Only one queue is currently used.
> 
> While here, update the RTL8125B microcode.

I can't really comment on the magic numbers, but the struct reorganisation
looks good to me, ok jmatthew@

> 
> Index: sys/dev/pci/if_rge.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_rge.c,v
> retrieving revision 1.12
> diff -u -p -u -p -r1.12 if_rge.c
> --- sys/dev/pci/if_rge.c  11 Feb 2021 16:22:06 -  1.12
> +++ sys/dev/pci/if_rge.c  25 Mar 2021 09:14:17 -
> @@ -61,7 +61,7 @@ int rge_match(struct device *, void *, 
>  void rge_attach(struct device *, struct device *, void *);
>  int  rge_activate(struct device *, int);
>  int  rge_intr(void *);
> -int  rge_encap(struct rge_softc *, struct mbuf *, int);
> +int  rge_encap(struct rge_queues *, struct mbuf *, int);
>  int  rge_ioctl(struct ifnet *, u_long, caddr_t);
>  void rge_start(struct ifqueue *);
>  void rge_watchdog(struct ifnet *);
> @@ -70,13 +70,13 @@ void  rge_stop(struct ifnet *);
>  int  rge_ifmedia_upd(struct ifnet *);
>  void rge_ifmedia_sts(struct ifnet *, struct ifmediareq *);
>  int  rge_allocmem(struct rge_softc *);
> -int  rge_newbuf(struct rge_softc *);
> -void rge_discard_rxbuf(struct rge_softc *, int);
> -void rge_rx_list_init(struct rge_softc *);
> -void rge_tx_list_init(struct rge_softc *);
> -void rge_fill_rx_ring(struct rge_softc *);
> -int  rge_rxeof(struct rge_softc *);
> -int  rge_txeof(struct rge_softc *);
> +int  rge_newbuf(struct rge_queues *);
> +void rge_discard_rxbuf(struct rge_queues *, int);
> +void rge_rx_list_init(struct rge_queues *);
> +void rge_tx_list_init(struct rge_queues *);
> +void rge_fill_rx_ring(struct rge_queues *);
> +int  rge_rxeof(struct rge_queues *);
> +int  rge_txeof(struct rge_queues *);
>  void rge_reset(struct rge_softc *);
>  void rge_iff(struct rge_softc *);
>  void rge_set_phy_power(struct rge_softc *, int);
> @@ -159,6 +159,7 @@ rge_attach(struct device *parent, struct
>   pci_intr_handle_t ih;
>   const char *intrstr = NULL;
>   struct ifnet *ifp;
> + struct rge_queues *q;
>   pcireg_t reg;
>   uint32_t hwrev;
>   uint8_t eaddr[ETHER_ADDR_LEN];
> @@ -184,6 +185,17 @@ rge_attach(struct device *parent, struct
>   }
>   }
>  
> + q = malloc(sizeof(struct rge_queues), M_DEVBUF, M_NOWAIT | M_ZERO);
> + if (q == NULL) {
> + printf(": unable to allocate queue memory\n");
> + return;
> + }
> + q->q_sc = sc;
> + q->q_index = 0;
> +
> + sc->sc_queues = q;
> + sc->sc_nqueues = 1;
> +
>   /* 
>* Allocate interrupt.
>*/
> @@ -323,9 +335,10 @@ int
>  rge_intr(void *arg)
>  {
>   struct rge_softc *sc = arg;
> + struct rge_queues *q = sc->sc_queues;
>   struct ifnet *ifp = &sc->sc_arpcom.ac_if;
>   uint32_t status;
> - int claimed = 0, rx, tx;
> + int claimed = 0, rv;
>  
>   if (!(ifp->if_flags & IFF_RUNNING))
>   return (0);
> @@ -345,29 +358,21 @@ rge_intr(void *arg)
>   if (status & RGE_ISR_PCS_TIMEOUT)
>   claimed = 1;
>  
> - rx = tx = 0;
> + rv = 0;
>   if (status & sc->rge_intrs) {
> - if (status &
> - (sc->rge_rx_ack | RGE_ISR_RX_ERR | RGE_ISR_RX_FIFO_OFLOW)) {
> - rx |= rge_rxeof(sc);
> - claimed = 1;
> - }
> -
> - if (status & (sc->rge_tx_ack | RGE_ISR_TX_ERR)) {
> - tx |= rge_txeof(sc);
> - claimed = 1;
> - }
> + rv |= rge_rxeof(q);
> + rv |= rge_txeof(q);
>  
>   if (status & RGE_ISR_SYSTEM_ERR) {
>   KERNEL_LOCK();
>   rge_init(ifp);
>   KERNEL_UNLOCK();
> - claimed = 1;
>   }
> + claimed = 1;
>   }
>  
>   if (sc->rge_timerintr) {
> - if ((tx | rx) == 0) {
> + if (!rv) {
>   /*
>* Nothing needs to be processed, fallback
>* to use TX/RX interrupts.
> @@ -379,11 +384,11 @@ rge_intr(void *arg)
>* race introduced by changing interrupt
>* masks.
>*/
> - rge_rxeof(sc);
> - rge_txeof(sc);
> + rge_rxeof(q);
> + rge_txeof(q);
>   } else
>   RGE_WRI

Re: btrace: add dry run mode

2021-03-19 Thread Jonathan Matthew
On Fri, Mar 19, 2021 at 08:24:12AM -0600, Todd C. Miller wrote:
> On Fri, 19 Mar 2021 13:22:35 +0100, Klemens Nanni wrote:
> 
> > I argue it should be `-n' like all the daemons, e.g. vmd(8) and other
> > parsers such as pfctl(8) do.
> 
> Yes, please.  I was about to make the same point.

Fair enough.  I started with -d because that's what bpftrace has, but
changing to -n for consistency makes sense to me.


Index: btrace.8
===
RCS file: /cvs/src/usr.sbin/btrace/btrace.8,v
retrieving revision 1.2
diff -u -p -u -p -r1.2 btrace.8
--- btrace.811 Sep 2020 08:16:15 -  1.2
+++ btrace.820 Mar 2021 04:38:55 -
@@ -46,6 +46,9 @@ Execute
 .Ar program .
 .It Fl l
 List all available probes.
+.It Fl n
+No action.
+Parse the program and then exit.
 .It Fl p Ar pid
 Enable tracing on the indicated process ID (only one
 .Fl p
Index: btrace.c
===
RCS file: /cvs/src/usr.sbin/btrace/btrace.c,v
retrieving revision 1.29
diff -u -p -u -p -r1.29 btrace.c
--- btrace.c8 Feb 2021 09:46:45 -   1.29
+++ btrace.c20 Mar 2021 04:38:56 -
@@ -124,7 +124,7 @@ main(int argc, char *argv[])
int fd = -1, ch, error = 0;
const char *filename = NULL, *btscript = NULL;
const char *errstr;
-   int showprobes = 0, tracepid = -1;
+   int showprobes = 0, tracepid = -1, noaction = 0;
 
setlocale(LC_ALL, "");
 
@@ -133,7 +133,7 @@ main(int argc, char *argv[])
err(1, "pledge");
 #endif
 
-   while ((ch = getopt(argc, argv, "e:lp:v")) != -1) {
+   while ((ch = getopt(argc, argv, "e:lnp:v")) != -1) {
switch (ch) {
case 'e':
btscript = optarg;
@@ -141,6 +141,9 @@ main(int argc, char *argv[])
case 'l':
showprobes = 1;
break;
+   case 'n':
+   noaction = 1;
+   break;
case 'p':
if (tracepid != -1)
usage();
@@ -178,6 +181,9 @@ main(int argc, char *argv[])
return error;
}
 
+   if (noaction)
+   return error;
+
if (showprobes || g_nprobes > 0) {
fd = open(__PATH_DEVDT, O_RDONLY);
if (fd == -1)
@@ -201,7 +207,7 @@ main(int argc, char *argv[])
 __dead void
 usage(void)
 {
-   fprintf(stderr, "usage: %s [-lv] [-p pid] [-e program|file]\n",
+   fprintf(stderr, "usage: %s [-lnv] [-p pid] [-e program | file]\n",
getprogname());
exit(1);
 }



btrace: add dry run mode

2021-03-19 Thread Jonathan Matthew
I'd like to add some regress tests for the btrace(8) parser.
To do that, it would help to have a dry-run mode where it just parses the
file and exits.

The way I've implemented it here, it exits immediately after parsing, so it
won't open /dev/dt and try to find the probes the program uses.  This means
it doesn't require privileges and can be run on kernels without dt(4)
compiled in, but it can't catch typos in probe names.

ok?

Index: btrace.8
===
RCS file: /cvs/src/usr.sbin/btrace/btrace.8,v
retrieving revision 1.2
diff -u -p -u -p -r1.2 btrace.8
--- btrace.811 Sep 2020 08:16:15 -  1.2
+++ btrace.819 Mar 2021 11:22:41 -
@@ -41,6 +41,9 @@ in
 .Pp
 The options are as follows:
 .Bl -tag -width Ds
+.It Fl d
+Dry run.
+Parse the program and then exit.
 .It Fl e Ar program
 Execute
 .Ar program .
Index: btrace.c
===
RCS file: /cvs/src/usr.sbin/btrace/btrace.c,v
retrieving revision 1.26
diff -u -p -u -p -r1.26 btrace.c
--- btrace.c7 Dec 2020 18:28:09 -   1.26
+++ btrace.c19 Mar 2021 11:22:41 -
@@ -123,7 +123,7 @@ main(int argc, char *argv[])
int fd = -1, ch, error = 0;
const char *filename = NULL, *btscript = NULL;
const char *errstr;
-   int showprobes = 0, tracepid = -1;
+   int showprobes = 0, tracepid = -1, dryrun = 0;
 
setlocale(LC_ALL, "");
 
@@ -132,8 +132,11 @@ main(int argc, char *argv[])
err(1, "pledge");
 #endif
 
-   while ((ch = getopt(argc, argv, "e:lp:v")) != -1) {
+   while ((ch = getopt(argc, argv, "de:lp:v")) != -1) {
switch (ch) {
+   case 'd':
+   dryrun = 1;
+   break;
case 'e':
btscript = optarg;
break;
@@ -177,6 +180,9 @@ main(int argc, char *argv[])
return error;
}
 
+   if (dryrun)
+   return error;
+
if (showprobes || g_nprobes > 0) {
fd = open(__PATH_DEVDT, O_RDONLY);
if (fd == -1)
@@ -200,7 +206,7 @@ main(int argc, char *argv[])
 __dead void
 usage(void)
 {
-   fprintf(stderr, "usage: %s [-lv] [-p pid] [-e program|file]\n",
+   fprintf(stderr, "usage: %s [-dlv] [-p pid] [-e program|file]\n",
getprogname());
exit(1);
 }



relayd check script memory explosion

2021-02-14 Thread Jonathan Matthew
It's fairly easy to accidentally configure relayd to try to run check scripts
faster than they finish, for example if you have a check interval of one
second and the check script makes a tcp connection to a host that doesn't
exist any more.

In this situation, the hce process will keep writing messages to its imsg
buffer to the parent process asking it to run checks, which causes its memory
usage to grow without bounds.  If the check script starts working again
(or if you change it to just 'exit 0') the parent works its way through the
backlog and memory usage goes back to normal, but ideally relayd would avoid
doing this to itself.

If we don't clear the F_CHECK_SENT and F_CHECK_DONE flags in
hce_launch_checks(), check_script() can use them to figure out if the
last check request it sent for the host has finished yet, so it can avoid
building up a backlog of work for the parent.  The ICMP and script check 
implementations clear these flags as they start checks, and the TCP check
code doesn't use them at all, so this shouldn't affect anything else.

ok?


Index: check_script.c
===
RCS file: /cvs/src/usr.sbin/relayd/check_script.c,v
retrieving revision 1.21
diff -u -p -u -p -r1.21 check_script.c
--- check_script.c  28 May 2017 10:39:15 -  1.21
+++ check_script.c  15 Feb 2021 01:28:54 -
@@ -38,6 +38,9 @@ check_script(struct relayd *env, struct 
struct ctl_scriptscr;
struct table*table;
 
+   if ((host->flags & (F_CHECK_SENT|F_CHECK_DONE)) == F_CHECK_SENT)
+   return;
+
if ((table = table_find(env, host->conf.tableid)) == NULL)
fatalx("%s: invalid table id", __func__);
 
@@ -52,7 +55,9 @@ check_script(struct relayd *env, struct 
fatalx("invalid script path");
memcpy(&scr.timeout, &table->conf.timeout, sizeof(scr.timeout));
 
-   proc_compose(env->sc_ps, PROC_PARENT, IMSG_SCRIPT, &scr, sizeof(scr));
+   if (proc_compose(env->sc_ps, PROC_PARENT, IMSG_SCRIPT, &scr,
+   sizeof(scr)) == 0)
+   host->flags |= F_CHECK_SENT;
 }
 
 void
Index: hce.c
===
RCS file: /cvs/src/usr.sbin/relayd/hce.c,v
retrieving revision 1.79
diff -u -p -u -p -r1.79 hce.c
--- hce.c   6 Aug 2018 17:31:31 -   1.79
+++ hce.c   15 Feb 2021 01:28:54 -
@@ -139,7 +139,6 @@ hce_launch_checks(int fd, short event, v
TAILQ_FOREACH(host, &table->hosts, entry) {
if ((host->flags & F_CHECK_DONE) == 0)
host->he = HCE_INTERVAL_TIMEOUT;
-   host->flags &= ~(F_CHECK_SENT|F_CHECK_DONE);
if (event_initialized(&host->cte.ev)) {
event_del(&host->cte.ev);
close(host->cte.s);



Re: Uninitialized var in dev/pv/vmt.c

2021-02-11 Thread Jonathan Matthew
On Thu, Feb 11, 2021 at 11:41:24AM +, Ricardo Mestre wrote:
> Hi,
> 
> Uninitialized var and it's used in a condition != NULL a little bit 
> afterwards.
> CID 1501713
> 
> OK?

yes, ok jmatthew@

> 
> Index: vmt.c
> ===
> RCS file: /cvs/src/sys/dev/pv/vmt.c,v
> retrieving revision 1.22
> diff -u -p -u -r1.22 vmt.c
> --- vmt.c 15 Jan 2021 06:14:41 -  1.22
> +++ vmt.c 11 Feb 2021 11:35:41 -
> @@ -1289,7 +1289,7 @@ vmt_xdr_nic_info(char *data)
>   struct ifnet *iface;
>   struct vm_nicinfo_nic_list nl;
>   size_t total, nictotal;
> - char *listdata;
> + char *listdata = NULL;
>   int nics;
>  
>   NET_ASSERT_LOCKED();



Re: sleep_setup/finish simplification

2021-01-08 Thread Jonathan Matthew
On Fri, Jan 08, 2021 at 12:59:16PM -0600, Scott Cheloha wrote:
> On Mon, Dec 28, 2020 at 11:41:52AM -0300, Martin Pieuchot wrote:
> > On 08/12/20(Tue) 10:06, Martin Pieuchot wrote:
> > > Diff below aims to simplify the API to put a thread on a sleep queue and
> > > reduce it to the following:
> > > 
> > >   sleep_setup();
> > >   /* check condition or release lock */
> > >   sleep_finish();
> > > 
> > > It is motivated by my work to sleep the SCHED_LOCK() but might as well
> > > prevent/fix some bugs.
> > > 
> > > The tricky part of the current implementation is that sleep_setup_signal()
> > > can already park/stop the current thread resulting in a context change.
> > > Should any custom accounting / lock check happen before that?  At least
> > > two lock primitives do so currently:  drm's schedule_timeout() and
> > > rwlock's rw_enter().
> > > 
> > > As a result of this diff various states can be removed and sleep_finish()
> > > contains the following magic:
> > > 
> > >   1. check for signal/parking
> > >   2. context switch or remove from sleep queue
> > >   3. check for signal/parking
> > > 
> > > Note that sleep_finish() could be simplified even further but I left
> > > that for later to ease the review.
> > > 
> > > Comments?  Oks?
> > 
> > Anyone?
> 
> I really like this simplification.
> 
> It also makes my forthcoming kclock changes to tsleep_nsec(9)/etc.
> simpler, so it's doubly good for me.
> 
> I was hoping someone would step forward and OK this but nobody did, at
> least not publicly.
> 
> I see claudio@ is trying to break off a piece of this for commit in a
> different thread.  Unsure if that means this is dead or just being cut
> up and merged piecemeal.
> 
> FWIW, ok cheloha@.  Obviously you need more OKs.
> 
> Even if this is dead, some other simplification in this vein would be
> nice.

I agree; I read through this, tried to puzzle my way through what would
happen if sleep_setup_signal() slept and figured it was OK.  I don't
think I've written any code using this API so my opinion doesn't count
for much, but I've always found it more complicated than I'd like, and
particularly hard to determine if it's being used correctly, so
simplifying it definitely sounds good to me.



Re: remove vmt(4) (superseeded by open-vm-tools package)

2021-01-08 Thread Jonathan Matthew
On Fri, Jan 08, 2021 at 10:34:02PM +0100, Klemens Nanni wrote:
> The report on bugs shows vmt(4) lagging behind and I sent a working
> open-vm-tools port to ports@ yesterday.
> 
> In case the port gets imported and there are no further regressions wrt.
> the functionality vmt(4) already provides, here's a tentative diff to
> remove the driver entirely.
> 
> Not asking for OKs at this point because the port needs testing and
> I have only tested with it anyway, but vmt(4) supports i386 as well.
> 
> Thoughts?

The reason I work on vmt(4) is so I don't have to run open-vm-tools, so
I don't want to see it removed in favour of open-vm-tools.



btrace: fix parsing of profile:hz:

2021-01-08 Thread Jonathan Matthew
Anton's fix for parsing of syscall names that are also tokens in the btrace
grammar broke parsing of 'profile:hz:number', because it forces 'hz' to be
handled as a string rather than a token.  I can't see how we'd ever end up
with a syscall named 'hz', so one way we could fix this would be to exclude
the HZ token from the lexer backdoor.

ok?

Index: bt_parse.y
===
RCS file: /cvs/src/usr.sbin/btrace/bt_parse.y,v
retrieving revision 1.20
diff -u -p -r1.20 bt_parse.y
--- bt_parse.y  11 Dec 2020 07:27:55 -  1.20
+++ bt_parse.y  8 Jan 2021 21:37:53 -
@@ -792,10 +792,14 @@ again:
/*
 * Probe lexer backdoor, interpret the token as a string
 * rather than a keyword. Otherwise, reserved keywords
-* would conflict with syscall names.
+* would conflict with syscall names. The exception to
+* this is 'hz', which hopefully will never be a
+* syscall.
 */
-   yylval.v.string = kwp->word;
-   return STRING;
+   if (kwp->token != HZ) {
+   yylval.v.string = kwp->word;
+   return STRING;
+   }
}
yylval.v.i = kwp->type;
return kwp->token;



convert i386 fix_f00f() uvm_km_zalloc

2021-01-03 Thread Jonathan Matthew
I don't have a real 586, but I can tell qemu to pretend to be one,
which at least executes this code.

Using kd_waitok here seems suspect, because if we're out of memory
this early I can't see anything else freeing any up, but
uvm_km_zalloc() will also sleep rather than return failure.
Should this use kd_nowait and panic if the allocation fails instead?

ok?
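
For comparison, the kd_nowait variant mentioned above would look
something like this (a sketch only; the diff below keeps the sleeping
allocation):

	va = (vaddr_t)km_alloc(NBPG * 2, &kv_any, &kp_zero, &kd_nowait);
	if (va == 0)
		panic("fix_f00f: no memory for IDT");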

Index: arch/i386/i386/machdep.c
===
RCS file: /cvs/src/sys/arch/i386/i386/machdep.c,v
retrieving revision 1.642
diff -u -p -u -p -r1.642 machdep.c
--- arch/i386/i386/machdep.c28 Dec 2020 14:02:07 -  1.642
+++ arch/i386/i386/machdep.c3 Jan 2021 23:01:34 -
@@ -3100,7 +3100,7 @@ fix_f00f(void)
void *p;
 
/* Allocate two new pages */
-   va = uvm_km_zalloc(kernel_map, NBPG*2);
+   va = (vaddr_t)km_alloc(NBPG*2, &kv_any, &kp_zero, &kd_waitok);
p = (void *)(va + NBPG - 7*sizeof(*idt));
 
/* Copy over old IDT */



convert vga POST uvm_km_vallocs

2021-01-02 Thread Jonathan Matthew
This code is now only here for some unfortunate Intel graphics chips
based on PowerVR, and I don't have a machine with one of those.
vga_post_init() gets called from vga_attach() in any case, and
vga_post_free() doesn't seem to be called at all.  I've booted this on
amd64 (real) and i386 (virtualized) with no problems.

ok?

diff --git sys/arch/amd64/pci/vga_post.c sys/arch/amd64/pci/vga_post.c
index 32876649ddd..36596490d35 100644
--- sys/arch/amd64/pci/vga_post.c
+++ sys/arch/amd64/pci/vga_post.c
@@ -125,13 +125,15 @@ vga_post_init(int bus, int device, int function)
vaddr_t sys_image, sys_bios_data;
int err;
 
-   sys_bios_data = uvm_km_valloc(kernel_map, PAGE_SIZE);
+   sys_bios_data = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none,
+   &kd_nowait);
if (sys_bios_data == 0)
return NULL;
 
-   sys_image = uvm_km_valloc(kernel_map, 1024 * 1024);
+   sys_image = (vaddr_t)km_alloc(1024 * 1024, &kv_any, &kp_none,
+   &kd_nowait);
if (sys_image == 0) {
-   uvm_km_free(kernel_map, sys_bios_data, PAGE_SIZE);
+   km_free((void *)sys_bios_data, PAGE_SIZE, &kv_any, &kp_none);
return NULL;
}
sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
@@ -140,7 +142,7 @@ vga_post_init(int bus, int device, int function)
err = uvm_pglistalloc(BASE_MEMORY, 0, (paddr_t)-1, 0, 0,
&sc->ram_backing, BASE_MEMORY/PAGE_SIZE, UVM_PLA_WAITOK);
if (err) {
-   uvm_km_free(kernel_map, sc->sys_image, 1024 * 1024);
+   km_free((void *)sc->sys_image, 1024 * 1024, &kv_any, &kp_none);
free(sc, M_DEVBUF, sizeof(*sc));
return NULL;
}
@@ -152,7 +154,7 @@ vga_post_init(int bus, int device, int function)
pmap_update(pmap_kernel());
memcpy((void *)sc->bios_data, (void *)sys_bios_data, PAGE_SIZE);
pmap_kremove(sys_bios_data, PAGE_SIZE);
-   uvm_km_free(kernel_map, sys_bios_data, PAGE_SIZE);
+   km_free((void *)sys_bios_data, PAGE_SIZE, &kv_any, &kp_none);
 
iter = 0;
TAILQ_FOREACH(pg, &sc->ram_backing, pageq) {
@@ -209,7 +211,7 @@ vga_post_free(struct vga_post *sc)
 {
uvm_pglistfree(&sc->ram_backing);
pmap_kremove(sc->sys_image, 1024 * 1024);
-   uvm_km_free(kernel_map, sc->sys_image, 1024 * 1024);
+   km_free((void *)sc->sys_image, 1024 * 1024, &kv_any, &kp_none);
pmap_update(pmap_kernel());
free(sc, M_DEVBUF, sizeof(*sc));
 }
diff --git sys/arch/i386/pci/vga_post.c sys/arch/i386/pci/vga_post.c
index c85ee05dcdb..2464fd6019c 100644
--- sys/arch/i386/pci/vga_post.c
+++ sys/arch/i386/pci/vga_post.c
@@ -126,13 +126,15 @@ vga_post_init(int bus, int device, int function)
vaddr_t sys_image, sys_bios_data;
int err;
 
-   sys_bios_data = uvm_km_valloc(kernel_map, PAGE_SIZE);
+   sys_bios_data = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none,
+   &kd_nowait);
if (sys_bios_data == 0)
return NULL;
 
-   sys_image = uvm_km_valloc(kernel_map, 1024 * 1024);
+   sys_image = (vaddr_t)km_alloc(1024 * 1024, &kv_any, &kp_none,
+   &kd_nowait);
if (sys_image == 0) {
-   uvm_km_free(kernel_map, sys_bios_data, PAGE_SIZE);
+   km_free((void *)sys_bios_data, PAGE_SIZE, &kv_any, &kp_none);
return NULL;
}
sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
@@ -141,7 +143,7 @@ vga_post_init(int bus, int device, int function)
err = uvm_pglistalloc(BASE_MEMORY, 0, (paddr_t)-1, 0, 0,
&sc->ram_backing, BASE_MEMORY/PAGE_SIZE, UVM_PLA_WAITOK);
if (err) {
-   uvm_km_free(kernel_map, sc->sys_image, 1024 * 1024);
+   km_free((void *)sc->sys_image, 1024 * 1024, &kv_any, &kp_none);
free(sc, M_DEVBUF, sizeof *sc);
return NULL;
}
@@ -153,7 +155,7 @@ vga_post_init(int bus, int device, int function)
pmap_update(pmap_kernel());
memcpy((void *)sc->bios_data, (void *)sys_bios_data, PAGE_SIZE);
pmap_kremove(sys_bios_data, PAGE_SIZE);
-   uvm_km_free(kernel_map, sys_bios_data, PAGE_SIZE);
+   km_free((void *)sys_bios_data, PAGE_SIZE, &kv_any, &kp_none);
 
iter = 0;
TAILQ_FOREACH(pg, &sc->ram_backing, pageq) {
@@ -211,7 +213,7 @@ vga_post_free(struct vga_post *sc)
uvm_pglistfree(&sc->ram_backing);
pmap_kremove(sc->sys_image, 1024 * 1024);
 
-   uvm_km_free(kernel_map, sc->sys_image, 1024 * 1024);
+   km_free((void *)sc->sys_image, 1024 * 1024, &kv_any, &kp_none);
pmap_update(pmap_kernel());
free(sc, M_DEVBUF, sizeof *sc);
 }



sparc64 cpu uvm_km_valloc()

2020-12-20 Thread Jonathan Matthew
Continuing to convert uvm_km_valloc() calls to km_alloc(), sparc64's
struct cpu_info wants to be allocated on an 8 page boundary, so it needs
a custom kmem_va_mode.  My T5120 didn't blow up with this, so I think it
works.

ok?

Index: arch/sparc64/sparc64/cpu.c
===
RCS file: /cvs/src/sys/arch/sparc64/sparc64/cpu.c,v
retrieving revision 1.71
diff -u -p -u -p -r1.71 cpu.c
--- arch/sparc64/sparc64/cpu.c  31 Jul 2020 11:19:12 -  1.71
+++ arch/sparc64/sparc64/cpu.c  21 Dec 2020 05:12:32 -
@@ -113,6 +113,12 @@ void hummingbird_init(struct cpu_info *c
#define	IU_IMPL(v)	((((u_int64_t)(v))&VER_IMPL) >> VER_IMPL_SHIFT)
#define	IU_VERS(v)	((((u_int64_t)(v))&VER_MASK) >> VER_MASK_SHIFT)
 
+/* virtual address allocation mode for struct cpu_info */
+struct kmem_va_mode kv_cpu_info = {
+   .kv_map = &kernel_map,
+   .kv_align = 8 * PAGE_SIZE
+};
+
 struct cpu_info *
 alloc_cpuinfo(struct mainbus_attach_args *ma)
 {
@@ -137,7 +143,7 @@ alloc_cpuinfo(struct mainbus_attach_args
if (cpi->ci_upaid == portid)
return cpi;
 
-   va = uvm_km_valloc_align(kernel_map, sz, 8 * PAGE_SIZE, 0);
+   va = (vaddr_t)km_alloc(sz, &kv_cpu_info, &kp_none, &kd_nowait);
if (va == 0)
panic("alloc_cpuinfo: no virtual space");
va0 = va;



mpbios: replace uvm_km_valloc() with km_alloc()

2020-12-19 Thread Jonathan Matthew
A few more km_alloc()s following the same pattern as acpi.  I don't have any
machines that actually need mpbios(4) but I've booted amd64 and i386 smp qemu
vms with acpi disabled, which causes mpbios to attach instead.

ok?

Index: arch/amd64/amd64/mpbios.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/mpbios.c,v
retrieving revision 1.29
diff -u -p -u -p -r1.29 mpbios.c
--- arch/amd64/amd64/mpbios.c   7 Feb 2018 06:19:54 -   1.29
+++ arch/amd64/amd64/mpbios.c   19 Dec 2020 09:26:33 -
@@ -240,7 +240,8 @@ mpbios_map(paddr_t pa, int len, struct m
 {
paddr_t pgpa = trunc_page(pa);
paddr_t endpa = round_page(pa + len);
-   vaddr_t va = uvm_km_valloc(kernel_map, endpa - pgpa);
+   vaddr_t va = (vaddr_t)km_alloc(endpa - pgpa, &kv_any, &kp_none,
+   &kd_nowait);
vaddr_t retva = va + (pa & PGOFSET);
 
handle->pa = pa;
@@ -262,7 +263,7 @@ void
 mpbios_unmap(struct mp_map *handle)
 {
pmap_kremove(handle->baseva, handle->vsize);
-   uvm_km_free(kernel_map, handle->baseva, handle->vsize);
+   km_free((void *)handle->baseva, handle->vsize, &kv_any, &kp_none);
 }
 
 /*
Index: arch/i386/i386/mpbios.c
===
RCS file: /cvs/src/sys/arch/i386/i386/mpbios.c,v
retrieving revision 1.41
diff -u -p -u -p -r1.41 mpbios.c
--- arch/i386/i386/mpbios.c 7 Feb 2018 06:19:54 -   1.41
+++ arch/i386/i386/mpbios.c 19 Dec 2020 09:26:33 -
@@ -253,7 +253,8 @@ mpbios_map(paddr_t pa, int len, struct m
 {
paddr_t pgpa = trunc_page(pa);
paddr_t endpa = round_page(pa + len);
-   vaddr_t va = uvm_km_valloc(kernel_map, endpa - pgpa);
+   vaddr_t va = (vaddr_t)km_alloc(endpa - pgpa, &kv_any, &kp_none,
+   &kd_nowait);
vaddr_t retva = va + (pa & PGOFSET);
 
handle->pa = pa;
@@ -275,7 +276,7 @@ void
 mpbios_unmap(struct mp_map *handle)
 {
pmap_kremove(handle->baseva, handle->vsize);
-   uvm_km_free(kernel_map, handle->baseva, handle->vsize);
+   km_free((void *)handle->baseva, handle->vsize, &kv_any, &kp_none);
 }
 
 /*



converting uvm_km_valloc to km_alloc

2020-12-17 Thread Jonathan Matthew
On Wed, Dec 16, 2020 at 12:00:38AM +0100, Mark Kettenis wrote:
> > Date: Tue, 15 Dec 2020 21:21:37 +0100
> > From: Alexander Bluhm 
> > 
> > On Tue, Dec 15, 2020 at 06:57:03PM +0100, Mark Kettenis wrote:
> > > Does the diff below fix this?
> > 
> > I can reproduce the panic and your diff fixes it.
> > 
> > Usually my regress machines do not trigger it as I do not install
> > firmware.  fw_update and reboot makes it crash.
> > 
> > bluhm
> 
> Thanks.  This is committed now.  However, there may be other case
> where we use uvm_km_valloc() early on that will trip over the kernel
> lock assertion that mpi@ added in uvm_km_pgremove().  Ideally we
> should get rid of all the uvm_km_free() calls in the kernel.

Here are a couple of relatively easy ones, applying changes from r1.86 of
amd64's acpi_machdep.c to i386 and arm64.  I've tested i386 but it turns out I
don't have any arm64 machines with acpi.


Index: arch/arm64/arm64/acpi_machdep.c
===
RCS file: /cvs/src/sys/arch/arm64/arm64/acpi_machdep.c,v
retrieving revision 1.10
diff -u -p -u -p -r1.10 acpi_machdep.c
--- arch/arm64/arm64/acpi_machdep.c 6 Dec 2020 21:19:55 -   1.10
+++ arch/arm64/arm64/acpi_machdep.c 18 Dec 2020 00:23:01 -
@@ -74,7 +74,8 @@ acpi_map(paddr_t pa, size_t len, struct 
 {
paddr_t pgpa = trunc_page(pa);
paddr_t endpa = round_page(pa + len);
-   vaddr_t va = uvm_km_valloc(kernel_map, endpa - pgpa);
+   vaddr_t va = (vaddr_t)km_alloc(endpa - pgpa, &kv_any, &kp_none,
+   &kd_nowait);
 
if (va == 0)
return (ENOMEM);
@@ -97,7 +98,7 @@ void
 acpi_unmap(struct acpi_mem_map *handle)
 {
pmap_kremove(handle->baseva, handle->vsize);
-   uvm_km_free(kernel_map, handle->baseva, handle->vsize);
+   km_free((void *)handle->baseva, handle->vsize, &kv_any, &kp_none);
 }
 
 int
Index: arch/i386/i386/acpi_machdep.c
===
RCS file: /cvs/src/sys/arch/i386/i386/acpi_machdep.c,v
retrieving revision 1.74
diff -u -p -u -p -r1.74 acpi_machdep.c
--- arch/i386/i386/acpi_machdep.c   21 Jul 2020 03:48:06 -  1.74
+++ arch/i386/i386/acpi_machdep.c   18 Dec 2020 00:23:01 -
@@ -117,7 +117,8 @@ acpi_map(paddr_t pa, size_t len, struct 
 {
paddr_t pgpa = trunc_page(pa);
paddr_t endpa = round_page(pa + len);
-   vaddr_t va = uvm_km_valloc(kernel_map, endpa - pgpa);
+   vaddr_t va = (vaddr_t)km_alloc(endpa - pgpa, &kv_any, &kp_none,
+   &kd_nowait);
 
if (va == 0)
return (ENOMEM);
@@ -140,7 +141,7 @@ void
 acpi_unmap(struct acpi_mem_map *handle)
 {
pmap_kremove(handle->baseva, handle->vsize);
-   uvm_km_free(kernel_map, handle->baseva, handle->vsize);
+   km_free((void *)handle->baseva, handle->vsize, &kv_any, &kp_none);
 }
 
 int



Re: uvm_fault: entering swap code

2020-12-13 Thread Jonathan Matthew
On Sat, Dec 12, 2020 at 10:54:57PM +1000, Jonathan Matthew wrote:
> On Thu, Dec 10, 2020 at 10:46:58AM -0300, Martin Pieuchot wrote:
> > On 08/12/20(Tue) 22:55, Jonathan Matthew wrote:
> > > On Mon, Dec 07, 2020 at 03:15:50PM -0300, Martin Pieuchot wrote:
> > > > Getting a page from the fault handler might require poking at some
> > > > swap-related states.
> > > > 
> > > > These are not in the hot-path of the fault handler so for the moment
> > > > just assert that the KERNEL_LOCK() is held or grab it if the function
> > > > might be called from an future unlocked path.
> > > > 
> > > > ok?
> > > 
> > > Could you add 'K' to the list of locks in the comment above struct uvmexp 
> > > too?
> > 
> > Updated diff below.
> > 
> > > I went looking for other uses of swpgonly and saw that it's used under
> > > uvm_map_teardown -> uvm_unmap_kill_entry -> uvm_km_pgremove,
> > > and uvm_map_teardown ensures that the kernel lock is not held.
> > > Not related to this diff exactly, but is this something we need to fix?
> > 
> > I suppose that the problem can only occur if a kernel thread is exiting
> > since this code is only executed for the kernel pmap.  Anyway I added an
> > assertion.
> 
> Right, and as I understand it, kernel threads all share the proc0 vm space,
> so its reference count won't ever reach 0, so the kernel map portions of
> uvm_unmap_kill_entry() can't be reached from the reaper.  Looks like this is
> all safe, it just requires a bit more reading than I did the first time.
> I'll see if I can find a way to make it more clear.

And now that I've tested this out and checked that it doesn't blow up when
you drive the machine into swap, ok jmatthew@

> 
> > 
> > Index: uvm/uvm_km.c
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_km.c,v
> > retrieving revision 1.137
> > diff -u -p -r1.137 uvm_km.c
> > --- uvm/uvm_km.c23 May 2020 06:15:09 -  1.137
> > +++ uvm/uvm_km.c10 Dec 2020 13:33:49 -
> > @@ -243,6 +243,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
> > voff_t curoff;
> > int slot;
> >  
> > +   KERNEL_ASSERT_LOCKED();
> > KASSERT(uobj->pgops == &aobj_pager);
> >  
> > for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) {
> > Index: uvm/uvm_swap.c
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
> > retrieving revision 1.147
> > diff -u -p -r1.147 uvm_swap.c
> > --- uvm/uvm_swap.c  29 Sep 2020 11:47:41 -  1.147
> > +++ uvm/uvm_swap.c  10 Dec 2020 13:30:30 -
> > @@ -1403,7 +1403,7 @@ uvm_swap_alloc(int *nslots, boolean_t le
> > /*
> >  * lock data lock, convert slots into blocks, and enter loop
> >  */
> > -
> > +   KERNEL_ASSERT_LOCKED();
> >  ReTry: /* XXXMRG */
> > LIST_FOREACH(spp, &swap_priority, spi_swappri) {
> > TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
> > @@ -1449,8 +1449,10 @@ uvm_swapisfull(void)
> >  {
> > int result;
> >  
> > +   KERNEL_LOCK();
> > KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
> > result = (uvmexp.swpgonly == uvmexp.swpages);
> > +   KERNEL_UNLOCK();
> >  
> > return result;
> >  }
> > @@ -1465,6 +1467,7 @@ uvm_swap_markbad(int startslot, int nslo
> >  {
> > struct swapdev *sdp;
> >  
> > +   KERNEL_LOCK();
> > sdp = swapdrum_getsdp(startslot);
> > if (sdp != NULL) {
> > /*
> > @@ -1475,6 +1478,7 @@ uvm_swap_markbad(int startslot, int nslo
> >  */
> > sdp->swd_npgbad += nslots;
> > }
> > +   KERNEL_UNLOCK();
> >  }
> >  
> >  /*
> > @@ -1501,7 +1505,7 @@ uvm_swap_free(int startslot, int nslots)
> >  * in the extent, and return.   must hold pri lock to do
> >  * lookup and access the extent.
> >  */
> > -
> > +   KERNEL_LOCK();
> > sdp = swapdrum_getsdp(startslot);
> > KASSERT(uvmexp.nswapdev >= 1);
> > KASSERT(sdp != NULL);
> > @@ -1533,6 +1537,7 @@ uvm_swap_free(int startslot, int nslots)
> > }
> > }
> >  #endif /* UVM_SWAP_ENCRYPT */
> > +   KERNEL_UNLOCK();
> >  }
> >  
> >  /*
> > @@ -1567,6 +1572,7 @@ uvm_swap_get(struct vm_page *page, int s
> > return VM_PAGER

Re: uvm_fault: entering swap code

2020-12-12 Thread Jonathan Matthew
On Thu, Dec 10, 2020 at 10:46:58AM -0300, Martin Pieuchot wrote:
> On 08/12/20(Tue) 22:55, Jonathan Matthew wrote:
> > On Mon, Dec 07, 2020 at 03:15:50PM -0300, Martin Pieuchot wrote:
> > > Getting a page from the fault handler might require poking at some
> > > swap-related states.
> > > 
> > > These are not in the hot-path of the fault handler so for the moment
> > > just assert that the KERNEL_LOCK() is held or grab it if the function
> > > might be called from an future unlocked path.
> > > 
> > > ok?
> > 
> > Could you add 'K' to the list of locks in the comment above struct uvmexp 
> > too?
> 
> Updated diff below.
> 
> > I went looking for other uses of swpgonly and saw that it's used under
> > uvm_map_teardown -> uvm_unmap_kill_entry -> uvm_km_pgremove,
> > and uvm_map_teardown ensures that the kernel lock is not held.
> > Not related to this diff exactly, but is this something we need to fix?
> 
> I suppose that the problem can only occur if a kernel thread is exiting
> since this code is only executed for the kernel pmap.  Anyway I added an
> assertion.

Right, and as I understand it, kernel threads all share the proc0 vm space,
so its reference count won't ever reach 0, so the kernel map portions of
uvm_unmap_kill_entry() can't be reached from the reaper.  Looks like this is
all safe; it just requires a bit more reading than I did the first time.
I'll see if I can find a way to make it more clear.

> 
> Index: uvm/uvm_km.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_km.c,v
> retrieving revision 1.137
> diff -u -p -r1.137 uvm_km.c
> --- uvm/uvm_km.c  23 May 2020 06:15:09 -  1.137
> +++ uvm/uvm_km.c  10 Dec 2020 13:33:49 -
> @@ -243,6 +243,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
>   voff_t curoff;
>   int slot;
>  
> + KERNEL_ASSERT_LOCKED();
>   KASSERT(uobj->pgops == &aobj_pager);
>  
>   for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) {
> Index: uvm/uvm_swap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
> retrieving revision 1.147
> diff -u -p -r1.147 uvm_swap.c
> --- uvm/uvm_swap.c29 Sep 2020 11:47:41 -  1.147
> +++ uvm/uvm_swap.c10 Dec 2020 13:30:30 -
> @@ -1403,7 +1403,7 @@ uvm_swap_alloc(int *nslots, boolean_t le
>   /*
>* lock data lock, convert slots into blocks, and enter loop
>*/
> -
> + KERNEL_ASSERT_LOCKED();
>  ReTry:   /* XXXMRG */
>   LIST_FOREACH(spp, &swap_priority, spi_swappri) {
>   TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
> @@ -1449,8 +1449,10 @@ uvm_swapisfull(void)
>  {
>   int result;
>  
> + KERNEL_LOCK();
>   KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
>   result = (uvmexp.swpgonly == uvmexp.swpages);
> + KERNEL_UNLOCK();
>  
>   return result;
>  }
> @@ -1465,6 +1467,7 @@ uvm_swap_markbad(int startslot, int nslo
>  {
>   struct swapdev *sdp;
>  
> + KERNEL_LOCK();
>   sdp = swapdrum_getsdp(startslot);
>   if (sdp != NULL) {
>   /*
> @@ -1475,6 +1478,7 @@ uvm_swap_markbad(int startslot, int nslo
>*/
>   sdp->swd_npgbad += nslots;
>   }
> + KERNEL_UNLOCK();
>  }
>  
>  /*
> @@ -1501,7 +1505,7 @@ uvm_swap_free(int startslot, int nslots)
>* in the extent, and return.   must hold pri lock to do
>* lookup and access the extent.
>*/
> -
> + KERNEL_LOCK();
>   sdp = swapdrum_getsdp(startslot);
>   KASSERT(uvmexp.nswapdev >= 1);
>   KASSERT(sdp != NULL);
> @@ -1533,6 +1537,7 @@ uvm_swap_free(int startslot, int nslots)
>   }
>   }
>  #endif /* UVM_SWAP_ENCRYPT */
> + KERNEL_UNLOCK();
>  }
>  
>  /*
> @@ -1567,6 +1572,7 @@ uvm_swap_get(struct vm_page *page, int s
>   return VM_PAGER_ERROR;
>   }
>  
> + KERNEL_LOCK();
>   /* this page is (about to be) no longer only in swap. */
>   uvmexp.swpgonly--;
>  
> @@ -1577,7 +1583,7 @@ uvm_swap_get(struct vm_page *page, int s
>   /* oops, the read failed so it really is still only in swap. */
>   uvmexp.swpgonly++;
>   }
> -
> + KERNEL_UNLOCK();
>   return (result);
>  }
>  
> @@ -1599,6 +1605,8 @@ uvm_swap_io(struct vm_page **pps, int st
>   struct swapdev *sdp;
>   int encrypt = 0;
>  #endif
> +
> + KERNEL_ASSERT_LOCKED();
>  
> 	write = (flags & B_READ) == 0;

Re: uvm_fault: entering swap code

2020-12-08 Thread Jonathan Matthew
On Mon, Dec 07, 2020 at 03:15:50PM -0300, Martin Pieuchot wrote:
> Getting a page from the fault handler might require poking at some
> swap-related states.
> 
> These are not in the hot-path of the fault handler so for the moment
> just assert that the KERNEL_LOCK() is held or grab it if the function
> might be called from a future unlocked path.
> 
> ok?

Could you add 'K' to the list of locks in the comment above struct uvmexp too?

I went looking for other uses of swpgonly and saw that it's used under
uvm_map_teardown -> uvm_unmap_kill_entry -> uvm_km_pgremove,
and uvm_map_teardown ensures that the kernel lock is not held.
Not related to this diff exactly, but is this something we need to fix?


> 
> Index: uvm/uvm_swap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
> retrieving revision 1.147
> diff -u -p -r1.147 uvm_swap.c
> --- uvm/uvm_swap.c29 Sep 2020 11:47:41 -  1.147
> +++ uvm/uvm_swap.c7 Dec 2020 18:07:03 -
> @@ -1403,7 +1403,7 @@ uvm_swap_alloc(int *nslots, boolean_t le
>   /*
>* lock data lock, convert slots into blocks, and enter loop
>*/
> -
> + KERNEL_ASSERT_LOCKED();
>  ReTry:   /* XXXMRG */
>   LIST_FOREACH(spp, &swap_priority, spi_swappri) {
>   TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
> @@ -1449,8 +1449,10 @@ uvm_swapisfull(void)
>  {
>   int result;
>  
> + KERNEL_LOCK();
>   KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
>   result = (uvmexp.swpgonly == uvmexp.swpages);
> + KERNEL_UNLOCK();
>  
>   return result;
>  }
> @@ -1465,6 +1467,7 @@ uvm_swap_markbad(int startslot, int nslo
>  {
>   struct swapdev *sdp;
>  
> + KERNEL_LOCK();
>   sdp = swapdrum_getsdp(startslot);
>   if (sdp != NULL) {
>   /*
> @@ -1475,6 +1478,7 @@ uvm_swap_markbad(int startslot, int nslo
>*/
>   sdp->swd_npgbad += nslots;
>   }
> + KERNEL_UNLOCK();
>  }
>  
>  /*
> @@ -1501,7 +1505,7 @@ uvm_swap_free(int startslot, int nslots)
>* in the extent, and return.   must hold pri lock to do
>* lookup and access the extent.
>*/
> -
> + KERNEL_LOCK();
>   sdp = swapdrum_getsdp(startslot);
>   KASSERT(uvmexp.nswapdev >= 1);
>   KASSERT(sdp != NULL);
> @@ -1533,6 +1537,7 @@ uvm_swap_free(int startslot, int nslots)
>   }
>   }
>  #endif /* UVM_SWAP_ENCRYPT */
> + KERNEL_UNLOCK();
>  }
>  
>  /*
> @@ -1567,6 +1572,7 @@ uvm_swap_get(struct vm_page *page, int s
>   return VM_PAGER_ERROR;
>   }
>  
> + KERNEL_LOCK();
>   /* this page is (about to be) no longer only in swap. */
>   uvmexp.swpgonly--;
>  
> @@ -1577,7 +1583,7 @@ uvm_swap_get(struct vm_page *page, int s
>   /* oops, the read failed so it really is still only in swap. */
>   uvmexp.swpgonly++;
>   }
> -
> + KERNEL_UNLOCK();
>   return (result);
>  }
>  
> @@ -1599,6 +1605,8 @@ uvm_swap_io(struct vm_page **pps, int st
>   struct swapdev *sdp;
>   int encrypt = 0;
>  #endif
> +
> + KERNEL_ASSERT_LOCKED();
>  
>   write = (flags & B_READ) == 0;
>   async = (flags & B_ASYNC) != 0;
> Index: uvm/uvmexp.h
> ===
> RCS file: /cvs/src/sys/uvm/uvmexp.h,v
> retrieving revision 1.6
> diff -u -p -r1.6 uvmexp.h
> --- uvm/uvmexp.h  1 Dec 2020 13:56:22 -   1.6
> +++ uvm/uvmexp.h  7 Dec 2020 18:09:06 -
> @@ -79,9 +79,9 @@ struct uvmexp {
>  
>   /* swap */
>   int nswapdev;   /* number of configured swap devices in system */
> - int swpages;/* number of PAGE_SIZE'ed swap pages */
> + int swpages;/* [K] number of PAGE_SIZE'ed swap pages */
>   int swpginuse;  /* number of swap pages in use */
> - int swpgonly;   /* number of swap pages in use, not also in RAM */
> + int swpgonly;   /* [K] number of swap pages in use, not also in RAM */
>   int nswget; /* number of swap pages moved from disk to RAM */
>   int nanon;  /* XXX number total of anon's in system */
>   int unused05;   /* formerly nanonneeded */
> 



Re: uvm_fault: kill goto in uvm_fault()

2020-12-08 Thread Jonathan Matthew
On Mon, Dec 07, 2020 at 04:08:46PM -0300, Martin Pieuchot wrote:
> Diff below rewrites uvm_fault() using a loop.
> 
> I added a KERNEL_LOCK/UNLOCK() dance around the part that won't be
> unlocked soon to illustrate where this is going.
> 
> ok?

yes, ok jmatthew@

> 
> Index: uvm/uvm_fault.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> retrieving revision 1.108
> diff -u -p -r1.108 uvm_fault.c
> --- uvm/uvm_fault.c   19 Nov 2020 17:06:40 -  1.108
> +++ uvm/uvm_fault.c   7 Dec 2020 18:20:16 -
> @@ -907,7 +907,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
>   boolean_t shadowed;
>   struct vm_anon *anons_store[UVM_MAXRANGE], **anons;
>   struct vm_page *pages[UVM_MAXRANGE];
> - int error;
> + int error = ERESTART;
>  
>   uvmexp.faults++;/* XXX: locking? */
>   TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
> @@ -923,43 +923,32 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
>   flt.narrow = FALSE; /* normal fault */
>  
>  
> - /* "goto ReFault" means restart the page fault from ground zero. */
> -ReFault:
> - anons = anons_store;
> -
> - error = uvm_fault_check(&ufi, &flt, &anons, access_type);
> - switch (error) {
> - case 0:
> - break;
> - case ERESTART:
> - goto ReFault;
> - default:
> - return error;
> - }
> -
> - /* (shadowed == TRUE) if there is an anon at the faulting address */
> - shadowed = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
> -
> - /* handle case 1: fault on an anon in our amap */
> - if (shadowed == TRUE) {
> - error = uvm_fault_upper(&ufi, &flt, anons, fault_type,
> - access_type);
> - switch (error) {
> - case ERESTART:
> - goto ReFault;
> - default:
> - return error;
> + /*
> +  * ReFault
> +  */
> + while (error == ERESTART) {
> + anons = anons_store;
> +
> + error = uvm_fault_check(&ufi, &flt, &anons, access_type);
> + if (error != 0)
> + continue;
> +
> + /* True if there is an anon at the faulting address */
> + shadowed = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
> + if (shadowed == TRUE) {
> + /* case 1: fault on an anon in our amap */
> + error = uvm_fault_upper(&ufi, &flt, anons, fault_type,
> + access_type);
> + } else {
> + /* case 2: fault on backing object or zero fill */
> + KERNEL_LOCK();
> + error = uvm_fault_lower(&ufi, &flt, pages, fault_type,
> + access_type);
> + KERNEL_UNLOCK();
>   }
>   }
>  
> - /* handle case 2: faulting on backing object or zero fill */
> - error = uvm_fault_lower(&ufi, &flt, pages, fault_type, access_type);
> - switch (error) {
> - case ERESTART:
> - goto ReFault;
> - default:
> - return error;
> - }
> + return error;
>  }
>  
>  int
> 



Re: Use SMR_TAILQ for `ps_threads'

2020-12-05 Thread Jonathan Matthew
On Fri, Dec 04, 2020 at 10:03:46AM -0300, Martin Pieuchot wrote:
> On 04/12/20(Fri) 12:01, Jonathan Matthew wrote:
> > On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> > > [...] 
> > > Could you try the diff below that only call smr_barrier() for multi-
> > > threaded processes with threads still in the list.  I guess this also
> > > answers guenther@'s question.  The same could be done with smr_flush().
> > 
> > This removes the overhead, more or less.  Are we only looking at unlocking
> > access to ps_threads from within a process (not the sysctl or ptrace
> > stuff)?  Otherwise this doesn't seem safe.
> 
> I'd argue that if `ps_threads' is being iterated the CPU doing the
> iteration must already have a reference to the "struct process" so
> the serialization should be done on this reference.

Sounds reasonable to me.
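
A rough sketch of what an unlocked iteration would then look like with the
SMR read side (illustrative only, not part of this diff; it assumes the
iterating code holds its own reference to pr):

	struct proc *p;

	smr_read_enter();
	SMR_TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
		/* examine p; sleeping is not allowed inside the section */
	}
	smr_read_leave();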

> 
> Now I doubt we'll be able to answer all the questions right now.  If we
> can find a path forward that doesn't decrease performances too much and
> allow us to move signal delivery and sleep out of the KERNEL_LOCK()
> that's a huge win.

I think we're at an acceptable performance hit now, so if this lets you
progress with unlocking signal delivery, I'm happy.



Re: srp_finalize(9): tsleep(9) -> tsleep_nsec(9)

2020-12-05 Thread Jonathan Matthew
On Fri, Dec 04, 2020 at 12:17:31PM -0600, Scott Cheloha wrote:
> On Fri, Dec 04, 2020 at 09:56:02AM +0100, Claudio Jeker wrote:
> > On Thu, Dec 03, 2020 at 10:05:30PM -0600, Scott Cheloha wrote:
> > > Hi,
> > > 
> > > srp_finalize(9) uses tsleep(9) to spin while it waits for the object's
> > > refcount to reach zero.  It blocks for up to 1 tick and then checks
> > > the refecount again and again.
> > > 
> > > We can just as easily do this with tsleep_nsec(9) and block for 1
> > > millisecond per interval.
> > > 
> > > ok?
> > > 
> > > Index: kern_srp.c
> > > ===
> > > RCS file: /cvs/src/sys/kern/kern_srp.c,v
> > > retrieving revision 1.12
> > > diff -u -p -r1.12 kern_srp.c
> > > --- kern_srp.c8 Sep 2017 05:36:53 -   1.12
> > > +++ kern_srp.c4 Dec 2020 04:04:39 -
> > > @@ -274,7 +274,7 @@ void
> > >  srp_finalize(void *v, const char *wmesg)
> > >  {
> > >   while (srp_referenced(v))
> > > - tsleep(v, PWAIT, wmesg, 1);
> > > + tsleep_nsec(v, PWAIT, wmesg, MSEC_TO_NSEC(1));
> > >  }
> > >  
> > >  #else /* MULTIPROCESSOR */
> > > 
> > 
> > Why only 1ms instead of the original 10ms (at least on most archs)?
> 
> The underlying implementation can only process timeouts from
> hardclock(9) which runs about hz times per second.  If we tell the
> thread to "sleep for 10ms" it's almost always going to overshoot the
> next hardclock(9) and wind up sleeping ~20ms.
> 
> Some people run with HZ=1000 kernels.  I don't think many people run
> with kernels with a higher HZ than that, though.  So I figure a 1ms
> sleep is "good enough" for all practical kernels.  On HZ=100 kernels
> the thread will oversleep because it doesn't process timeouts often
> enough to honor the 1ms request.
> 
> Basically I'm trying to pick a reasonable polling interval (not too
> fast) that also won't cause the existing default kernel to block for
> longer than it already does (~10ms).  The default kernel is HZ=100, so
> a 1ms sleep will, in this case, almost always sleep ~10ms per
> iteration of this loop.

This sleep should basically be 'as short as possible', since it's waiting
out SRP references, which are very short-lived.  ok jmatthew@
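
As a rough model of the rounding behaviour described above (a sketch, not
the kernel's actual conversion code), a tick-based timeout turns a
nanosecond request into ticks something like this:

	uint64_t
	nsec_to_ticks(uint64_t nsecs, int hz)
	{
		uint64_t tick_nsecs = 1000000000ULL / hz;

		/* round up to whole ticks; any nonzero request sleeps at
		   least one tick */
		return (nsecs + tick_nsecs - 1) / tick_nsecs;
	}

With hz=100 (10ms ticks) a 1ms request still rounds up to a whole tick, so
each loop iteration sleeps until a subsequent hardclock(9) - the ~10ms
behaviour described above.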



Re: Use SMR_TAILQ for `ps_threads'

2020-12-03 Thread Jonathan Matthew
On Wed, Dec 02, 2020 at 07:44:02PM +0100, Anton Lindqvist wrote:
> On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> > On 02/12/20(Wed) 17:27, Jonathan Matthew wrote:
> > > On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> > > > On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > > > > [...] 
> > > > > Did you run a make build with that smr_barrier() in it and checked
> > > > > that it does not cause a slow down? I am sceptical, smr_barrier() is
> > > > > a very slow construct which introduces large delays and should be
> > > > > avoided whenever possible.
> > > > 
> > > > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> > > > below, without noticeable difference.
> > > > 
> > > > I'm happy to hear from sceptical performance checkers :o)
> > > 
> > > On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build
> > > time from ~3m06s to ~3m44s, which seems a bit much to me.
> > 
> > Do you know if this is due to an increase of %spin time?
> > 
> > > Replacing smr_barrier() with smr_flush() reduces the overhead to a
> > > couple of seconds, and it seems warranted here.
> > 
> > Could you try the diff below that only call smr_barrier() for multi-
> > threaded processes with threads still in the list.  I guess this also
> > answers guenther@'s question.  The same could be done with smr_flush().
> 
> I'm wondering if smr_grace_wait() could be improved on amd64, assuming
> SMT is disabled, by skipping offline CPUs.

This doesn't make much of a difference when using smr_barrier(), but with
smr_flush() it removes much of the overhead on this system with 8 of 16 cpus
online.  Of course as Visa and Mark point out this is risky without more
guarantees about what offline cpus are actually doing.  If we start using SMR
in ways that make the delay visible to user processes, it'll probably be worth
looking at.

> 
> Index: kern/kern_smr.c
> ===
> RCS file: /cvs/src/sys/kern/kern_smr.c,v
> retrieving revision 1.8
> diff -u -p -r1.8 kern_smr.c
> --- kern/kern_smr.c   3 Apr 2020 03:36:56 -   1.8
> +++ kern/kern_smr.c   2 Dec 2020 18:41:29 -
> @@ -142,7 +142,7 @@ smr_grace_wait(void)
>  
>   ci_start = curcpu();
>   CPU_INFO_FOREACH(cii, ci) {
> - if (ci == ci_start)
> + if (ci == ci_start || !cpu_is_online(ci))
>   continue;
>   sched_peg_curproc(ci);
>   }
> 



Re: Use SMR_TAILQ for `ps_threads'

2020-12-03 Thread Jonathan Matthew
On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> On 02/12/20(Wed) 17:27, Jonathan Matthew wrote:
> > On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> > > On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > > > [...] 
> > > > Did you run a make build with that smr_barrier() in it and checked
> > > > that it does not cause a slow down? I am sceptical, smr_barrier() is a
> > > > very slow construct which introduces large delays and should be
> > > > avoided whenever possible.
> > > 
> > > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> > > below, without noticeable difference.
> > > 
> > > I'm happy to hear from sceptical performance checkers :o)
> > 
> > On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build
> > time from ~3m06s to ~3m44s, which seems a bit much to me.
> 
> Do you know if this is due to an increase of %spin time?

It actually decreased %spin, and the total system cpu time used during the
build decreased from around 6m30s to around 5m15s, so I think it's mostly
the effect of the delayed wakeup of the SMR thread in smr_dispatch().

There's also this:

$ time sleep 1
0m01.11s real 0m00.00s user 0m00.00s system


> 
> > Replacing smr_barrier() with smr_flush() reduces the overhead to a couple of
> > seconds, and it seems warranted here.
> 
> Could you try the diff below that only call smr_barrier() for multi-
> threaded processes with threads still in the list.  I guess this also
> answers guenther@'s question.  The same could be done with smr_flush().

This removes the overhead, more or less.  Are we only looking at unlocking
access to ps_threads from within a process (not the sysctl or ptrace
stuff)?  Otherwise this doesn't seem safe.



Re: Use SMR_TAILQ for `ps_threads'

2020-12-01 Thread Jonathan Matthew
On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > [...] 
> > Did you run a make build with that smr_barrier() in it and checked that it
> > does not cause a slow down? I am sceptical, smr_barrier() is a very slow
> > construct which introduces large delays and should be avoided whenever
> > possible.
> 
> I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> below, without noticeable difference.
> 
> I'm happy to hear from sceptical performance checkers :o)

On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build
time from ~3m06s to ~3m44s, which seems a bit much to me.

Replacing smr_barrier() with smr_flush() reduces the overhead to a couple of
seconds, and it seems warranted here.

> 
> diff --git lib/libkvm/kvm_proc2.c lib/libkvm/kvm_proc2.c
> index 96f7dc91b92..1f4f9b914bb 100644
> --- lib/libkvm/kvm_proc2.c
> +++ lib/libkvm/kvm_proc2.c
> @@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process 
> *pr,
>   kp.p_pctcpu = 0;
>   kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? SDEAD :
>   SIDL;
> - for (p = TAILQ_FIRST(&process.ps_threads); p != NULL; 
> - p = TAILQ_NEXT(&proc, p_thr_link)) {
> + for (p = SMR_TAILQ_FIRST_LOCKED(&process.ps_threads);
> + p != NULL;
> + p = SMR_TAILQ_NEXT_LOCKED(&proc, p_thr_link)) {
>   if (KREAD(kd, (u_long)p, &proc)) {
>   _kvm_err(kd, kd->program,
>   "can't read proc at %lx",
> @@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process 
> *pr,
>   if (!dothreads)
>   continue;
>  
> - for (p = TAILQ_FIRST(&process.ps_threads); p != NULL; 
> - p = TAILQ_NEXT(&proc, p_thr_link)) {
> + for (p = SMR_TAILQ_FIRST_LOCKED(&process.ps_threads); p != NULL;
> + p = SMR_TAILQ_NEXT_LOCKED(&proc, p_thr_link)) {
>   if (KREAD(kd, (u_long)p, &proc)) {
>   _kvm_err(kd, kd->program,
>   "can't read proc at %lx",
> diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
> index 5e455208663..575273b306c 100644
> --- sys/kern/exec_elf.c
> +++ sys/kern/exec_elf.c
> @@ -85,6 +85,7 @@
>  #include 
>  #include 
>  #include 
> +#include <sys/smr.h>
>  
>  #include 
>  
> @@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void *iocookie, 
> size_t *sizep)
>* threads in the process have been stopped and the list can't
>* change.
>*/
> - TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
> + SMR_TAILQ_FOREACH_LOCKED(q, &pr->ps_threads, p_thr_link) {
>   if (q == p) /* we've taken care of this thread */
>   continue;
>   error = coredump_note_elf(q, iocookie, ¬esize);
> diff --git sys/kern/init_main.c sys/kern/init_main.c
> index fed6be19435..2b657ffe328 100644
> --- sys/kern/init_main.c
> +++ sys/kern/init_main.c
> @@ -519,7 +519,7 @@ main(void *framep)
>*/
>   LIST_FOREACH(pr, &allprocess, ps_list) {
>   nanouptime(&pr->ps_start);
> - TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
> + SMR_TAILQ_FOREACH_LOCKED(p, &pr->ps_threads, p_thr_link) {
>   nanouptime(&p->p_cpu->ci_schedstate.spc_runtime);
>   timespecclear(&p->p_rtime);
>   }
> diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c
> index a20775419e3..3c526ab83b8 100644
> --- sys/kern/kern_exit.c
> +++ sys/kern/kern_exit.c
> @@ -63,6 +63,7 @@
>  #ifdef SYSVSEM
>  #include <sys/sem.h>
>  #endif
> +#include <sys/smr.h>
>  #include 
>  
>  #include 
> @@ -161,7 +162,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>   }
>  
>   /* unlink ourselves from the active threads */
> - TAILQ_REMOVE(&pr->ps_threads, p, p_thr_link);
> + SMR_TAILQ_REMOVE_LOCKED(&pr->ps_threads, p, p_thr_link);
> + smr_barrier();
>   if ((p->p_flag & P_THREAD) == 0) {
>   /* main thread gotta wait because it has the pid, et al */
>   while (pr->ps_refcnt > 1)
> @@ -724,7 +726,7 @@ process_zap(struct process *pr)
>   if (pr->ps_ptstat != NULL)
>   free(pr->ps_ptstat, M_SUBPROC, sizeof(*pr->ps_ptstat));
>   pool_put(&rusage_pool, pr->ps_ru);
> - KASSERT(TAILQ_EMPTY(&pr->ps_threads));
> + KASSERT(SMR_TAILQ_EMPTY_LOCKED(&pr->ps_threads));
>   lim_free(pr->ps_limit);
>   crfree(pr->ps_ucred);
>   pool_put(&process_pool, pr);
> diff --git sys/kern/kern_fork.c sys/kern/kern_fork.c
> index 9fb239bc8b4..e1cb587b2b8 100644
> --- sys/kern/kern_fork.c
> +++ sys/kern/kern_fork.c
> @@ -52,6 +52,7 @@
>  #include 
>  #include 
>  #include 
> +#include <sys/smr.h>
>  #include 
> 

Re: Use SMR_TAILQ for `ps_threads'

2020-12-01 Thread Jonathan Matthew
On Tue, Dec 01, 2020 at 10:31:43AM +0100, Claudio Jeker wrote:
> On Mon, Nov 30, 2020 at 07:10:47PM -0300, Martin Pieuchot wrote:
> > Every multi-threaded process keeps a list of threads in `ps_threads'.
> > This list is iterated in interrupt and process context which makes it
> > complicated to protect it with a rwlock.
> > 
> > One of the places where such iteration is done is inside the tsleep(9)
> > routines, directly in single_thread_check() or via CURSIG().  In order
> > to take this code path out of the KERNEL_LOCK(), claudio@ proposed to
> > use SMR_TAILQ.  This has the advantage of not introducing lock
> > dependencies and allow us to address every iteration one-by-one.
> > 
> > Diff below is a first step into this direction, it replaces the existing
> > TAILQ_* macros by the locked version of SMR_TAILQ*.  This is mostly lifted
> > from claudio@'s diff and should not introduce any side effect.
> > 
> > ok?
> > 
> > diff --git sys/uvm/uvm_glue.c sys/uvm/uvm_glue.c
> > index 390307c4c81..40a10e4c1c5 100644
> > --- sys/uvm/uvm_glue.c
> > +++ sys/uvm/uvm_glue.c
> > @@ -369,7 +369,7 @@ uvm_swapout_threads(void)
> >  * the smallest p_slptime
> >  */
> > slpp = NULL;
> > -   TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
> > +   SMR_TAILQ_FOREACH_LOCKED(p, &pr->ps_threads, p_thr_link) {
> > switch (p->p_stat) {
> > case SRUN:
> > case SONPROC:
> > 
> 
> Why did you not include the smr_call() to safely free struct proc in this
> diff?

I was wondering about this too.  Freeing the struct proc is already delayed
by some amount since it happens in the reaper or in the parent process; does
it make sense to combine that with the SMR wait?



Re: ldapd warning

2020-11-29 Thread Jonathan Matthew
On Sat, Nov 28, 2020 at 11:20:30PM +0100, Theo Buehler wrote:
> /usr/src/usr.sbin/ldapd/util.c:46:21: warning: comparison of integers of 
> different signs:
>   'int' and 'size_t' (aka 'unsigned long') [-Wsign-compare]
> if (ret < 0 || ret >= size)
>~~~ ^  
> 
> This has been around for a while. I forgot that I had this patch in my
> tree.

'size' was cast to int before r1.11 of util.c; I'm not sure why the cast was
removed.  smtpd also has a copy of this function that still has the cast.
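
Either cast avoids the signed/unsigned comparison; roughly (the second
form is what the diff below does, the first is the smtpd-style cast
referred to above):

	if (ret < 0 || ret >= (int)size)	/* smtpd style: cast size down */
		return 0;

	if (ret < 0 || (size_t)ret >= size)	/* proposed: cast ret up */
		return 0;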


> 
> Index: util.c
> ===
> RCS file: /cvs/src/usr.sbin/ldapd/util.c,v
> retrieving revision 1.12
> diff -u -p -r1.12 util.c
> --- util.c24 Oct 2019 12:39:26 -  1.12
> +++ util.c4 Aug 2020 07:14:33 -
> @@ -43,7 +43,7 @@ bsnprintf(char *str, size_t size, const 
>   va_start(ap, format);
>   ret = vsnprintf(str, size, format, ap);
>   va_end(ap);
> - if (ret < 0 || ret >= size)
> + if (ret < 0 || (size_t)ret >= size)
>   return 0;
>  
>   return 1;
> 



Re: Locking of uvm_pageclean()

2020-11-19 Thread Jonathan Matthew
On Wed, Nov 18, 2020 at 08:31:23PM -0300, Martin Pieuchot wrote:
> I found another race related to some missing locking, this time around
> uvm_pageclean().
> 
> Diff below fixes the two places in /sys/uvm where the page queue lock
> should be taken.  To prevent further corruption I added some assertions 
> and documented some global data structures that are currently protected
> by this lock.
> 
> Note that uvm_pagefree() is called by many pmaps most of the time
> without the lock held.  The diff below doesn't fix them and that's why
> some assertions are commented out.
> 
> ok?

It looks like there are a couple of other paths to uvm_pagefree() that
don't take the page queue lock - uvm_km_pgremove_intrsafe() and (on
non-pmap-direct archs) uvm_km_doputpage().

Since that doesn't really affect the diff, and everything else is right
as far as I can tell, ok jmatthew@

> 
> Index: uvm/uvm.h
> ===
> RCS file: /cvs/src/sys/uvm/uvm.h,v
> retrieving revision 1.67
> diff -u -p -r1.67 uvm.h
> --- uvm/uvm.h 6 Dec 2019 08:33:25 -   1.67
> +++ uvm/uvm.h 18 Nov 2020 23:22:15 -
> @@ -44,18 +44,20 @@
>  /*
>   * uvm structure (vm global state: collected in one structure for ease
>   * of reference...)
> + *
> + *  Locks used to protect struct members in this file:
> + *   Q   uvm.pageqlock
>   */
> -
>  struct uvm {
>   /* vm_page related parameters */
>  
>   /* vm_page queues */
> - struct pglist page_active;  /* allocated pages, in use */
> - struct pglist page_inactive_swp;/* pages inactive (reclaim or free) */
> - struct pglist page_inactive_obj;/* pages inactive (reclaim or free) */
> + struct pglist page_active;  /* [Q] allocated pages, in use */
> + struct pglist page_inactive_swp;/* [Q] pages inactive (reclaim/free) */
> + struct pglist page_inactive_obj;/* [Q] pages inactive (reclaim/free) */
>   /* Lock order: pageqlock, then fpageqlock. */
> - struct mutex pageqlock; /* lock for active/inactive page q */
> - struct mutex fpageqlock;/* lock for free page q  + pdaemon */
> + struct mutex pageqlock; /* [] lock for active/inactive page q */
> + struct mutex fpageqlock;/* [] lock for free page q  + pdaemon */
>   boolean_t page_init_done;   /* TRUE if uvm_page_init() finished */
>   struct uvm_pmr_control pmr_control; /* pmemrange data */
>  
> Index: uvm/uvm_anon.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
> retrieving revision 1.49
> diff -u -p -r1.49 uvm_anon.c
> --- uvm/uvm_anon.c4 Jan 2020 16:17:29 -   1.49
> +++ uvm/uvm_anon.c18 Nov 2020 23:22:15 -
> @@ -106,7 +106,9 @@ uvm_anfree_list(struct vm_anon *anon, st
>* clean page, and put on on pglist
>* for later freeing.
>*/
> + uvm_lock_pageq();
>   uvm_pageclean(pg);
> + uvm_unlock_pageq();
>   TAILQ_INSERT_HEAD(pgl, pg, pageq);
>   } else {
>   uvm_lock_pageq();   /* lock out pagedaemon */
> Index: uvm/uvm_object.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_object.c,v
> retrieving revision 1.17
> diff -u -p -r1.17 uvm_object.c
> --- uvm/uvm_object.c  21 Oct 2020 09:08:14 -  1.17
> +++ uvm/uvm_object.c  18 Nov 2020 23:22:15 -
> @@ -172,7 +172,9 @@ uvm_objfree(struct uvm_object *uobj)
>* this pg from the uobj we are throwing away
>*/
>   atomic_clearbits_int(&pg->pg_flags, PG_TABLED);
> + uvm_lock_pageq();
>   uvm_pageclean(pg);
> + uvm_unlock_pageq();
>   TAILQ_INSERT_TAIL(&pgl, pg, pageq);
>   }
>   uvm_pmr_freepageq(&pgl);
> Index: uvm/uvm_page.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_page.c,v
> retrieving revision 1.150
> diff -u -p -r1.150 uvm_page.c
> --- uvm/uvm_page.c22 Sep 2020 14:31:08 -  1.150
> +++ uvm/uvm_page.c18 Nov 2020 23:22:15 -
> @@ -973,6 +973,10 @@ uvm_pageclean(struct vm_page *pg)
>  {
>   u_int flags_to_clear = 0;
>  
> +#if all_pmap_are_fixed
> + MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
> +#endif
> +
>  #ifdef DEBUG
>   if (pg->uobject == (void *)0xdeadbeef &&
>   pg->uanon == (void *)0xdeadbeef) {
> @@ -1037,6 +1041,10 @@ uvm_pageclean(struct vm_page *pg)
>  void
>  uvm_pagefree(struct vm_page *pg)
>  {
> +#if all_pmap_are_fixed
> + MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
> +#endif
> +
>   uvm_pageclean(pg);
>   uvm_pmr_freepages(pg, 1);
>  }
> @@ -1229,6 +1237,8 @@ uvm_pagelookup(struct uvm_object *obj, v
>  void
>  uvm_pagewire(struct vm_page *pg)
>  {
> + MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
> +
>

Re: uvm_fault: refactoring for case 2 faults

2020-11-19 Thread Jonathan Matthew
On Tue, Nov 17, 2020 at 09:25:10AM -0300, Martin Pieuchot wrote:
> Here's another refactoring that moves the remaining logic of uvm_fault()
> handling lower faults, case 2, to its own function.  This logic shouldn't
> be modified in the first step of unlocking amap & anon and will still be
> executed under KERNEL_LOCK().  Having a separate function will however
> help to turn the 'ReFault' goto into a more readable loop.  This will be
> the next step.
> 
> ok?

ok jmatthew@

> 
> Index: uvm/uvm_fault.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> retrieving revision 1.107
> diff -u -p -r1.107 uvm_fault.c
> --- uvm/uvm_fault.c   16 Nov 2020 12:30:16 -  1.107
> +++ uvm/uvm_fault.c   16 Nov 2020 13:27:32 -
> @@ -484,6 +484,9 @@ struct uvm_faultctx {
>   paddr_t pa_flags;
>  };
>  
> +int  uvm_fault_lower(struct uvm_faultinfo *, struct uvm_faultctx *,
> + struct vm_page **, vm_fault_t, vm_prot_t);
> +
>  /*
>   * uvm_fault_check: check prot, handle needs-copy, etc.
>   *
> @@ -901,19 +904,11 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
>  {
>   struct uvm_faultinfo ufi;
>   struct uvm_faultctx flt;
> - boolean_t promote, locked, shadowed;
> - int result, lcv, gotpages;
> - vaddr_t currva;
> - voff_t uoff;
> - struct vm_amap *amap;
> - struct uvm_object *uobj;
> - struct vm_anon *anons_store[UVM_MAXRANGE], **anons, *anon;
> - struct vm_page *pages[UVM_MAXRANGE], *pg, *uobjpage;
> + boolean_t shadowed;
> + struct vm_anon *anons_store[UVM_MAXRANGE], **anons;
> + struct vm_page *pages[UVM_MAXRANGE];
>   int error;
>  
> - anon = NULL;
> - pg = NULL;
> -
>   uvmexp.faults++;/* XXX: locking? */
>   TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
>  
> @@ -957,8 +952,28 @@ ReFault:
>   }
>   }
>  
> - amap = ufi.entry->aref.ar_amap;
> - uobj = ufi.entry->object.uvm_obj;
> + /* handle case 2: faulting on backing object or zero fill */
> + error = uvm_fault_lower(&ufi, &flt, pages, fault_type, access_type);
> + switch (error) {
> + case ERESTART:
> + goto ReFault;
> + default:
> + return error;
> + }
> +}
> +
> +int
> +uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
> +   struct vm_page **pages, vm_fault_t fault_type, vm_prot_t access_type)
> +{
> + struct vm_amap *amap = ufi->entry->aref.ar_amap;
> + struct uvm_object *uobj = ufi->entry->object.uvm_obj;
> + boolean_t promote, locked;
> + int result, lcv, gotpages;
> + struct vm_page *uobjpage, *pg = NULL;
> + struct vm_anon *anon = NULL;
> + vaddr_t currva;
> + voff_t uoff;
>  
>   /*
>* if the desired page is not shadowed by the amap and we have a
> @@ -967,15 +982,15 @@ ReFault:
>* with the usual pgo_get hook).  the backing object signals this by
>* providing a pgo_fault routine.
>*/
> - if (uobj && shadowed == FALSE && uobj->pgops->pgo_fault != NULL) {
> - result = uobj->pgops->pgo_fault(&ufi, flt.startva, pages,
> - flt.npages, flt.centeridx, fault_type, access_type,
> + if (uobj != NULL && uobj->pgops->pgo_fault != NULL) {
> + result = uobj->pgops->pgo_fault(ufi, flt->startva, pages,
> + flt->npages, flt->centeridx, fault_type, access_type,
>   PGO_LOCKED);
>  
>   if (result == VM_PAGER_OK)
>   return (0); /* pgo_fault did pmap enter */
>   else if (result == VM_PAGER_REFAULT)
> - goto ReFault;   /* try again! */
> + return ERESTART;/* try again! */
>   else
>   return (EACCES);
>   }
> @@ -989,20 +1004,20 @@ ReFault:
>*
>* ("get" has the option of doing a pmap_enter for us)
>*/
> - if (uobj && shadowed == FALSE) {
> + if (uobj != NULL) {
>   uvmexp.fltlget++;
> - gotpages = flt.npages;
> - (void) uobj->pgops->pgo_get(uobj, ufi.entry->offset +
> - (flt.startva - ufi.entry->start),
> - pages, &gotpages, flt.centeridx,
> - access_type & MASK(ufi.entry),
> - ufi.entry->advice, PGO_LOCKED);
> + gotpages = flt->npages;
> + (void) uobj->pgops->pgo_get(uobj, ufi->entry->offset +
> + (flt->startva - ufi->entry->start),
> + pages, &gotpages, flt->centeridx,
> + access_type & MASK(ufi->entry),
> + ufi->entry->advice, PGO_LOCKED);
>  
>   /* check for pages to map, if we got any */
>   uobjpage = NULL;
>   if (gotpages) {
> - currva = flt.star

Re: uvm_fault: Kill goto Case2

2020-11-15 Thread Jonathan Matthew
On Fri, Nov 13, 2020 at 12:04:23PM -0300, Martin Pieuchot wrote:
> Another simple refactoring of uvm_fault() removing a goto, ok?

I like it, ok jmatthew@

> 
> Index: uvm/uvm_fault.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> retrieving revision 1.106
> diff -u -p -r1.106 uvm_fault.c
> --- uvm/uvm_fault.c   13 Nov 2020 14:18:25 -  1.106
> +++ uvm/uvm_fault.c   13 Nov 2020 15:01:41 -
> @@ -942,12 +942,24 @@ ReFault:
>   return error;
>   }
>  
> - amap = ufi.entry->aref.ar_amap;
> - uobj = ufi.entry->object.uvm_obj;
> -
>   /* (shadowed == TRUE) if there is an anon at the faulting address */
>   shadowed = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
>  
> + /* handle case 1: fault on an anon in our amap */
> + if (shadowed == TRUE) {
> + error = uvm_fault_upper(&ufi, &flt, anons, fault_type,
> + access_type);
> + switch (error) {
> + case ERESTART:
> + goto ReFault;
> + default:
> + return error;
> + }
> + }
> +
> + amap = ufi.entry->aref.ar_amap;
> + uobj = ufi.entry->object.uvm_obj;
> +
>   /*
>* if the desired page is not shadowed by the amap and we have a
>* backing object, then we check to see if the backing object would
> @@ -1055,30 +1067,12 @@ ReFault:
>   /*
>* note that at this point we are done with any front or back pages.
>* we are now going to focus on the center page (i.e. the one we've
> -  * faulted on).  if we have faulted on the top (anon) layer
> -  * [i.e. case 1], then the anon we want is anons[centeridx] (we have
> -  * not touched it yet).  if we have faulted on the bottom (uobj)
> +  * faulted on).  if we have faulted on the bottom (uobj)
>* layer [i.e. case 2] and the page was both present and available,
>* then we've got a pointer to it as "uobjpage" and we've already
>* made it BUSY.
>*/
> - /*
> -  * there are four possible cases we must address: 1A, 1B, 2A, and 2B
> -  */
> - /* redirect case 2: if we are not shadowed, go to case 2. */
> - if (shadowed == FALSE)
> - goto Case2;
> -
> - /* handle case 1: fault on an anon in our amap */
> - error = uvm_fault_upper(&ufi, &flt, anons, fault_type, access_type);
> - switch (error) {
> - case ERESTART:
> - goto ReFault;
> - default:
> - return error;
> - }
>  
> -Case2:
>   /* handle case 2: faulting on backing object or zero fill */
>   /*
>* note that uobjpage can not be PGO_DONTCARE at this point.  we now
> 



Re: uvm_fault: is there an anon?

2020-11-13 Thread Jonathan Matthew
On Fri, Nov 13, 2020 at 12:17:04PM +0100, Theo Buehler wrote:
> On Wed, Nov 04, 2020 at 11:04:12AM -0300, Martin Pieuchot wrote:
> > Diff below introduces a helper that looks for existing mapping.  The
> > value returned by this lookup function determines if there's an anon
> > at the faulting address which tells us if we're dealign with a fault
> > of type 1 or 2.
> > 
> > This small refactoring is part of the current work to separate the code
> > handling faults of type 1 and 2.  The end goal being to move the type 1
> > faults handling out of the KERNEL_LOCK().
> > 
> > The function name is taken from NetBSD to not introduce more difference
> > than there's already.
> > 
> > ok?
> 
> ok tb.
> 
> I've been running the three diffs for two days and this went through two
> 'make release'
> 

Same here.  NetBSD's uvm fault handler is a lot more readable than ours, so
heading in that direction seems like a pretty good idea, ok jmatthew@



Re: amap: introduce amap_adjref_anons()

2020-11-12 Thread Jonathan Matthew
On Fri, Oct 30, 2020 at 08:46:20PM +0100, Martin Pieuchot wrote:
> On 23/10/20(Fri) 10:31, Martin Pieuchot wrote:
> > More refactoring.  This time let's introduce a helper to manipulate
> > references.  The goal is to reduce the upcoming diff adding locking.
> > 
> > This is extracted from a bigger diff from guenther@ as well as some
> > bits from NetBSD.
> 
> Now with the correct diff, ok?

This looks good to me (and survived a couple of full builds on amd64),
ok jmatthew@



> 
> Index: uvm/uvm_amap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
> retrieving revision 1.85
> diff -u -p -r1.85 uvm_amap.c
> --- uvm/uvm_amap.c12 Oct 2020 08:44:45 -  1.85
> +++ uvm/uvm_amap.c23 Oct 2020 08:23:59 -
> @@ -68,7 +68,23 @@ static inline void amap_list_remove(stru
>  
>  struct vm_amap_chunk *amap_chunk_get(struct vm_amap *, int, int, int);
>  void amap_chunk_free(struct vm_amap *, struct vm_amap_chunk *);
> -void amap_wiperange_chunk(struct vm_amap *, struct vm_amap_chunk *, int, 
> int);
> +
> +/*
> + * if we enable PPREF, then we have a couple of extra functions that
> + * we need to prototype here...
> + */
> +
> +#ifdef UVM_AMAP_PPREF
> +
> +#define PPREF_NONE ((int *) -1)  /* not using ppref */
> +
> +void amap_pp_adjref(struct vm_amap *, int, vsize_t, int);
> +void amap_pp_establish(struct vm_amap *);
> +void amap_wiperange_chunk(struct vm_amap *, struct vm_amap_chunk *, int,
> + int);
> +void amap_wiperange(struct vm_amap *, int, int);
> +
> +#endif   /* UVM_AMAP_PPREF */
>  
>  static inline void
>  amap_list_insert(struct vm_amap *amap)
> @@ -1153,6 +1169,32 @@ amap_unadd(struct vm_aref *aref, vaddr_t
>  }
>  
>  /*
> + * amap_adjref_anons: adjust the reference count(s) on amap and its anons.
> + */
> +static void
> +amap_adjref_anons(struct vm_amap *amap, vaddr_t offset, vsize_t len,
> +int refv, boolean_t all)
> +{
> +#ifdef UVM_AMAP_PPREF
> + if (amap->am_ppref == NULL && !all && len != amap->am_nslot) {
> + amap_pp_establish(amap);
> + }
> +#endif
> +
> + amap->am_ref += refv;
> +
> +#ifdef UVM_AMAP_PPREF
> + if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
> + if (all) {
> + amap_pp_adjref(amap, 0, amap->am_nslot, refv);
> + } else {
> + amap_pp_adjref(amap, offset, len, refv);
> + }
> + }
> +#endif
> +}
> +
> +/*
>   * amap_ref: gain a reference to an amap
>   *
>   * => "offset" and "len" are in units of pages
> @@ -1162,51 +1204,36 @@ void
>  amap_ref(struct vm_amap *amap, vaddr_t offset, vsize_t len, int flags)
>  {
>  
> - amap->am_ref++;
>   if (flags & AMAP_SHARED)
>   amap->am_flags |= AMAP_SHARED;
> -#ifdef UVM_AMAP_PPREF
> - if (amap->am_ppref == NULL && (flags & AMAP_REFALL) == 0 &&
> - len != amap->am_nslot)
> - amap_pp_establish(amap);
> - if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
> - if (flags & AMAP_REFALL)
> - amap_pp_adjref(amap, 0, amap->am_nslot, 1);
> - else
> - amap_pp_adjref(amap, offset, len, 1);
> - }
> -#endif
> + amap_adjref_anons(amap, offset, len, 1, (flags & AMAP_REFALL) != 0);
>  }
>  
>  /*
>   * amap_unref: remove a reference to an amap
>   *
> - * => caller must remove all pmap-level references to this amap before
> - *   dropping the reference
> - * => called from uvm_unmap_detach [only]  ... note that entry is no
> - *   longer part of a map
> + * => All pmap-level references to this amap must be already removed.
> + * => Called from uvm_unmap_detach(); entry is already removed from the map.
>   */
>  void
>  amap_unref(struct vm_amap *amap, vaddr_t offset, vsize_t len, boolean_t all)
>  {
> + KASSERT(amap->am_ref > 0);
>  
> - /* if we are the last reference, free the amap and return. */
> - if (amap->am_ref-- == 1) {
> - amap_wipeout(amap); /* drops final ref and frees */
> + if (amap->am_ref == 1) {
> + /*
> +  * If the last reference - wipeout and destroy the amap.
> +  */
> + amap->am_ref--;
> + amap_wipeout(amap);
>   return;
>   }
>  
> - /* otherwise just drop the reference count(s) */
> - if (amap->am_ref == 1 && (amap->am_flags & AMAP_SHARED) != 0)
> - amap->am_flags &= ~AMAP_SHARED; /* clear shared flag */
> -#ifdef UVM_AMAP_PPREF
> - if (amap->am_ppref == NULL && all == 0 && len != amap->am_nslot)
> - amap_pp_establish(amap);
> - if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
> - if (all)
> - amap_pp_adjref(amap, 0, amap->am_nslot, -1);
> - else
> - amap_pp_adjref(amap, offset, len, -1);
> + /*
> +  * Otherwise, drop the reference count(s) on anons.
> +  */
> + if (amap->am_ref 

Re: Document art locking fields

2020-11-12 Thread Jonathan Matthew
On Wed, Nov 11, 2020 at 05:25:25AM -0300, Martin Pieuchot wrote:
> While discussing the new source address mechanism with denis@, I figured
> those ought to be documented.
> 
> Note that `ar_rtableid' is unused and can die.  The ART code is actually
> free from any network knowledge.
> 
> ok?

ok jmatthew@

> 
> Index: net/art.c
> ===
> RCS file: /cvs/src/sys/net/art.c,v
> retrieving revision 1.28
> diff -u -p -r1.28 art.c
> --- net/art.c 31 Mar 2019 19:29:27 -  1.28
> +++ net/art.c 9 Nov 2020 19:52:48 -
> @@ -115,7 +115,6 @@ art_alloc(unsigned int rtableid, unsigne
>   }
>  
>   ar->ar_off = off;
> - ar->ar_rtableid = rtableid;
>   rw_init(&ar->ar_lock, "art");
>  
>   return (ar);
> Index: net/art.h
> ===
> RCS file: /cvs/src/sys/net/art.h,v
> retrieving revision 1.19
> diff -u -p -r1.19 art.h
> --- net/art.h 29 Oct 2020 21:15:27 -  1.19
> +++ net/art.h 9 Nov 2020 19:52:42 -
> @@ -27,16 +27,22 @@
>  
>  /*
>   * Root of the ART tables, equivalent to the radix head.
> + *
> + *  Locks used to protect struct members in this file:
> + *   I   immutable after creation
> + *   l   root's `ar_lock'
> + *   K   kernel lock
> + *  For SRP related structures that allow lock-free reads, the write lock
> + *  is indicated below.
>   */
>  struct art_root {
> - struct srp   ar_root;   /* First table */
> - struct rwlockar_lock;   /* Serialise modifications */
> - uint8_t  ar_bits[ART_MAXLVL];   /* Per level stride */
> - uint8_t  ar_nlvl;   /* Number of levels */
> - uint8_t  ar_alen;   /* Address length in bits */
> - uint8_t  ar_off;/* Offset of the key in bytes */
> - unsigned int ar_rtableid;   /* ID of this routing table */
> - struct sockaddr *source;/* optional src addr to use */
> + struct srp   ar_root;   /* [l] First table */
> + struct rwlockar_lock;   /* [] Serialise modifications */
> + uint8_t  ar_bits[ART_MAXLVL]; /* [I] Per level stride */
> + uint8_t  ar_nlvl;   /* [I] Number of levels */
> + uint8_t  ar_alen;   /* [I] Address length in bits */
> + uint8_t  ar_off;/* [I] Offset of key in bytes */
> + struct sockaddr *source;/* [K] optional src addr to use 
> */
>  };
>  
>  #define ISLEAF(e)(((unsigned long)(e) & 1) == 0)
> 



ospf6d: use ROUTE_FLAGFILTER

2020-09-01 Thread Jonathan Matthew
Like ospfd, ospf6d can use ROUTE_FLAGFILTER to opt out of receiving messages
relating to L2 and broadcast routes on its routing socket.  We've been running
this for a week or so with no problems.

ok?

Index: kroute.c
===
RCS file: /cvs/src/usr.sbin/ospf6d/kroute.c,v
retrieving revision 1.64
diff -u -p -u -p -r1.64 kroute.c
--- kroute.c17 May 2020 18:29:25 -  1.64
+++ kroute.c18 Aug 2020 11:56:09 -
@@ -102,6 +102,7 @@ kr_init(int fs, u_int rdomain, int redis
int opt = 0, rcvbuf, default_rcvbuf;
socklen_t   optlen;
int filter_prio = fib_prio;
+   int filter_flags = RTF_LLINFO | RTF_BROADCAST;
 
kr_state.fib_sync = fs;
kr_state.rdomain = rdomain;
@@ -127,6 +128,12 @@ kr_init(int fs, u_int rdomain, int redis
if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_PRIOFILTER, &filter_prio,
sizeof(filter_prio)) == -1) {
log_warn("%s: setsockopt AF_ROUTE ROUTE_PRIOFILTER", __func__);
+   /* not fatal */
+   }
+
+   if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_FLAGFILTER, &filter_flags,
+   sizeof(filter_flags)) == -1) {
+   log_warn("%s: setsockopt AF_ROUTE ROUTE_FLAGFILTER", __func__);
/* not fatal */
}
 



Re: ldapd(8): fix, simplify UUID timestamp code

2020-08-21 Thread Jonathan Matthew
On Wed, Aug 19, 2020 at 09:28:41PM -0500, Scott Cheloha wrote:
> Hi,
> 
> I was auditing the tree for odd-looking time structure usage and I
> came across the UUID code in ldapd(8), uuid.c.
> 
> time_cmp() is backwards.  Or the caller is misusing it.  One or the
> other.  It returns -1 if tv1 exceeds tv2 but the comments in the
> caller indicate the opposite impression.  I don't think this code has
> ever worked as intended.
> 
> It would be a lot easier if we just threw the code out and used random
> UUIDs.  After reading over the RFC it seems to me that time-based
> UUIDs are collision-prone.  Their implementation is also complicated.
> Purely random UUIDs should effectively never collide and are trivial
> to implement.

RFC 4530, defining the entryUUID attribute, says this:

   UUID are to be generated in accordance with Section 4 of [RFC4122].
   In particular, servers MUST ensure that each generated UUID is unique
   in space and time.

Which doesn't rule out random uuids at all.  Is arc4random_buf() a better
attempt at ensuring the uuids are unique than doing complicated stuff with
clocks and mac addresses?  Maybe.  Windows has been generating random uuids
for about 20 years now.
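
For comparison, a purely random (version 4) UUID is tiny to implement.  A
sketch along the lines of RFC 4122 section 4.4 (illustrative, not part of
the proposed diff; arc4random_buf(3) comes from <stdlib.h>):

	void
	uuid_create_random(unsigned char uuid[16])
	{
		arc4random_buf(uuid, 16);
		uuid[6] = (uuid[6] & 0x0f) | 0x40;	/* version 4 */
		uuid[8] = (uuid[8] & 0x3f) | 0x80;	/* RFC 4122 variant */
	}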

> 
> However, assuming we can't just use random UUIDs, here's an attempt at
> improving this code:
> 
> - Use clock_gettime(2).  With nanosecond resolution we don't need
>   a 'counter'.
> 
> - Reduce the scope of all the static state to uuid_create().
> 
> - Shrink the loop.  Just read the clock until it changes, then decide
>   what to do re. seq_num.  This is effectively what the example code in
>   RFC 4122 does.
> 
> I'm unsure what the right thing to do is if the system clock predates
> the UUID epoch (Oct 15 1582).  My code just returns zero.  Maybe we
> should just kill the daemon in that case?  The UUIDv1 scheme breaks
> down if time is that seriously screwed up.
> 
> Is there an active ldapd(8) person?  Or at least someone with an
> ldapd(8) setup who can test this?

I'm kind of an active ldapd person, though I don't actually use it
actively.  I can try this out if need be.

> 
> Thoughts?
> 
> Index: uuid.c
> ===
> RCS file: /cvs/src/usr.sbin/ldapd/uuid.c,v
> retrieving revision 1.6
> diff -u -p -r1.6 uuid.c
> --- uuid.c26 Apr 2018 12:42:51 -  1.6
> +++ uuid.c20 Aug 2020 01:44:00 -
> @@ -63,27 +63,8 @@
>  
>  #include "uuid.h"
>  
> -static uint32_t seq_num;
> -static struct timeval last_time;
> -static int32_t counter;
> -static char nodeaddr[6];
> -
>  enum { UUID_NODE_MULTICAST = 0x80 };
>  
> -static int
> -time_cmp(struct timeval *tv1, struct timeval *tv2)
> -{
> -if (tv1->tv_sec > tv2->tv_sec)
> - return -1;
> -if (tv1->tv_sec < tv2->tv_sec)
> - return 1;
> -if (tv1->tv_usec > tv2->tv_usec)
> - return -1;
> -if (tv1->tv_usec < tv2->tv_usec)
> - return 1;
> -return 0;
> -}
> -
>  static void
>  get_node_addr(char *addr)
>  {
> @@ -138,6 +119,40 @@ get_node_addr(char *addr)
>  }
>  
>  /*
> + * A UUID v1 timestamp:
> + *
> + * - 60 bits.
> + * - Unsigned.
> + * - Epoch at Oct 15 1582 00:00:00 UTC.
> + * - Increments every 100 nanoseconds.
> + */
> +#define UUID_EPOCH_OFFSET	12219292800LL
> +#define UUID_TIME_MAX		(1ULL << 60)
> +#define UUID_HZ			10000000LL
> +#define NSEC_PER_UUID_TICK	100LL
> +
> +static uint64_t
> +get_uuid_timestamp(void)
> +{
> + static const struct timespec min = { -UUID_EPOCH_OFFSET, 0 };
> + static const struct timespec max = {
> + UUID_TIME_MAX / UUID_HZ,
> + UUID_TIME_MAX % UUID_HZ * NSEC_PER_UUID_TICK
> + };
> + struct timespec utc;
> + uint64_t timestamp;
> +
> + clock_gettime(CLOCK_REALTIME, &utc);
> + if (timespeccmp(&utc, &min, <))
> + return 0;
> + if (timespeccmp(&max, &utc, <))
> + return UUID_TIME_MAX;
> + timestamp = (UUID_EPOCH_OFFSET + utc.tv_sec) * UUID_HZ;
> + timestamp += utc.tv_nsec / NSEC_PER_UUID_TICK;
> + return timestamp;
> +}
> +
> +/*
>   *Creates a new UUID.
>   */
>  
> @@ -145,55 +160,32 @@ void
>  uuid_create(afsUUID *uuid)
>  {
>  static int uuid_inited = 0;
> -struct timeval tv;
> -int ret, got_time;
> +static uint64_t last_time;
> +static uint32_t seq_num;
> +static char nodeaddr[6];
>  uint64_t dce_time;
>  
>  if (uuid_inited == 0) {
> - gettimeofday(&last_time, NULL);
> + last_time = get_uuid_timestamp();
>   seq_num = arc4random();
>   get_node_addr(nodeaddr);
>   uuid_inited = 1;
>  }
>  
> -gettimeofday(&tv, NULL);
> -
> -got_time = 0;
> +while ((dce_time = get_uuid_timestamp()) == last_time)
> + continue;
>  
> -do {
> - ret = time_cmp(&tv, &last_time);
> - if (ret < 0) {
> - /* Time went backward, just inc seq_num and be done.
> -  * seq_num is 6 + 8 bit field it the uuid, so let it wrap
> -

ospfd: use ROUTE_FLAGFILTER

2020-08-16 Thread Jonathan Matthew
ospfd is our first target for using ROUTE_FLAGFILTER to reduce pressure on the
route socket, so here's the diff we've been running for a couple of weeks now
(minus the fix for RTM_DELETE flags, notably).

ok?

Index: kroute.c
===
RCS file: /cvs/src/usr.sbin/ospfd/kroute.c,v
retrieving revision 1.113
diff -u -p -r1.113 kroute.c
--- kroute.c9 Nov 2019 15:54:19 -   1.113
+++ kroute.c27 Jul 2020 03:45:41 -
@@ -133,6 +133,7 @@ kr_init(int fs, u_int rdomain, int redis
int opt = 0, rcvbuf, default_rcvbuf;
socklen_t   optlen;
int filter_prio = fib_prio;
+   int filter_flags = RTF_LLINFO | RTF_BROADCAST;
 
kr_state.fib_sync = fs;
kr_state.rdomain = rdomain;
@@ -158,6 +159,11 @@ kr_init(int fs, u_int rdomain, int redis
if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_PRIOFILTER, &filter_prio,
sizeof(filter_prio)) == -1) {
log_warn("%s: setsockopt AF_ROUTE ROUTE_PRIOFILTER", __func__);
+   /* not fatal */
+   }
+   if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_FLAGFILTER, &filter_flags,
+   sizeof(filter_flags)) == -1) {
+   log_warn("%s: setsockopt AF_ROUTE ROUTE_FLAGFILTER", __func__);
/* not fatal */
}
 



RTM_DELETE messages for L2 routes have incorrect flags

2020-08-10 Thread Jonathan Matthew
While looking into filtering out messages for L2 routes in the kernel to reduce
load on routing daemons, I noticed that the RTM_DELETE messages do not have
the RTF_LLINFO flag set, which is inconvenient because that's what I want to
filter on.

I tracked this down to r1.361 and r1.362 of net/route.c, where we stopped
saving rt->rt_flags before calling rtrequest_delete().  rtrequest_delete()
calls ifp->if_rtrequest(), which removes the llinfo from the route and clears
RTF_LLINFO.

I think the simplest way to fix this would be for rtdeletemsg() to go back to
calling rtm_miss() directly rather than using rtm_send().  Adding more
parameters to rtm_send() to specify additional flags seems like
overcomplicating it.


Index: route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.394
diff -u -p -r1.394 route.c
--- route.c 24 Jun 2020 22:03:43 -  1.394
+++ route.c 11 Aug 2020 04:12:51 -
@@ -663,6 +663,7 @@ rtdeletemsg(struct rtentry *rt, struct i
 {
int error;
struct rt_addrinfo  info;
+   struct sockaddr_rtlabel sa_rl;
struct sockaddr_in6 sa_mask;
 
KASSERT(rt->rt_ifidx == ifp->if_index);
@@ -677,8 +678,13 @@ rtdeletemsg(struct rtentry *rt, struct i
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
if (!ISSET(rt->rt_flags, RTF_HOST))
info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
+   info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
+   info.rti_flags = rt->rt_flags;
+   info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
+   info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
error = rtrequest_delete(&info, rt->rt_priority, ifp, &rt, tableid);
-   rtm_send(rt, RTM_DELETE, error, tableid);
+   rtm_miss(RTM_DELETE, &info, info.rti_flags, rt->rt_priority,
+   rt->rt_ifidx, error, tableid);
if (error == 0)
rtfree(rt);
return (error);



filtering routing socket messages by flags

2020-08-05 Thread Jonathan Matthew
Most (all?) of our routing daemons don't care about layer 2 or broadcast
routing entries, so they do something like this after reading a message
off the socket:

/* Skip ARP/ND cache and broadcast routes. */
if (rtm->rtm_flags & (RTF_LLINFO|RTF_BROADCAST))
continue;

ARP can generate a lot of routing messages during an address space scan,
and then again when the entries expire, and this can cause routing daemons
to desync.  To reduce the impact of this, we'd like to filter these out on
the kernel side.  There's another issue we need to fix to make this work
properly, but that can be done separately.

This adds a new type of filter on the routing socket, specifying a flag
bitmask, which filters out messages for routes with flags matching the mask.
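
On the daemon side, opting in is a single setsockopt(3) on the routing
socket, the same pattern the ospfd diff above uses:

	int filter_flags = RTF_LLINFO | RTF_BROADCAST;

	if (setsockopt(fd, AF_ROUTE, ROUTE_FLAGFILTER, &filter_flags,
	    sizeof(filter_flags)) == -1) {
		log_warn("setsockopt AF_ROUTE ROUTE_FLAGFILTER");
		/* not fatal */
	}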

ok?


Index: route.h
===
RCS file: /cvs/src/sys/net/route.h,v
retrieving revision 1.181
diff -u -p -u -p -r1.181 route.h
--- route.h 10 Mar 2020 21:35:41 -  1.181
+++ route.h 6 Aug 2020 01:47:11 -
@@ -297,6 +297,8 @@ struct rt_msghdr {
 #define ROUTE_PRIOFILTER 3 /* only pass updates with a priority higher or
   equal (actual value lower) to the specified
   priority. */
+#define ROUTE_FLAGFILTER 4 /* do not pass updates for routes with flags
+  in this bitmask. */
 
#define ROUTE_FILTER(m)	(1 << (m))
#define RTABLE_ANY	0xffffffff
Index: rtsock.c
===
RCS file: /cvs/src/sys/net/rtsock.c,v
retrieving revision 1.299
diff -u -p -u -p -r1.299 rtsock.c
--- rtsock.c24 Jun 2020 22:03:42 -  1.299
+++ rtsock.c6 Aug 2020 01:47:11 -
@@ -145,6 +145,7 @@ struct rtpcb {
struct refcnt   rop_refcnt;
struct timeout  rop_timeout;
unsigned introp_msgfilter;
+   unsigned introp_flagfilter;
unsigned introp_flags;
u_int   rop_rtableid;
unsigned short  rop_proto;
@@ -402,6 +403,12 @@ route_ctloutput(int op, struct socket *s
else
rop->rop_priority = prio;
break;
+   case ROUTE_FLAGFILTER:
+   if (m == NULL || m->m_len != sizeof(unsigned int))
+   error = EINVAL;
+   else
+   rop->rop_flagfilter = *mtod(m, unsigned int *);
+   break;
default:
error = ENOPROTOOPT;
break;
@@ -421,6 +428,10 @@ route_ctloutput(int op, struct socket *s
m->m_len = sizeof(unsigned int);
*mtod(m, unsigned int *) = rop->rop_priority;
break;
+   case ROUTE_FLAGFILTER:
+   m->m_len = sizeof(unsigned int);
+   *mtod(m, unsigned int *) = rop->rop_flagfilter;
+   break;
default:
error = ENOPROTOOPT;
break;
@@ -516,9 +527,13 @@ next:
/* filter messages that the process does not want */
rtm = mtod(m, struct rt_msghdr *);
/* but RTM_DESYNC can't be filtered */
-   if (rtm->rtm_type != RTM_DESYNC && rop->rop_msgfilter != 0 &&
-   !(rop->rop_msgfilter & (1 << rtm->rtm_type)))
-   goto next;
+   if (rtm->rtm_type != RTM_DESYNC) {
+   if (rop->rop_msgfilter != 0 &&
+   !(rop->rop_msgfilter & (1 << rtm->rtm_type)))
+   goto next;
+   if (ISSET(rop->rop_flagfilter, rtm->rtm_flags))
+   goto next;
+   }
switch (rtm->rtm_type) {
case RTM_IFANNOUNCE:
case RTM_DESYNC:



acpicpu: remove acpicpu_sc array

2020-08-05 Thread Jonathan Matthew
This came out of the work on supporting ACPI0007 devices in acpicpu(4), but
it's independent of that and I'd like to get it in the tree separately.

Since it was first added, acpicpu stores instances of itself in an array, 
which it uses to find the acpicpu device for a cpu.  This runs into problems
when there are more than MAXCPUS acpicpu devices.  Currently it overwrites
whatever's after the array, leading to varying crashes and hangs depending
on kernel link order.
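
The problem in miniature (both lines appear in the diff below):

	struct acpicpu_softc *acpicpu_sc[MAXCPUS];
	...
	acpicpu_sc[sc->sc_dev.dv_unit] = sc;	/* out of bounds once
						   dv_unit >= MAXCPUS */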

More recently, we've added a pointer to struct cpu_info that does this more
directly, and also has the advantage that it actually matches up the cpu ids
rather than assuming cpu3 maps to acpicpu3.

This diff removes the acpicpu_sc array and uses the pointer from struct
cpu_info instead.  Most of the accesses are just looking for the first acpicpu,
so we can use cpu_info_primary to find that.

I've tested this on a few different machines (including one with 128 acpicpu
devices) and everything still works.

ok?


Index: acpicpu.c
===
RCS file: /cvs/src/sys/dev/acpi/acpicpu.c,v
retrieving revision 1.85
diff -u -p -r1.85 acpicpu.c
--- acpicpu.c   27 May 2020 05:02:21 -  1.85
+++ acpicpu.c   3 Aug 2020 05:10:45 -
@@ -188,8 +188,6 @@ struct cfdriver acpicpu_cd = {
 
 extern int setperf_prio;
 
-struct acpicpu_softc *acpicpu_sc[MAXCPUS];
-
 #if 0
 void
 acpicpu_set_throttle(struct acpicpu_softc *sc, int level)
@@ -672,7 +670,6 @@ acpicpu_attach(struct device *parent, st
 
sc->sc_acpi = (struct acpi_softc *)parent;
sc->sc_devnode = aa->aaa_node;
-   acpicpu_sc[sc->sc_dev.dv_unit] = sc;
 
SLIST_INIT(&sc->sc_cstates);
 
@@ -979,7 +976,7 @@ acpicpu_fetch_pss(struct acpicpu_pss **p
 * the bios ensures this...
 */
 
-   sc = acpicpu_sc[0];
+   sc = (struct acpicpu_softc *)cpu_info_primary.ci_acpicpudev;
if (!sc)
return 0;
*pss = sc->sc_pss;
@@ -1024,7 +1021,7 @@ acpicpu_set_notify(void (*func)(struct a
 {
struct acpicpu_softc*sc;
 
-   sc = acpicpu_sc[0];
+   sc = (struct acpicpu_softc *)cpu_info_primary.ci_acpicpudev;
if (sc != NULL)
sc->sc_notify = func;
 }
@@ -1034,7 +1031,7 @@ acpicpu_setperf_ppc_change(struct acpicp
 {
struct acpicpu_softc*sc;
 
-   sc = acpicpu_sc[0];
+   sc = (struct acpicpu_softc *)cpu_info_primary.ci_acpicpudev;
 
if (sc != NULL)
cpu_setperf(sc->sc_level);
@@ -1048,7 +1045,7 @@ acpicpu_setperf(int level)
int idx, len;
uint32_tstatus = 0;
 
-   sc = acpicpu_sc[cpu_number()];
+   sc = (struct acpicpu_softc *)curcpu()->ci_acpicpudev;
 
dnprintf(10, "%s: acpicpu setperf level %d\n",
sc->sc_devnode->name, level);



Re: acpicpu(4) and ACPI0007

2020-08-01 Thread Jonathan Matthew
On Wed, Jul 29, 2020 at 08:29:31PM +1000, Jonathan Matthew wrote:
> On Wed, Jul 29, 2020 at 10:06:14AM +0200, Mark Kettenis wrote:
> > > Date: Wed, 29 Jul 2020 10:38:55 +1000
> > > From: Jonathan Matthew 
> > > 
> > > On Tue, Jul 28, 2020 at 07:30:36PM +0200, Mark Kettenis wrote:
> > > > > Date: Tue, 28 Jul 2020 21:42:46 +1000
> > > > > From: Jonathan Matthew 
> > > > > 
> > > > > On Tue, Jul 28, 2020 at 11:12:21AM +0200, Mark Kettenis wrote:
> > > > > > > Date: Tue, 28 Jul 2020 13:46:34 +1000
> > > > > > > From: Jonathan Matthew 
> > > > > > > 
> > > > > > > On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote:
> > > > > > > > > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST)
> > > > > > > > > From: Mark Kettenis 
> > > > > > > > > 
> > > > > > > > > Recent ACPI versions have deprecated "Processor()" nodes in 
> > > > > > > > > favour of
> > > > > > > > > "Device()" nodes with a _HID() method that returns 
> > > > > > > > > "ACPI0007".  This
> > > > > > > > > diff tries to support machines with firmware that implements 
> > > > > > > > > this.  If
> > > > > > > > > you see something like:
> > > > > > > > > 
> > > > > > > > >   "ACPI0007" at acpi0 not configured
> > > > > > > > > 
> > > > > > > > > please try the following diff and report back with an updated 
> > > > > > > > > dmesg.
> > > > > > > > > 
> > > > > > > > > Cheers,
> > > > > > > > > 
> > > > > > > > > Mark
> > > > > > > > 
> > > > > > > > And now with the right diff...
> > > > > > > 
> > > > > > > On a dell r6415, it looks like this:
> > > > > > > 
> > > > > > > acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!)
> > > > > > > all the way up to
> > > > > > > acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127
> > > > > > > 
> > > > > > > which I guess means aml_copyvalue() needs to learn how to copy 
> > > > > > > AML_OBJTYPE_DEVICE.
> > > > > > 
> > > > > > Yes.  It is not immediately obvious how this should work.  Do we 
> > > > > > need
> > > > > > to copy the aml_node pointer or not?  We don't do that for
> > > > > > AML_OBJTYPE_PROCESSOR and AML_OBJTYPE_POWERRSRC types which are
> > > > > > similar to AML_OBJTYPE_DEVICE.  But AML_OBJTYPE_DEVICE objects don't
> > > > > > carry any additional information.  So we end up with just an empty
> > > > > > case to avoid the warning.
> > > > > > 
> > > > > > Does this work on the Dell machines?
> > > > > 
> > > > > We've seen crashes in pool_cache_get() in various places after all 
> > > > > the acpicpus
> > > > > attach, which we haven't seen before on these machines, so I think 
> > > > > it's
> > > > > corrupting memory somehow.
> > > > 
> > > > Does that happen with only the acpicpu(4) diff?
> > > 
> > > Yes.  Looking at this a bit more, in the case where aml_evalnode() can't
> > > copy the result value, it leaves it uninitialised, which means we'll call
> > > aml_freevalue(&res) where res is stack junk.  memset(&res, 0, sizeof(res))
> > > seems to fix it.
> > 
> > Eh, where exactly?
> 
> I had it just before the call to aml_evalnode(), but that can't be it,
> since aml_evalnode() does the same thing.

Much better theory: the acpicpu_sc array has MAXCPUS elements, but on this
system (and all R6415s, as far as I can tell) we have more acpicpu devices
than that.  I suppose we should just make acpicpu_match fail if cf->cf_unit
is >= MAXCPUS as we do with the actual cpu devices.
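
A minimal sketch of that guard, at the top of acpicpu_match():

	/* sketch: refuse to match units the acpicpu_sc array can't hold */
	if (cf->cf_unit >= MAXCPUS)
		return (0);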


Index: acpicpu.c
===
RCS file: /cvs/src/sys/dev/acpi/acpicpu.c,v
retrieving revision 1.85
diff -u -p -r1.85 acpicpu.c
--- acpicpu.c   27 May 2020 05:02:21 -  1.85
+++ acpicpu.c

Re: acpicpu(4) and ACPI0007

2020-07-29 Thread Jonathan Matthew
On Wed, Jul 29, 2020 at 10:06:14AM +0200, Mark Kettenis wrote:
> > Date: Wed, 29 Jul 2020 10:38:55 +1000
> > From: Jonathan Matthew 
> > 
> > On Tue, Jul 28, 2020 at 07:30:36PM +0200, Mark Kettenis wrote:
> > > > Date: Tue, 28 Jul 2020 21:42:46 +1000
> > > > From: Jonathan Matthew 
> > > > 
> > > > On Tue, Jul 28, 2020 at 11:12:21AM +0200, Mark Kettenis wrote:
> > > > > > Date: Tue, 28 Jul 2020 13:46:34 +1000
> > > > > > From: Jonathan Matthew 
> > > > > > 
> > > > > > On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote:
> > > > > > > > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST)
> > > > > > > > From: Mark Kettenis 
> > > > > > > > 
> > > > > > > > Recent ACPI versions have deprecated "Processor()" nodes in 
> > > > > > > > favour of
> > > > > > > > "Device()" nodes with a _HID() method that returns "ACPI0007".  
> > > > > > > > This
> > > > > > > > diff tries to support machines with firmware that implements 
> > > > > > > > this.  If
> > > > > > > > you see something like:
> > > > > > > > 
> > > > > > > >   "ACPI0007" at acpi0 not configured
> > > > > > > > 
> > > > > > > > please try the following diff and report back with an updated 
> > > > > > > > dmesg.
> > > > > > > > 
> > > > > > > > Cheers,
> > > > > > > > 
> > > > > > > > Mark
> > > > > > > 
> > > > > > > And now with the right diff...
> > > > > > 
> > > > > > On a dell r6415, it looks like this:
> > > > > > 
> > > > > > acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!)
> > > > > > all the way up to
> > > > > > acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127
> > > > > > 
> > > > > > which I guess means aml_copyvalue() needs to learn how to copy 
> > > > > > AML_OBJTYPE_DEVICE.
> > > > > 
> > > > > Yes.  It is not immediately obvious how this should work.  Do we need
> > > > > to copy the aml_node pointer or not?  We don't do that for
> > > > > AML_OBJTYPE_PROCESSOR and AML_OBJTYPE_POWERRSRC types which are
> > > > > similar to AML_OBJTYPE_DEVICE.  But AML_OBJTYPE_DEVICE objects don't
> > > > > carry any additional information.  So we end up with just an empty
> > > > > case to avoid the warning.
> > > > > 
> > > > > Does this work on the Dell machines?
> > > > 
> > > > We've seen crashes in pool_cache_get() in various places after all the 
> > > > acpicpus
> > > > attach, which we haven't seen before on these machines, so I think it's
> > > > corrupting memory somehow.
> > > 
> > > Does that happen with only the acpicpu(4) diff?
> > 
> > Yes.  Looking at this a bit more, in the case where aml_evalnode() can't
> > copy the result value, it leaves it uninitialised, which means we'll call
> > aml_freevalue(&res) where res is stack junk.  memset(&res, 0, sizeof(res))
> > seems to fix it.
> 
> Eh, where exactly?

I had it just before the call to aml_evalnode(), but that can't be it,
since aml_evalnode() does the same thing.

> 
> > > > With this addition, we get this for each cpu:
> > > > acpicpu0 at acpi0: C1(@1 halt!)
> > > 
> > > The exclamation mark indicates that this is the "fallback" C-state.
> > > Is there a _CST method at all?
> > > 
> > > Anyway, given that this is a server system, it isn't really surprising
> > > that there isn't any fancy power saving stuff.
> > 
> > Right, there doesn't seem to be any.  The processor devices look like this
> > in the aml:
> > 
> > Scope (_SB)
> > {
> > Device (C000)
> > {
> > Name (_HID, "ACPI0007" /* Processor Device */)  // _HID: 
> > Hardware ID
> > Name (_UID, 0x00)  // _UID: Unique ID
> > }
> > 
> > Device (C001)
> > {
> > Name (_HID, "ACPI0007" /* Processor Device */)  // _HID: 
> > Hardware ID
> > Name (_UID, 0x01)  // _UID: Unique ID
> > }
> > 
> >  .. and so on.
> 
> Usually there is an SSDT that fills in the details.  The acpidump
> output I have for the r6415 does have one, but it doesn't add
> anything.

Same here.

> 
> > > > > Index: dev/acpi/dsdt.c
> > > > > ===
> > > > > RCS file: /cvs/src/sys/dev/acpi/dsdt.c,v
> > > > > retrieving revision 1.252
> > > > > diff -u -p -r1.252 dsdt.c
> > > > > --- dev/acpi/dsdt.c   21 Jul 2020 03:48:06 -  1.252
> > > > > +++ dev/acpi/dsdt.c   28 Jul 2020 09:04:15 -
> > > > > @@ -996,6 +996,8 @@ aml_copyvalue(struct aml_value *lhs, str
> > > > >   lhs->v_objref = rhs->v_objref;
> > > > >   aml_addref(lhs->v_objref.ref, "");
> > > > >   break;
> > > > > + case AML_OBJTYPE_DEVICE:
> > > > > + break;
> > > > >   default:
> > > > >   printf("copyvalue: %x", rhs->type);
> > > > >   break;
> > > > 
> > > > 
> > 



Re: acpicpu(4) and ACPI0007

2020-07-28 Thread Jonathan Matthew
On Tue, Jul 28, 2020 at 07:30:36PM +0200, Mark Kettenis wrote:
> > Date: Tue, 28 Jul 2020 21:42:46 +1000
> > From: Jonathan Matthew 
> > 
> > On Tue, Jul 28, 2020 at 11:12:21AM +0200, Mark Kettenis wrote:
> > > > Date: Tue, 28 Jul 2020 13:46:34 +1000
> > > > From: Jonathan Matthew 
> > > > 
> > > > On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote:
> > > > > > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST)
> > > > > > From: Mark Kettenis 
> > > > > > 
> > > > > > Recent ACPI versions have deprecated "Processor()" nodes in favour 
> > > > > > of
> > > > > > "Device()" nodes with a _HID() method that returns "ACPI0007".  This
> > > > > > diff tries to support machines with firmware that implements this.  
> > > > > > If
> > > > > > you see something like:
> > > > > > 
> > > > > >   "ACPI0007" at acpi0 not configured
> > > > > > 
> > > > > > please try the following diff and report back with an updated dmesg.
> > > > > > 
> > > > > > Cheers,
> > > > > > 
> > > > > > Mark
> > > > > 
> > > > > And now with the right diff...
> > > > 
> > > > On a dell r6415, it looks like this:
> > > > 
> > > > acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!)
> > > > all the way up to
> > > > acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127
> > > > 
> > > > which I guess means aml_copyvalue() needs to learn how to copy 
> > > > AML_OBJTYPE_DEVICE.
> > > 
> > > Yes.  It is not immediately obvious how this should work.  Do we need
> > > to copy the aml_node pointer or not?  We don't do that for
> > > AML_OBJTYPE_PROCESSOR and AML_OBJTYPE_POWERRSRC types which are
> > > similar to AML_OBJTYPE_DEVICE.  But AML_OBJTYPE_DEVICE objects don't
> > > carry any additional information.  So we end up with just an empty
> > > case to avoid the warning.
> > > 
> > > Does this work on the Dell machines?
> > 
> > We've seen crashes in pool_cache_get() in various places after all the 
> > acpicpus
> > attach, which we haven't seen before on these machines, so I think it's
> > corrupting memory somehow.
> 
> Does that happen with only the acpicpu(4) diff?

Yes.  Looking at this a bit more, in the case where aml_evalnode() can't
copy the result value, it leaves it uninitialised, which means we'll call
aml_freevalue(&res) where res is stack junk.  memset(&res, 0, sizeof(res))
seems to fix it.

> 
> > With this addition, we get this for each cpu:
> > acpicpu0 at acpi0: C1(@1 halt!)
> 
> The exclamation mark indicates that this is the "fallback" C-state.
> Is there a _CST method at all?
> 
> Anyway, given that this is a server system, it isn't really surprising
> that there isn't any fancy power saving stuff.

Right, there doesn't seem to be any.  The processor devices look like this
in the aml:

Scope (_SB)
{
Device (C000)
{
Name (_HID, "ACPI0007" /* Processor Device */)  // _HID: Hardware ID
Name (_UID, 0x00)  // _UID: Unique ID
}

Device (C001)
{
Name (_HID, "ACPI0007" /* Processor Device */)  // _HID: Hardware ID
Name (_UID, 0x01)  // _UID: Unique ID
}

 .. and so on.

> 
> > > Index: dev/acpi/dsdt.c
> > > ===
> > > RCS file: /cvs/src/sys/dev/acpi/dsdt.c,v
> > > retrieving revision 1.252
> > > diff -u -p -r1.252 dsdt.c
> > > --- dev/acpi/dsdt.c   21 Jul 2020 03:48:06 -  1.252
> > > +++ dev/acpi/dsdt.c   28 Jul 2020 09:04:15 -
> > > @@ -996,6 +996,8 @@ aml_copyvalue(struct aml_value *lhs, str
> > >   lhs->v_objref = rhs->v_objref;
> > >   aml_addref(lhs->v_objref.ref, "");
> > >   break;
> > > + case AML_OBJTYPE_DEVICE:
> > > + break;
> > >   default:
> > >   printf("copyvalue: %x", rhs->type);
> > >   break;
> > 
> > 



Re: acpicpu(4) and ACPI0007

2020-07-28 Thread Jonathan Matthew
On Tue, Jul 28, 2020 at 11:12:21AM +0200, Mark Kettenis wrote:
> > Date: Tue, 28 Jul 2020 13:46:34 +1000
> > From: Jonathan Matthew 
> > 
> > On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote:
> > > > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST)
> > > > From: Mark Kettenis 
> > > > 
> > > > Recent ACPI versions have deprecated "Processor()" nodes in favour of
> > > > "Device()" nodes with a _HID() method that returns "ACPI0007".  This
> > > > diff tries to support machines with firmware that implements this.  If
> > > > you see something like:
> > > > 
> > > >   "ACPI0007" at acpi0 not configured
> > > > 
> > > > please try the following diff and report back with an updated dmesg.
> > > > 
> > > > Cheers,
> > > > 
> > > > Mark
> > > 
> > > And now with the right diff...
> > 
> > On a dell r6415, it looks like this:
> > 
> > acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!)
> > all the way up to
> > acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127
> > 
> > which I guess means aml_copyvalue() needs to learn how to copy 
> > AML_OBJTYPE_DEVICE.
> 
> Yes.  It is not immediately obvious how this should work.  Do we need
> to copy the aml_node pointer or not?  We don't do that for
> AML_OBJTYPE_PROCESSOR and AML_OBJTYPE_POWERRSRC types which are
> similar to AML_OBJTYPE_DEVICE.  But AML_OBJTYPE_DEVICE objects don't
> carry any additional information.  So we end up with just an empty
> case to avoid the warning.
> 
> Does this work on the Dell machines?

We've seen crashes in pool_cache_get() in various places after all the acpicpus
attach, which we haven't seen before on these machines, so I think it's
corrupting memory somehow.

With this addition, we get this for each cpu:
acpicpu0 at acpi0: C1(@1 halt!)

> 
> 
> Index: dev/acpi/dsdt.c
> ===
> RCS file: /cvs/src/sys/dev/acpi/dsdt.c,v
> retrieving revision 1.252
> diff -u -p -r1.252 dsdt.c
> --- dev/acpi/dsdt.c   21 Jul 2020 03:48:06 -  1.252
> +++ dev/acpi/dsdt.c   28 Jul 2020 09:04:15 -
> @@ -996,6 +996,8 @@ aml_copyvalue(struct aml_value *lhs, str
>   lhs->v_objref = rhs->v_objref;
>   aml_addref(lhs->v_objref.ref, "");
>   break;
> + case AML_OBJTYPE_DEVICE:
> + break;
>   default:
>   printf("copyvalue: %x", rhs->type);
>   break;



Re: acpicpu(4) and ACPI0007

2020-07-27 Thread Jonathan Matthew
On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote:
> > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST)
> > From: Mark Kettenis 
> > 
> > Recent ACPI versions have deprecated "Processor()" nodes in favour of
> > "Device()" nodes with a _HID() method that returns "ACPI0007".  This
> > diff tries to support machines with firmware that implements this.  If
> > you see something like:
> > 
> >   "ACPI0007" at acpi0 not configured
> > 
> > please try the following diff and report back with an updated dmesg.
> > 
> > Cheers,
> > 
> > Mark
> 
> And now with the right diff...

On a dell r6415, it looks like this:

acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!)
all the way up to
acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127

which I guess means aml_copyvalue() needs to learn how to copy 
AML_OBJTYPE_DEVICE.

> 
> 
> Index: dev/acpi/acpicpu.c
> ===
> RCS file: /cvs/src/sys/dev/acpi/acpicpu.c,v
> retrieving revision 1.85
> diff -u -p -r1.85 acpicpu.c
> --- dev/acpi/acpicpu.c27 May 2020 05:02:21 -  1.85
> +++ dev/acpi/acpicpu.c27 Jul 2020 14:58:38 -
> @@ -186,6 +186,11 @@ struct cfdriver acpicpu_cd = {
>   NULL, "acpicpu", DV_DULL
>  };
>  
> +const char *acpicpu_hids[] = {
> + "ACPI0007",
> + NULL
> +};
> +
>  extern int setperf_prio;
>  
>  struct acpicpu_softc *acpicpu_sc[MAXCPUS];
> @@ -650,6 +655,9 @@ acpicpu_match(struct device *parent, voi
>   struct acpi_attach_args *aa = aux;
>   struct cfdata   *cf = match;
>  
> + if (acpi_matchhids(aa, acpicpu_hids, cf->cf_driver->cd_name))
> + return (1);
> +
>   /* sanity */
>   if (aa->aaa_name == NULL ||
>   strcmp(aa->aaa_name, cf->cf_driver->cd_name) != 0 ||
> @@ -665,6 +673,7 @@ acpicpu_attach(struct device *parent, st
>   struct acpicpu_softc*sc = (struct acpicpu_softc *)self;
>   struct acpi_attach_args *aa = aux;
>   struct aml_valueres;
> + int64_t uid;
>   int i;
>   uint32_tstatus = 0;
>   CPU_INFO_ITERATOR   cii;
> @@ -675,6 +684,10 @@ acpicpu_attach(struct device *parent, st
>   acpicpu_sc[sc->sc_dev.dv_unit] = sc;
>  
>   SLIST_INIT(&sc->sc_cstates);
> +
> + if (aml_evalinteger(sc->sc_acpi, sc->sc_devnode,
> + "_UID", 0, NULL, &uid) == 0)
> + sc->sc_cpu = uid;
>  
>   if (aml_evalnode(sc->sc_acpi, sc->sc_devnode, 0, NULL, &res) == 0) {
>   if (res.type == AML_OBJTYPE_PROCESSOR) {
> 



mcx(4) RSS

2020-07-13 Thread Jonathan Matthew
mcx(4) is almost ready to enable RSS, except arm64 doesn't yet support
mapping interrupts to cpus.  Until that's in place, here's a diff with the
missing pieces from the driver in case anyone wants to test.  This will
enable up to 8 rx/tx queues, depending on the number of cpus available.


Index: if_mcx.c
===
RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
retrieving revision 1.64
diff -u -p -r1.64 if_mcx.c
--- if_mcx.c14 Jul 2020 04:10:18 -  1.64
+++ if_mcx.c14 Jul 2020 04:49:36 -
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -83,7 +84,7 @@
 #define MCX_LOG_RQ_SIZE10
 #define MCX_LOG_SQ_SIZE11
 
-#define MCX_MAX_QUEUES 1
+#define MCX_MAX_QUEUES 8
 
 /* completion event moderation - about 10khz, or 90% of the cq */
 #define MCX_CQ_MOD_PERIOD  50
@@ -2331,6 +2332,7 @@ struct mcx_softc {
unsigned int sc_calibration_gen;
struct timeout   sc_calibrate;
 
+   struct intrmap  *sc_intrmap;
struct mcx_queuessc_queues[MCX_MAX_QUEUES];
unsigned int sc_nqueues;
 
@@ -2716,7 +2718,11 @@ mcx_attach(struct device *parent, struct
ether_sprintf(sc->sc_ac.ac_enaddr));
 
msix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag);
-   sc->sc_nqueues = 1;
+   sc->sc_intrmap = intrmap_create(&sc->sc_dev, msix, MCX_MAX_QUEUES,
+   INTRMAP_POWEROF2);
+   sc->sc_nqueues = intrmap_count(sc->sc_intrmap);
+   KASSERT(sc->sc_nqueues > 0);
+   KASSERT(powerof2(sc->sc_nqueues));
 
strlcpy(ifp->if_xname, DEVNAME(sc), IFNAMSIZ);
ifp->if_softc = sc;
@@ -2786,8 +2792,9 @@ mcx_attach(struct device *parent, struct
}
snprintf(q->q_name, sizeof(q->q_name), "%s:%d",
DEVNAME(sc), i);
-   q->q_ihc = pci_intr_establish(sc->sc_pc, ih,
-   IPL_NET | IPL_MPSAFE, mcx_cq_intr, q, q->q_name);
+   q->q_ihc = pci_intr_establish_cpu(sc->sc_pc, ih,
+   IPL_NET | IPL_MPSAFE, intrmap_cpu(sc->sc_intrmap, i),
+   mcx_cq_intr, q, q->q_name);
}
 
timeout_set(&sc->sc_calibrate, mcx_calibrate, sc);
Index: files.pci
===
RCS file: /cvs/src/sys/dev/pci/files.pci,v
retrieving revision 1.350
diff -u -p -r1.350 files.pci
--- files.pci   14 Jul 2020 04:10:18 -  1.350
+++ files.pci   14 Jul 2020 04:49:36 -
@@ -831,7 +831,7 @@ attach  bnxt at pci
 file   dev/pci/if_bnxt.c   bnxt
 
 # Mellanox ConnectX-4 and later
-device  mcx: ether, ifnet, ifmedia, stoeplitz
+device  mcx: ether, ifnet, ifmedia, stoeplitz, intrmap
 attach  mcx at pci
 filedev/pci/if_mcx.cmcx
 



Re: multiple rings and cpus for ix(4)

2020-06-17 Thread Jonathan Matthew
On Wed, Jun 17, 2020 at 12:50:46PM +0200, Hrvoje Popovski wrote:
> On 17.6.2020. 12:45, Hrvoje Popovski wrote:
> > On 17.6.2020. 11:27, Hrvoje Popovski wrote:
> >> On 17.6.2020. 10:36, David Gwynne wrote:
> >>> this is an updated version of a diff from christiano haesbaert by way of
> >>> mpi@ to enable the use of multiple tx and rx rings with msi-x.
> >>>
> >>> the high level description is that that driver checks to see if msix is
> >>> available, and if so how many vectors it has. it then gets an intrmap
> >>> based on that information, and bumps the number of queues to the number
> >>> of cpus that intrmap says are available.
> >>>
> >>> once the queues are allocated, it then iterates over them and wires up
> >>> interrupts to the cpus provided by the intrmap.
> >>>
> >>> im happy for people to try this out, but i can't commit it until all the
> >>> architectures that ix(4) is enabled on support the APIs that it's using.
> >>> this basically means it'll work on amd64 (and a little bit on i386), but
> >>> not much else. please hold back your tears and cries of anguish.
> >>>
> >>> thanks to christiano and mpi for doing most of the work leading up to
> >>> this diff :)
> >>
> >> Hi,
> >>
> >> first, thank you all for mq work :)
> >>
> >> with this diff, if i'm sending traffic over ix and at the same time
> >> execute ifconfig ix down/up, forwarding stops until i stop generator,
> >> wait for few seconds and execute ifconfig ix down/up few times and than
> >> forwarding start normally
> > 
> 
> 
> in vmstat i should see ix0:0-5 and ix1:0-5 ?

vmstat -i only shows interrupts that have actually fired. Use -zi to show
all interrupts.

This diff doesn't set up RSS, so received packets will only go to the first
vector, which is why only one of the ix1 interrupts has fired. Outgoing
packets are scattered across the tx queues, so all the ix0 interrupts have
fired.
 
> 
> r620-1# vmstat -i
> interrupt   total rate
> irq0/clock3985752  599
> irq0/ipi  3462063  520
> irq144/acpi040
> irq114/ix0:0  8042709 1209
> irq115/ix0:1  2906070  437
> irq116/ix0:2  1975350  297
> irq117/ix0:3849089681   127721
> irq118/ix0:4  4441608  668
> irq119/ix0:5  4330871  651
> irq120/ix0 100
> irq121/ix1:0 43209056 6499
> irq127/ix1 160
> irq97/mfi0  368465
> irq132/ixl2 70
> irq133/ixl2:0 4590
> irq134/ixl3 70
> irq135/ixl3:0 4510
> irq99/ehci0   1390
> irq136/em0  186372
> irq137/em14510
> irq100/ehci1   280
> irq101/ahci010
> irq146/com1  44110
> Total   921504627   138613
> 



urtwn(4) hardware crypto

2020-06-05 Thread Jonathan Matthew
This enables use of hardware crypto for CCMP in urtwn(4). As with other
drivers, this reduces cpu usage significantly when moving lots of data.
I've tested this on an assortment of hardware (RTL8188CUS, RTL8188EU,
RTL8192EU) with no problems, and this is one of the few things that
remains constant across a lot of Realtek wifi chips, but some wider
testing couldn't hurt. Since this touches the code shared with rtwn(4),
I've also tested that that still works.


Index: ic/r92creg.h
===
RCS file: /cvs/src/sys/dev/ic/r92creg.h,v
retrieving revision 1.24
diff -u -p -r1.24 r92creg.h
--- ic/r92creg.h11 Mar 2019 06:19:33 -  1.24
+++ ic/r92creg.h5 Jun 2020 11:52:21 -
@@ -688,6 +688,16 @@
 #define R92C_CAMCMD_CLR0x4000
 #define R92C_CAMCMD_POLLING0x8000
 
+/* Bits for R92C_SECCFG. */
+#define R92C_SECCFG_TXUCKEY_DEF 0x0001
+#define R92C_SECCFG_RXUCKEY_DEF0x0002
+#define R92C_SECCFG_TXENC_ENA  0x0004
+#define R92C_SECCFG_RXENC_ENA  0x0008
+#define R92C_SECCFG_CMP_A2 0x0010
+#define R92C_SECCFG_MC_SRCH_DIS0x0020
+#define R92C_SECCFG_TXBCKEY_DEF 0x0040
+#define R92C_SECCFG_RXBCKEY_DEF 0x0080
+
 /* IMR */
  
 /*Beacon DMA interrupt 6 */
Index: ic/rtwn.c
===
RCS file: /cvs/src/sys/dev/ic/rtwn.c,v
retrieving revision 1.49
diff -u -p -r1.49 rtwn.c
--- ic/rtwn.c   9 Jan 2020 14:35:19 -   1.49
+++ ic/rtwn.c   5 Jun 2020 11:52:22 -
@@ -3154,6 +3154,14 @@ rtwn_init(struct ifnet *ifp)
/* Clear per-station keys table. */
rtwn_cam_init(sc);
 
+   /* Enable decryption / encryption. */
+   if (sc->chip & RTWN_CHIP_USB) {
+   rtwn_write_2(sc, R92C_SECCFG,
+   R92C_SECCFG_TXUCKEY_DEF | R92C_SECCFG_RXUCKEY_DEF |
+   R92C_SECCFG_TXENC_ENA | R92C_SECCFG_RXENC_ENA |
+   R92C_SECCFG_TXBCKEY_DEF | R92C_SECCFG_RXBCKEY_DEF);
+   }
+
/* Enable hardware sequence numbering. */
rtwn_write_1(sc, R92C_HWSEQ_CTRL, 0xff);
 
@@ -3204,14 +3212,14 @@ rtwn_init(struct ifnet *ifp)
ifq_clr_oactive(&ifp->if_snd);
ifp->if_flags |= IFF_RUNNING;
 
-#ifdef notyet
-   if (ic->ic_flags & IEEE80211_F_WEPON) {
+   if ((ic->ic_flags & IEEE80211_F_WEPON) &&
+   (sc->chip & RTWN_CHIP_USB)) {
/* Install WEP keys. */
for (i = 0; i < IEEE80211_WEP_NKID; i++)
ic->ic_set_key(ic, NULL, &ic->ic_nw_keys[i]);
sc->sc_ops.wait_async(sc->sc_ops.cookie);
}
-#endif
+
if (ic->ic_opmode == IEEE80211_M_MONITOR)
ieee80211_new_state(ic, IEEE80211_S_RUN, -1);
else
Index: usb/if_urtwn.c
===
RCS file: /cvs/src/sys/dev/usb/if_urtwn.c,v
retrieving revision 1.89
diff -u -p -r1.89 if_urtwn.c
--- usb/if_urtwn.c  26 May 2020 06:04:30 -  1.89
+++ usb/if_urtwn.c  5 Jun 2020 11:52:22 -
@@ -490,10 +490,8 @@ urtwn_attach(struct device *parent, stru
 
ic->ic_updateslot = urtwn_updateslot;
ic->ic_updateedca = urtwn_updateedca;
-#ifdef notyet
ic->ic_set_key = urtwn_set_key;
ic->ic_delete_key = urtwn_delete_key;
-#endif
/* Override state transition machine. */
ic->ic_newstate = urtwn_newstate;
 
@@ -1035,6 +1033,10 @@ urtwn_set_key(struct ieee80211com *ic, s
struct urtwn_softc *sc = (struct urtwn_softc *)self;
struct urtwn_cmd_key cmd;
 
+   /* Only handle keys for CCMP */
+   if (k->k_cipher != IEEE80211_CIPHER_CCMP)
+   return ieee80211_set_key(ic, ni, k);
+
/* Defer setting of WEP keys until interface is brought up. */
if ((ic->ic_if.if_flags & (IFF_UP | IFF_RUNNING)) !=
(IFF_UP | IFF_RUNNING))
@@ -1065,6 +1067,12 @@ urtwn_delete_key(struct ieee80211com *ic
struct urtwn_softc *sc = (struct urtwn_softc *)self;
struct urtwn_cmd_key cmd;
 
+   /* Only handle keys for CCMP */
+   if (k->k_cipher != IEEE80211_CIPHER_CCMP) {
+   ieee80211_delete_key(ic, ni, k);
+   return;
+   }
+
if (!(ic->ic_if.if_flags & IFF_RUNNING) ||
ic->ic_state != IEEE80211_S_RUN)
return; /* Nothing to do. */
@@ -1084,6 +1092,52 @@ urtwn_delete_key_cb(struct urtwn_softc *
rtwn_delete_key(ic, cmd->ni, &cmd->key);
 }
 
+int
+urtwn_ccmp_decap(struct urtwn_softc *sc, struct mbuf *m,
+struct ieee80211_node *ni)
+{
+   struct ieee80211com *ic = &sc->sc_sc.sc_ic;
+   struct ieee80211_key *k;
+   struct ieee80211_frame *wh;
+   uint64_t pn, *prsc;
+   uint8_t *ivp;
+   uint8_t tid;
+   int hdrlen, hasqos;
+
+   k = ieee80211_get_rxkey(ic, m, ni);
+   if (k == NULL)
+   return 1;
+
+   wh = mtod(m, struct ieee80211_frame *);
+   hdrlen = iee

mcx(4) vlan offload

2020-05-28 Thread Jonathan Matthew
This implements vlan offload in mcx(4).  vlan stripping is fairly
straightforward, as the nic just removes the tag and populates a field in
the completion queue entry.  vlan insertion is a bit funny, as the nic doesn't
do any of the work here at all.  The driver has to copy at least the L2 headers
of the packet into the send queue entry, so it can insert a tag into a
previously untagged packet while it's doing that, and somewhat lower cost
than shuffling the packet data around in an mbuf.

I've tested that this doesn't break tcp or udp checksums on vlan-tagged
packets (including udp fragments).

ok?

Index: if_mcx.c
===
RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
retrieving revision 1.48
diff -u -p -r1.48 if_mcx.c
--- if_mcx.c27 May 2020 04:03:20 -  1.48
+++ if_mcx.c28 May 2020 09:30:31 -
@@ -18,6 +18,7 @@
  */
 
 #include "bpfilter.h"
+#include "vlan.h"
 
 #include 
 #include 
@@ -92,6 +93,7 @@
((1 << MCX_LOG_FLOW_TABLE_SIZE) - MCX_NUM_STATIC_FLOWS)
 
 #define MCX_SQ_INLINE_SIZE  18
+CTASSERT(ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN == MCX_SQ_INLINE_SIZE);
 
 /* doorbell offsets */
 #define MCX_CQ_DOORBELL_OFFSET  0
@@ -1258,6 +1260,8 @@ struct mcx_cq_entry {
 #define MCX_CQ_ENTRY_FLAGS_L4_OK   (1 << 26)
 #define MCX_CQ_ENTRY_FLAGS_L3_OK   (1 << 25)
 #define MCX_CQ_ENTRY_FLAGS_L2_OK   (1 << 24)
+#define MCX_CQ_ENTRY_FLAGS_CV  (1 << 16)
+#define MCX_CQ_ENTRY_FLAGS_VLAN_MASK   (0x)
 
uint32_tcq_lro_srqn;
uint32_t__reserved__[2];
@@ -2363,6 +2367,9 @@ mcx_attach(struct device *parent, struct
ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 |
IFCAP_CSUM_UDPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv4 |
IFCAP_CSUM_TCPv6;
+#if NVLAN > 0
+   ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
+#endif
IFQ_SET_MAXLEN(&ifp->if_snd, 1024);
 
ifmedia_init(&sc->sc_media, IFM_IMASK, mcx_media_change,
@@ -4013,6 +4020,7 @@ mcx_create_rq(struct mcx_softc *sc, int 
struct mcx_rq_ctx *mbin;
int error;
uint64_t *pas;
+   uint32_t rq_flags;
uint8_t *doorbell;
int insize, npages, paslen, token;
 
@@ -4044,7 +4052,11 @@ mcx_create_rq(struct mcx_softc *sc, int 
goto free;
}
mbin = (struct mcx_rq_ctx *)(((char 
*)mcx_cq_mbox_data(mcx_cq_mbox(&mxm, 0))) + 0x10);
-   mbin->rq_flags = htobe32(MCX_RQ_CTX_RLKEY | MCX_RQ_CTX_VLAN_STRIP_DIS);
+   rq_flags = MCX_RQ_CTX_RLKEY;
+#if NVLAN == 0
+   rq_flags |= MCX_RQ_CTX_VLAN_STRIP_DIS;
+#endif
+   mbin->rq_flags = htobe32(rq_flags);
mbin->rq_cqn = htobe32(cqn);
mbin->rq_wq.wq_type = MCX_WQ_CTX_TYPE_CYCLIC;
mbin->rq_wq.wq_pd = htobe32(sc->sc_pd);
@@ -5697,6 +5709,13 @@ mcx_process_rx(struct mcx_softc *sc, str
if (flags & MCX_CQ_ENTRY_FLAGS_L4_OK)
m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK |
M_UDP_CSUM_IN_OK;
+#if NVLAN > 0
+   if (flags & MCX_CQ_ENTRY_FLAGS_CV) {
+   m->m_pkthdr.ether_vtag = (flags &
+   MCX_CQ_ENTRY_FLAGS_VLAN_MASK);
+   m->m_flags |= M_VLANTAG;
+   }
+#endif
 
if (c->c_tdiff) {
uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
@@ -6369,9 +6388,26 @@ mcx_start(struct ifqueue *ifq)
csum |= MCX_SQE_L4_CSUM;
sqe->sqe_mss_csum = htobe32(csum);
sqe->sqe_inline_header_size = htobe16(MCX_SQ_INLINE_SIZE);
-   m_copydata(m, 0, MCX_SQ_INLINE_SIZE,
-   (caddr_t)sqe->sqe_inline_headers);
-   m_adj(m, MCX_SQ_INLINE_SIZE);
+#if NVLAN > 0
+   if (m->m_flags & M_VLANTAG) {
+   struct ether_vlan_header *evh;
+   evh = (struct ether_vlan_header *)
+   &sqe->sqe_inline_headers;
+
+   /* slightly cheaper vlan_inject() */
+   m_copydata(m, 0, ETHER_HDR_LEN, (caddr_t)evh);
+   evh->evl_proto = evh->evl_encap_proto;
+   evh->evl_encap_proto = htons(ETHERTYPE_VLAN);
+   evh->evl_tag = htons(m->m_pkthdr.ether_vtag);
+
+   m_adj(m, ETHER_HDR_LEN);
+   } else
+#endif
+   {
+   m_copydata(m, 0, MCX_SQ_INLINE_SIZE,
+   (caddr_t)sqe->sqe_inline_headers);
+   m_adj(m, MCX_SQ_INLINE_SIZE);
+   }
 
if (mcx_load_mbuf(sc, ms, m) != 0) {
m_freem(m);



vmx(4) msi-x

2020-05-25 Thread Jonathan Matthew
This prepares vmx(4) for multi-queue operation, first by making use of msi-x
where available, and second by rearranging the queue structures to fit the
direction we're heading in.  As with other drivers, here I'm reserving msi-x
vector 0 for events, then mapping tx/rx queues to the subsequent vectors.
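
Concretely, the per-queue hookup the diff is heading towards looks something
like this (a sketch assembled from the structures below, not the diff
verbatim; error handling and the vector 0 establishment are omitted):

	/* vector 0 handles events, queue i interrupts on vector i + 1 */
	for (i = 0; i < sc->sc_nqueues; i++) {
		struct vmxnet3_queue *q = &sc->sc_q[i];
		int vec = i + 1;

		q->sc = sc;
		q->intr = vec;
		if (pci_intr_map_msix(pa, vec, &ih) != 0)
			break;
		snprintf(q->intrname, sizeof(q->intrname), "%s:%d",
		    sc->sc_dev.dv_xname, i);
		sc->sc_qih[i] = pci_intr_establish(pa->pa_pc, ih,
		    IPL_NET | IPL_MPSAFE, vmxnet3_intr_queue, q, q->intrname);
	}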

Aside from the interrupt setup itself, the only change in behaviour here is
that queue setup is done after interrupt setup, as we'll need to know what type
of interrupt we're using to decide how many queues to use.  This is how other
vmx drivers work, so this must be safe.

I've tested this with esxi 6.7 and qemu.  Can someone try this out on vmware
workstation or player please?  I wouldn't expect those to be any different to
esxi in this respect, but it's always possible.

ok?

Index: if_vmx.c
===
RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v
retrieving revision 1.55
diff -u -p -r1.55 if_vmx.c
--- if_vmx.c27 Oct 2019 22:24:40 -  1.55
+++ if_vmx.c25 May 2020 09:35:33 -
@@ -42,8 +42,7 @@
 #include 
 #include 
 
-#define NRXQUEUE 1
-#define NTXQUEUE 1
+#define VMX_MAX_QUEUES 1
 
 #define NTXDESC 512 /* tx ring size */
 #define NTXSEGS 8 /* tx descriptors per packet */
@@ -95,6 +94,7 @@ struct vmxnet3_txqueue {
struct vmxnet3_txring cmd_ring;
struct vmxnet3_comp_ring comp_ring;
struct vmxnet3_txq_shared *ts;
+   struct ifqueue *ifq;
 };
 
 struct vmxnet3_rxqueue {
@@ -103,6 +103,14 @@ struct vmxnet3_rxqueue {
struct vmxnet3_rxq_shared *rs;
 };
 
+struct vmxnet3_queue {
+   struct vmxnet3_txqueue tx;
+   struct vmxnet3_rxqueue rx;
+   struct vmxnet3_softc *sc;
+   char intrname[8];
+   int intr;
+};
+
 struct vmxnet3_softc {
struct device sc_dev;
struct arpcom sc_arpcom;
@@ -114,9 +122,11 @@ struct vmxnet3_softc {
bus_space_handle_t sc_ioh1;
bus_dma_tag_t sc_dmat;
void *sc_ih;
+   void *sc_qih[VMX_MAX_QUEUES];
+   int sc_nintr;
+   int sc_nqueues;
 
-   struct vmxnet3_txqueue sc_txq[NTXQUEUE];
-   struct vmxnet3_rxqueue sc_rxq[NRXQUEUE];
+   struct vmxnet3_queue sc_q[VMX_MAX_QUEUES];
struct vmxnet3_driver_shared *sc_ds;
u_int8_t *sc_mcast;
 };
@@ -153,8 +163,8 @@ struct {
 int vmxnet3_match(struct device *, void *, void *);
 void vmxnet3_attach(struct device *, struct device *, void *);
 int vmxnet3_dma_init(struct vmxnet3_softc *);
-int vmxnet3_alloc_txring(struct vmxnet3_softc *, int);
-int vmxnet3_alloc_rxring(struct vmxnet3_softc *, int);
+int vmxnet3_alloc_txring(struct vmxnet3_softc *, int, int);
+int vmxnet3_alloc_rxring(struct vmxnet3_softc *, int, int);
 void vmxnet3_txinit(struct vmxnet3_softc *, struct vmxnet3_txqueue *);
 void vmxnet3_rxinit(struct vmxnet3_softc *, struct vmxnet3_rxqueue *);
 void vmxnet3_txstop(struct vmxnet3_softc *, struct vmxnet3_txqueue *);
@@ -164,6 +174,8 @@ void vmxnet3_enable_all_intrs(struct vmx
 void vmxnet3_disable_all_intrs(struct vmxnet3_softc *);
 int vmxnet3_intr(void *);
 int vmxnet3_intr_intx(void *);
+int vmxnet3_intr_event(void *);
+int vmxnet3_intr_queue(void *);
 void vmxnet3_evintr(struct vmxnet3_softc *);
 void vmxnet3_txintr(struct vmxnet3_softc *, struct vmxnet3_txqueue *);
 void vmxnet3_rxintr(struct vmxnet3_softc *, struct vmxnet3_rxqueue *);
@@ -212,6 +224,7 @@ vmxnet3_attach(struct device *parent, st
u_int memtype, ver, macl, mach, intrcfg;
u_char enaddr[ETHER_ADDR_LEN];
int (*isr)(void *);
+   int i;
 
memtype = pci_mapreg_type(pa->pa_pc, pa->pa_tag, 0x10);
if (pci_mapreg_map(pa, 0x10, memtype, 0, &sc->sc_iot0, &sc->sc_ioh0,
@@ -241,18 +254,22 @@ vmxnet3_attach(struct device *parent, st
WRITE_BAR1(sc, VMXNET3_BAR1_UVRS, 1);
 
sc->sc_dmat = pa->pa_dmat;
-   if (vmxnet3_dma_init(sc)) {
-   printf(": failed to setup DMA\n");
-   return;
-   }
 
WRITE_CMD(sc, VMXNET3_CMD_GET_INTRCFG);
intrcfg = READ_BAR1(sc, VMXNET3_BAR1_CMD);
isr = vmxnet3_intr;
+   sc->sc_nintr = 0;
+   sc->sc_nqueues = 1;
 
switch (intrcfg & VMXNET3_INTRCFG_TYPE_MASK) {
case VMXNET3_INTRCFG_TYPE_AUTO:
case VMXNET3_INTRCFG_TYPE_MSIX:
+   if (pci_intr_map_msix(pa, 0, &ih) == 0) {
+   isr = vmxnet3_intr_event;
+   sc->sc_nintr = sc->sc_nqueues + 1;
+   break;
+   }
+
/* FALLTHROUGH */
case VMXNET3_INTRCFG_TYPE_MSI:
if (pci_intr_map_msi(pa, &ih) == 0)
@@ -273,6 +290,35 @@ vmxnet3_attach(struct device *parent, st
if (intrstr)
printf(": %s", intrstr);
 
+   if (sc->sc_nintr > 1) {
+   for (i = 0; i < sc->sc_nqueues; i++) {
+   struct vmxnet3_queue *q;
+   int vec;
+
+   q = &sc->sc_q[i];
+   vec = i

mcx(4) checksum offload

2020-05-18 Thread Jonathan Matthew
So far I've completely ignored offloads in the ethernet drivers I've
written, but on having a quick look at the documentation I found that
mcx(4) checksum offload is extremely easy to use, and some simple testing
suggests that it helps quite a bit.  I've seen tcpbench receive throughput
increase by around 15%.

The nic supports all the checksum offloads we know about, reports checksum
status for every packet without being asked to, and can figure out packet
header lengths etc. for itself, so on the tx side, the driver just sets
some flags to say "checksum this for me please", and on the rx side, it
looks at two bits in the completion queue entry.

I'm mostly sending this out to see if anyone can gather any interesting
performance numbers.


Index: if_mcx.c
===
RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
retrieving revision 1.44
diff -u -p -u -p -r1.44 if_mcx.c
--- if_mcx.c24 Apr 2020 07:28:37 -  1.44
+++ if_mcx.c18 May 2020 10:22:32 -
@@ -1255,6 +1292,10 @@ struct mcx_cq_entry {
uint32_tcq_checksum;
uint32_t__reserved__;
uint32_tcq_flags;
+#define MCX_CQ_ENTRY_FLAGS_L4_OK   (1 << 26)
+#define MCX_CQ_ENTRY_FLAGS_L3_OK   (1 << 25)
+#define MCX_CQ_ENTRY_FLAGS_L2_OK   (1 << 24)
+
uint32_tcq_lro_srqn;
uint32_t__reserved__[2];
uint32_tcq_byte_cnt;
@@ -2355,7 +2396,9 @@ mcx_attach(struct device *parent, struct
ifp->if_qstart = mcx_start;
ifp->if_watchdog = mcx_watchdog;
ifp->if_hardmtu = sc->sc_hardmtu;
-   ifp->if_capabilities = IFCAP_VLAN_MTU;
+   ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 |
+   IFCAP_CSUM_UDPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv4 |
+   IFCAP_CSUM_TCPv6;
IFQ_SET_MAXLEN(&ifp->if_snd, 1024);
 
ifmedia_init(&sc->sc_media, IFM_IMASK, mcx_media_change,
@@ -5662,6 +5966,7 @@ mcx_process_rx(struct mcx_softc *sc, str
struct mcx_slot *ms;
struct mbuf *m;
int slot;
+   uint32_t flags;
 
slot = betoh16(cqe->cq_wqe_count) % (1 << MCX_LOG_RQ_SIZE);
 
@@ -5680,6 +5985,13 @@ mcx_process_rx(struct mcx_softc *sc, str
betoh32(cqe->cq_rx_hash);
}
 
+   flags = bemtoh32(&cqe->cq_flags);
+   if (flags & MCX_CQ_ENTRY_FLAGS_L3_OK)
+   m->m_pkthdr.csum_flags = M_IPV4_CSUM_IN_OK;
+   if (flags & MCX_CQ_ENTRY_FLAGS_L4_OK)
+   m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK |
+   M_UDP_CSUM_IN_OK;
+
if (c->c_tdiff) {
uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
t *= c->c_udiff;
@@ -6343,6 +6657,7 @@ mcx_start(struct ifqueue *ifq)
sqe->sqe_signature = htobe32(MCX_SQE_CE_CQE_ALWAYS);
 
/* eth segment */
+   sqe->sqe_mss_csum = htobe32(MCX_SQE_L3_CSUM | MCX_SQE_L4_CSUM);
sqe->sqe_inline_header_size = htobe16(MCX_SQ_INLINE_SIZE);
m_copydata(m, 0, MCX_SQ_INLINE_SIZE,
(caddr_t)sqe->sqe_inline_headers);



msi-x for ixl(4)

2020-04-27 Thread Jonathan Matthew
This makes ixl(4) use MSI-X where available.  The hardware is set up
for the same kind of approach as we're heading towards in em(4) and
ix(4) - interrupts for admin commands and events (link state etc.)
can only be delivered to vector 0, and the natural approach is to
map rx and tx queues to other vectors, so that's what I've done here.

The driver was already set up for multiple rx/tx queues (though it still
only uses one), so the diff sets up one vector per queue.  The vector
setup here involves creating linked lists of interrupt causes, which
are identified by a queue type (tx or rx) and a queue index.  The
queues also need to be told which msix vector they interrupt on.
This is done through per-vector and per-queue registers.
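
As a rough illustration of that structure (the helper names here are
hypothetical, invented for the sketch; the real code programs the
per-vector list-head register and the per-queue rx/tx cause registers
directly):

	/*
	 * Hypothetical helpers, illustration only: msix vector n >= 1
	 * heads a two-entry cause list for queue n - 1.  The rx cause
	 * chains to the tx cause, which ends the list, and both causes
	 * are told to raise vector n.
	 */
	for (vec = 1; vec < nvectors; vec++) {
		int qidx = vec - 1;

		/* per-vector register: the list starts at the rx cause */
		ixl_set_list_head(sc, vec, IXL_CAUSE_RX, qidx);
		/* per-queue registers: vector to raise, plus next cause */
		ixl_set_cause(sc, IXL_CAUSE_RX, qidx, vec, IXL_CAUSE_TX, qidx);
		ixl_set_cause_eol(sc, IXL_CAUSE_TX, qidx, vec);
	}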

I've tested this with and without msix on amd64 with this nic:
ixl0 at pci14 dev 0 function 0 "Intel X710 SFP+" rev 0x02: port 3, FW 6.0.48754 
API 1.7, msix, 1 queue

ok?

Index: if_ixl.c
===
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.47
diff -u -p -r1.47 if_ixl.c
--- if_ixl.c22 Apr 2020 07:09:40 -  1.47
+++ if_ixl.c28 Apr 2020 00:24:02 -
@@ -1092,6 +1092,13 @@ struct ixl_atq {
 };
 SIMPLEQ_HEAD(ixl_atq_list, ixl_atq);
 
+struct ixl_queue_intr {
+   struct ixl_softc*sc;
+   int  queue;
+   void*ihc;
+   char name[8];
+};
+
 struct ixl_softc {
struct devicesc_dev;
struct arpcomsc_ac;
@@ -1103,6 +1110,7 @@ struct ixl_softc {
pci_intr_handle_tsc_ih;
void*sc_ihc;
pcitag_t sc_tag;
+   struct ixl_queue_intr   *sc_qintr;
 
bus_dma_tag_tsc_dmat;
bus_space_tag_t  sc_memt;
@@ -1160,6 +1168,8 @@ struct ixl_softc {
 static voidixl_clear_hw(struct ixl_softc *);
 static int ixl_pf_reset(struct ixl_softc *);
 
+static int ixl_setup_msix(struct ixl_softc *, struct pci_attach_args *);
+
 static int ixl_dmamem_alloc(struct ixl_softc *, struct ixl_dmamem *,
bus_size_t, u_int);
 static voidixl_dmamem_free(struct ixl_softc *, struct ixl_dmamem *);
@@ -1214,7 +1224,8 @@ static void   ixl_media_status(struct ifne
 static voidixl_watchdog(struct ifnet *);
 static int ixl_ioctl(struct ifnet *, u_long, caddr_t);
 static voidixl_start(struct ifqueue *);
-static int ixl_intr(void *);
+static int ixl_intr0(void *);
+static int ixl_intr_queue(void *);
 static int ixl_up(struct ixl_softc *);
 static int ixl_down(struct ixl_softc *);
 static int ixl_iff(struct ixl_softc *);
@@ -1524,13 +1535,24 @@ ixl_attach(struct device *parent, struct
goto shutdown;
}
 
-   if (pci_intr_map_msi(pa, &sc->sc_ih) != 0 &&
-   pci_intr_map(pa, &sc->sc_ih) != 0) {
-   printf(", unable to map interrupt\n");
-   goto shutdown;
+   if (pci_intr_map_msix(pa, 0, &sc->sc_ih) == 0) {
+   sc->sc_qintr = mallocarray(sizeof(struct ixl_queue_intr),
+   ixl_nqueues(sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO);
+   if (sc->sc_qintr == NULL) {
+   printf(", unable to allocate queue interrupts\n");
+   goto shutdown;
+   }
+   } else {
+   if (pci_intr_map_msi(pa, &sc->sc_ih) != 0 &&
+   pci_intr_map(pa, &sc->sc_ih) != 0) {
+   printf(", unable to map interrupt\n");
+   goto shutdown;
+   }
}
 
-   printf(", %s, address %s\n", pci_intr_string(sc->sc_pc, sc->sc_ih),
+   printf(", %s, %d queue%s, address %s\n",
+   pci_intr_string(sc->sc_pc, sc->sc_ih), ixl_nqueues(sc),
+   (ixl_nqueues(sc) > 1 ? "s" : ""),
ether_sprintf(sc->sc_ac.ac_enaddr));
 
if (ixl_hmc(sc) != 0) {
@@ -1585,13 +1607,18 @@ ixl_attach(struct device *parent, struct
}
 
sc->sc_ihc = pci_intr_establish(sc->sc_pc, sc->sc_ih,
-   IPL_NET | IPL_MPSAFE, ixl_intr, sc, DEVNAME(sc));
+   IPL_NET | IPL_MPSAFE, ixl_intr0, sc, DEVNAME(sc));
if (sc->sc_ihc == NULL) {
printf("%s: unable to establish interrupt handler\n",
DEVNAME(sc));
goto free_scratch;
}
 
+   if (ixl_setup_msix(sc, pa) != 0) {
+   /* error printed by ixl_setup_msix */
+   goto free_scratch;
+   }
+
ifp->if_softc = sc;
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_xflags = IFXF_MPSAFE;
@@ -1667,6 +1694,9 @@ shutdown:
BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
 
ixl_arq_unfill(sc);
+
+   free(sc->sc_qintr, M_DEVBUF, ixl_nqueues(sc) *
+   sizeof(struct ixl_queue_intr));
 free_arq:
ixl_dmamem_free(sc, &sc->sc_arq);
 f
