umb: delete old v4 address before adding the new one
Currently when a umb device gets a new IPv4 address, it just adds it to the interface and tries to set the default route through it. If there's a different previous address, it stays there, and any routes using it also remain in place, so umb can't set the new default route. You see something like this: umb0: unable to set IPv4 default route, error 17 and then you no longer have a working default route. The diff below removes the old address before setting the new one. SIOCDIFADDR with no details in the request will delete whatever v4 address exists on the interface, and consequently remove routes, so umb will then be able to add the new interface and set the new default route. With this diff, I've had an ssh session inside a wg tunnel survive across 5 address changes on the umb interface that carries the wireguard traffic. Without, wireguard stops working after the first address change and my ssh session goes with it. ok? Index: if_umb.c === RCS file: /cvs/src/sys/dev/usb/if_umb.c,v retrieving revision 1.55 diff -u -p -r1.55 if_umb.c --- if_umb.c1 Sep 2023 20:24:29 - 1.55 +++ if_umb.c11 Oct 2023 23:18:50 - @@ -1815,6 +1815,14 @@ umb_add_inet_config(struct umb_softc *sc int rv; memset(&ifra, 0, sizeof (ifra)); + rv = in_ioctl(SIOCDIFADDR, (caddr_t)&ifra, ifp, 1); + if (rv != 0 && rv != EADDRNOTAVAIL) { + printf("%s: unable to delete IPv4 address, error %d\n", + DEVNAM(ifp->if_softc), rv); + return rv; + } + + memset(&ifra, 0, sizeof (ifra)); sin = &ifra.ifra_addr; sin->sin_family = AF_INET; sin->sin_len = sizeof (*sin);
ypldap: stop flattening trees
ypldap currently packs all the user and group lines into contiguous blocks of memory so it can move from one entry to the next by pointer arithmetic. This doesn't make much sense because the entries are also in red black trees (that's how it looks up the entry in the first place) and RB_NEXT() is not slow. The one piece of the tree flattening code that seems worth keeping is strdup()ing the netid lines so they don't take 1kB per user. ok? Index: entries.c === RCS file: /cvs/src/usr.sbin/ypldap/entries.c,v retrieving revision 1.6 diff -u -p -u -p -r1.6 entries.c --- entries.c 18 Jul 2023 13:06:33 - 1.6 +++ entries.c 20 Sep 2023 07:17:00 - @@ -34,86 +34,6 @@ #include #include "ypldap.h" -#include "log.h" - -void -flatten_entries(struct env *env) -{ - size_t len; - char*linep; - char*endp; - char*tmp; - struct userent *ue; - struct groupent *ge; - - log_debug("flattening trees"); - /* -* This takes all the line pointers in RB elements and -* concatenates them in a single string, to be able to -* implement next element lookup without tree traversal. -* -* An extra octet is alloced to make space for an additional NUL. -*/ - if ((linep = calloc(1, env->sc_user_line_len + 1)) == NULL) { - /* -* XXX: try allocating a smaller chunk of memory -*/ - fatal("out of memory"); - } - endp = linep; - - RB_FOREACH(ue, user_name_tree, env->sc_user_names) { - /* -* we convert the first nul back to a column, -* copy the string and then convert it back to a nul. 
-*/ - ue->ue_line[strlen(ue->ue_line)] = ':'; - log_debug("pushing line: %s", ue->ue_line); - len = strlen(ue->ue_line) + 1; - memcpy(endp, ue->ue_line, len); - endp[strcspn(endp, ":")] = '\0'; - free(ue->ue_line); - ue->ue_line = endp; - endp += len; - - /* -* To save memory strdup(3) the netid_line which originally used -* LINE_WIDTH bytes -*/ - tmp = ue->ue_netid_line; - ue->ue_netid_line = strdup(tmp); - if (ue->ue_netid_line == NULL) { - fatal("out of memory"); - } - free(tmp); - } - env->sc_user_lines = linep; - log_debug("done pushing users"); - - if ((linep = calloc(1, env->sc_group_line_len + 1)) == NULL) { - /* -* XXX: try allocating a smaller chunk of memory -*/ - fatal("out of memory"); - } - endp = linep; - RB_FOREACH(ge, group_name_tree, env->sc_group_names) { - /* -* we convert the first nul back to a column, -* copy the string and then convert it back to a nul. -*/ - ge->ge_line[strlen(ge->ge_line)] = ':'; - log_debug("pushing line: %s", ge->ge_line); - len = strlen(ge->ge_line) + 1; - memcpy(endp, ge->ge_line, len); - endp[strcspn(endp, ":")] = '\0'; - free(ge->ge_line); - ge->ge_line = endp; - endp += len; - } - env->sc_group_lines = linep; - log_debug("done pushing groups"); -} int userent_name_cmp(struct userent *ue1, struct userent *ue2) Index: yp.c === RCS file: /cvs/src/usr.sbin/ypldap/yp.c,v retrieving revision 1.22 diff -u -p -u -p -r1.22 yp.c --- yp.c18 Jul 2023 13:06:33 - 1.22 +++ yp.c20 Sep 2023 07:17:00 - @@ -557,21 +557,25 @@ ypresp_key_val * ypproc_first_2_svc(ypreq_nokey *arg, struct svc_req *req) { static struct ypresp_key_valres; + struct userent *ue; + struct groupent *ge; if (yp_valid_domain(arg->domain, (struct ypresp_val *)&res) == -1) return (&res); if (strcmp(arg->map, "passwd.byname") == 0 || strcmp(arg->map, "master.passwd.byname") == 0) { - if (env->sc_user_lines == NULL) + ue = RB_MIN(user_name_tree, env->sc_user_names); + if (ue == NULL) return (NULL); - yp_make_keyval(&res, env->sc_user_lines, env->sc_user_lines); + 
yp_make_keyval(&res, ue->ue_line, ue->ue_line); } else if (strcmp(arg->map, "group.byname") == 0) { - if (env->sc_group_lines == NULL) + ge = RB_MIN(group_name_tree, env->sc_group_names); + if (ge == NULL) return (NULL); -
Re: Mellanox driver : add 100G_LR4 capability
On Fri, Sep 15, 2023 at 09:48:16AM +0200, Olivier Croquin wrote: > Hi, > > The media capability 100GBase_LR4 is not listed in the mcx driver. > > Could you please take a look at this short patch ? I found the value of 23 > in the Linux mlx driver. Thanks, I've committed it. > Is this enough to say that QSFP28 100GBase_LR are supported with the mcx > driver ? As much as anything else, sure.
Re: JH7110 PCIe device tree binding update
On Wed, Aug 30, 2023 at 01:19:42PM +0800, Kevin Lo wrote: > On Tue, Aug 29, 2023 at 09:15:41PM +0200, Mark Kettenis wrote: > > > > > Date: Tue, 29 Aug 2023 11:58:23 +0200 > > > From: Mark Kettenis > > > > > > Upstreaming of the JH7110 PCIe device tree bindings isn't finished > > > yet, but it seems some progress has been made and things have been > > > reviewed by some of the key people involved: > > > > > > https://patchwork.kernel.org/project/linux-pci/list/?series=779297 > > > > > > Here is a diff that adjusts the driver to the current state of things > > > such that we can use the latest device tree from: > > > > > > https://github.com/starfive-tech/linux/tree/JH7110_VisionFive2_upstream > > > > > > to continue development. The idea is to support the preliminary > > > bindings a little bit longer such that folks can update their device > > > trees. Will probably drop support for the preliminary bindings in a > > > few weeks. > > > > > > ok? > > > > patrick@ pointed out that the dv_unit check won't work properly if the > > first PCIe controller is disabled. So here is a diff that checks the > > device address instead like we do for dwqe(4). > > > > ok? > > ok kevlo@ > > Tested on my VisionFive 2 v1.3b with the device tree from: > > https://raw.githubusercontent.com/starfive-tech/linux/JH7110_VisionFive2_upstream/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2-v1.3b.dts > > It works fine, the NVMe is detected. > > BTW, I noticed that the memory statistics seem to be incorrect. > The VisionFive 2 is equipped with 8GB RAM. > > OpenBSD 7.3-current (GENERIC.MP) #0: Wed Aug 30 11:52:03 CST 2023 > kevlo@vf2:/usr/src/sys/arch/riscv64/compile/GENERIC.MP > real mem = 4294967296 (4096MB) > ^^^ > avail mem = 8110370816 (7734MB) > ^^^ riscv64 calculates physmem (the 'real mem' number) by adding up the ranges in the /memory device tree node, but uses the EFI memory map to set up UVM, which is where 'avail mem' comes from. 
Should riscv64 be more like arm64 here and calculate physmem by adding up the memreg segments, which are built from the EFI memory map if available, and the /memory node if not?
Re: JH7110 PCIe device tree binding update
On Wed, Aug 30, 2023 at 01:19:42PM +0800, Kevin Lo wrote: > On Tue, Aug 29, 2023 at 09:15:41PM +0200, Mark Kettenis wrote: > > > > > Date: Tue, 29 Aug 2023 11:58:23 +0200 > > > From: Mark Kettenis > > > > > > Upstreaming of the JH7110 PCIe device tree bindings isn't finished > > > yet, but it seems some progress has been made and things have been > > > reviewed by some of the key people involved: > > > > > > https://patchwork.kernel.org/project/linux-pci/list/?series=779297 > > > > > > Here is a diff that adjusts the driver to the current state of things > > > such that we can use the latest device tree from: > > > > > > https://github.com/starfive-tech/linux/tree/JH7110_VisionFive2_upstream > > > > > > to continue development. The idea is to support the preliminary > > > bindings a little bit longer such that folks can update their device > > > trees. Will probably drop support for the preliminary bindings in a > > > few weeks. > > > > > > ok? > > > > patrick@ pointed out that the dv_unit check won't work properly if the > > first PCIe controller is disabled. So here is a diff that checks the > > device address instead like we do for dwqe(4). > > > > ok? > > ok kevlo@ > > Tested on my VisionFive 2 v1.3b with the device tree from: > > https://raw.githubusercontent.com/starfive-tech/linux/JH7110_VisionFive2_upstream/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2-v1.3b.dts > > It works fine, the NVMe is detected. Also works on VisionFive 2 v1.3b with the older device tree here, ok jmatthew@
Re: all platforms: separate cpu_initclocks() from cpu_startclock()
On Sat, Aug 19, 2023 at 01:44:47PM -0500, Scott Cheloha wrote: > On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote: > > This is the next patch in the clock interrupt reorganization series. > > > > Before we continue breaking up the hardclock(9) we need to detour into > > the MD code. > > > > This patch divides the "initialization" parts of cpu_initclocks() from > > the "start the clock interrupt" parts. Separating the two parts leaves > > initclocks() an opportunity to prepare the primary CPU for clock > > interrupt dispatch in a machine-independent manner before actually > > pulling the trigger. It's nearly impossible to do any MI setup during > > initclocks() because cpu_initclocks() does everything in one go: both > > initialization and kickoff are done when cpu_initclocks() returns. > > > > Many platforms have a "cpu_startclock()" function, so this patch takes > > that de facto standard and makes it a rule: cpu_startclock() is now > > required. It is prototyped in sys/systm.h and every platform must > > implement it. > > > > The revised initclocks() sequence is then: > > > > 1. Call cpu_initclocks(). At minimum, cpu_initclocks() ensures > >hz, stathz, and profhz are initialized. All the machine > >independent setup in step (2) (currently) depends upon > >these machine-dependent values. > > > > 2. Compute intervals using hz, stathz, and profhz. > > > >In a later step I will move the full contents of clockintr_init() > >up into initclocks() and get rid of clockintr_init() entirely. > > > > 3. Call cpu_startclock(). At minimum, cpu_startclock() starts the > >clock interrupt dispatch cycle on the primary CPU. > > > > I have compiled/booted this patch on amd64 (lapic path), arm64, i386 > > (lapic path), macppc, octeon, and sparc64 (sun4v). > > > > I am looking for compile/boot tests on alpha, armv7, hppa, landisk, > > luna88k, powerpc64, and riscv64. I think armv7 is the tricky one > > here. 
Everything else is relatively straightforward, though I may > > have missed a few stray variables here or there. > > > > Test results? Ok? > > Here is an updated patch that removes several MD prototypes for > cpu_startclock() that I missed the first time through. > > I went back and tested these again: > > - amd64 (lapic) > - arm64 > - i386 (lapic) > - powerpc/macppc > - mips64/octeon (loongson should be fine) > - sparc64 (sys_tick; tick/stick should be fine) > > arm/armv7 and riscv64 were tested under the previous version, but I > would appreciate a second compile-test to make sure the header changes > in the updated patch did not break the build (CC phessler@, jsg@). Still builds on riscv64 and armv7.
Re: all platforms: separate cpu_initclocks() from cpu_startclock()
On Mon, Aug 14, 2023 at 06:24:14PM +1000, Jonathan Matthew wrote: > On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote: > > This is the next patch in the clock interrupt reorganization series. > > > > Before we continue breaking up the hardclock(9) we need to detour into > > the MD code. > > > > This patch divides the "initialization" parts of cpu_initclocks() from > > the "start the clock interrupt" parts. Separating the two parts leaves > > initclocks() an opportunity to prepare the primary CPU for clock > > interrupt dispatch in a machine-independent manner before actually > > pulling the trigger. It's nearly impossible to do any MI setup during > > initclocks() because cpu_initclocks() does everything in one go: both > > initialization and kickoff are done when cpu_initclocks() returns. > > > > Many platforms have a "cpu_startclock()" function, so this patch takes > > that de facto standard and makes it a rule: cpu_startclock() is now > > required. It is prototyped in sys/systm.h and every platform must > > implement it. > > > > The revised initclocks() sequence is then: > > > > 1. Call cpu_initclocks(). At minimum, cpu_initclocks() ensures > >hz, stathz, and profhz are initialized. All the machine > >independent setup in step (2) (currently) depends upon > >these machine-dependent values. > > > > 2. Compute intervals using hz, stathz, and profhz. > > > >In a later step I will move the full contents of clockintr_init() > >up into initclocks() and get rid of clockintr_init() entirely. > > > > 3. Call cpu_startclock(). At minimum, cpu_startclock() starts the > >clock interrupt dispatch cycle on the primary CPU. > > > > I have compiled/booted this patch on amd64 (lapic path), arm64, i386 > > (lapic path), macppc, octeon, and sparc64 (sun4v). > > > > I am looking for compile/boot tests on alpha, armv7, hppa, landisk, > > luna88k, powerpc64, and riscv64. I think armv7 is the tricky one > > here. 
Everything else is relatively straightforward, though I may > > have missed a few stray variables here or there. > > > > Test results? Ok? > > Compiles on armv7 and boots on an Allwinner A20 machine using agtimer(4). > I don't think I have any armv7 systems using other timer devices. > Also compiles and boots on riscv64 (visionfive 2).
Re: all platforms: separate cpu_initclocks() from cpu_startclock()
On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote: > This is the next patch in the clock interrupt reorganization series. > > Before we continue breaking up the hardclock(9) we need to detour into > the MD code. > > This patch divides the "initialization" parts of cpu_initclocks() from > the "start the clock interrupt" parts. Separating the two parts leaves > initclocks() an opportunity to prepare the primary CPU for clock > interrupt dispatch in a machine-independent manner before actually > pulling the trigger. It's nearly impossible to do any MI setup during > initclocks() because cpu_initclocks() does everything in one go: both > initialization and kickoff are done when cpu_initclocks() returns. > > Many platforms have a "cpu_startclock()" function, so this patch takes > that de facto standard and makes it a rule: cpu_startclock() is now > required. It is prototyped in sys/systm.h and every platform must > implement it. > > The revised initclocks() sequence is then: > > 1. Call cpu_initclocks(). At minimum, cpu_initclocks() ensures >hz, stathz, and profhz are initialized. All the machine >independent setup in step (2) (currently) depends upon >these machine-dependent values. > > 2. Compute intervals using hz, stathz, and profhz. > >In a later step I will move the full contents of clockintr_init() >up into initclocks() and get rid of clockintr_init() entirely. > > 3. Call cpu_startclock(). At minimum, cpu_startclock() starts the >clock interrupt dispatch cycle on the primary CPU. > > I have compiled/booted this patch on amd64 (lapic path), arm64, i386 > (lapic path), macppc, octeon, and sparc64 (sun4v). > > I am looking for compile/boot tests on alpha, armv7, hppa, landisk, > luna88k, powerpc64, and riscv64. I think armv7 is the tricky one > here. Everything else is relatively straightforward, though I may > have missed a few stray variables here or there. > > Test results? Ok? Compiles on armv7 and boots on an Allwinner A20 machine using agtimer(4). 
I don't think I have any armv7 systems using other timer devices.
ix(4) shouldn't crash on memory allocation failure
One of the problems described here: https://www.mail-archive.com/tech@openbsd.org/msg71790.html amounts to ix(4) not checking that it allocated a dma map before trying to free it. ok? Index: if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.197 diff -u -p -r1.197 if_ix.c --- if_ix.c 1 Jun 2023 09:05:33 - 1.197 +++ if_ix.c 7 Jul 2023 09:22:30 - @@ -3094,8 +3094,11 @@ ixgbe_free_receive_buffers(struct rx_rin m_freem(rxbuf->buf); rxbuf->buf = NULL; } - bus_dmamap_destroy(rxr->rxdma.dma_tag, rxbuf->map); - rxbuf->map = NULL; + if (rxbuf->map != NULL) { + bus_dmamap_destroy(rxr->rxdma.dma_tag, + rxbuf->map); + rxbuf->map = NULL; + } } free(rxr->rx_buffers, M_DEVBUF, sc->num_rx_desc * sizeof(struct ixgbe_rx_buf));
use if_register in dwge(4)
Like dwqe(4), dwge(4) should also register its instances for lookup by ofw node or phandle. ok? Index: if_dwge.c === RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v retrieving revision 1.17 diff -u -p -r1.17 if_dwge.c --- if_dwge.c 5 Jul 2023 18:48:49 - 1.17 +++ if_dwge.c 5 Jul 2023 19:04:18 - @@ -267,6 +267,8 @@ struct dwge_softc { bus_dma_tag_t sc_dmat; void*sc_ih; + struct if_devicesc_ifd; + struct arpcom sc_ac; #define sc_lladdr sc_ac.ac_enaddr struct mii_data sc_mii; @@ -634,6 +636,10 @@ dwge_attach(struct device *parent, struc dwge_intr, sc, sc->sc_dev.dv_xname); if (sc->sc_ih == NULL) printf("%s: can't establish interrupt\n", sc->sc_dev.dv_xname); + + sc->sc_ifd.if_node = faa->fa_node; + sc->sc_ifd.if_ifp = ifp; + if_register(&sc->sc_ifd); } void
Re: dwge(4) fixed-link support
On Wed, Jul 05, 2023 at 01:13:34PM +0200, Mark Kettenis wrote: > > Date: Wed, 5 Jul 2023 12:46:36 +0300 > > From: Jonathan Matthew > > > > On the Banana Pi R1 (aka Lamobo R1), the dwge interface on the soc is > > connected to a broadcom switch chip. It looks like this in the device > > tree: > > > > &gmac { > > pinctrl-names = "default"; > > pinctrl-0 = <&gmac_rgmii_pins>; > > phy-mode = "rgmii"; > > phy-supply = <&reg_gmac_3v3>; > > status = "okay"; > > > > fixed-link { > > speed = <1000>; > > full-duplex; > > }; > > > > mdio { > > ... > > } > > }; > > > > This diff makes the fixed-link part work, setting the interface's link > > state to up and the media type to IFM_1000_T|IFM_FDX instead of trying to > > attach a phy. After setting the media type, we need to call mii_statchg() > > to configure the MAC appropriately. > > > > ok? > > Is there a reason why you structured this differently than how this is > done for dwqe(4)? I did this quite a while ago, so I'm not completely sure, but I think what happened is that I based it on an earlier version of the dwqe diff. 
Here's a new version that looks a lot more like how dwqe does it: Index: if_dwge.c === RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v retrieving revision 1.16 diff -u -p -r1.16 if_dwge.c --- if_dwge.c 25 Jun 2023 22:36:09 - 1.16 +++ if_dwge.c 5 Jul 2023 13:03:47 - @@ -271,6 +271,7 @@ struct dwge_softc { #define sc_lladdr sc_ac.ac_enaddr struct mii_data sc_mii; #define sc_media sc_mii.mii_media + uint64_tsc_fixed_media; int sc_link; int sc_phyloc; int sc_force_thresh_dma_mode; @@ -386,7 +387,7 @@ dwge_attach(struct device *parent, struc { struct dwge_softc *sc = (void *)self; struct fdt_attach_args *faa = aux; - struct ifnet *ifp; + struct ifnet *ifp = &sc->sc_ac.ac_if; uint32_t phy, phy_supply; uint32_t axi_config; uint32_t mode, pbl; @@ -457,6 +458,30 @@ dwge_attach(struct device *parent, struc /* Reset PHY */ dwge_reset_phy(sc); + node = OF_getnodebyname(faa->fa_node, "fixed-link"); + if (node) { + ifp->if_baudrate = IF_Mbps(OF_getpropint(node, "speed", 0)); + + switch (OF_getpropint(node, "speed", 0)) { + case 1000: + sc->sc_fixed_media = IFM_ETHER | IFM_1000_T; + break; + case 100: + sc->sc_fixed_media = IFM_ETHER | IFM_100_TX; + break; + default: + sc->sc_fixed_media = IFM_ETHER | IFM_AUTO; + break; + } + + if (OF_getpropbool(node, "full-duplex")) { + ifp->if_link_state = LINK_STATE_FULL_DUPLEX; + sc->sc_fixed_media |= IFM_FDX; + } else { + ifp->if_link_state = LINK_STATE_UP; + } + } + sc->sc_clk = clock_get_frequency(faa->fa_node, "stmmaceth"); if (sc->sc_clk > 25000) sc->sc_clk = GMAC_GMII_ADDR_CR_DIV_124; @@ -479,7 +504,6 @@ dwge_attach(struct device *parent, struc timeout_set(&sc->sc_tick, dwge_tick, sc); timeout_set(&sc->sc_rxto, dwge_rxtick, sc); - ifp = &sc->sc_ac.ac_if; ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_xflags = IFXF_MPSAFE; @@ -576,14 +600,23 @@ dwge_attach(struct device *parent, struc dwge_write(sc, GMAC_AXI_BUS_MODE, mode); } - mii_attach(self, &sc->sc_mii, 0x, sc->sc_phyloc, - (sc->sc_phyloc == 
MII_PHY_ANY) ? 0 : MII_OFFSET_ANY, 0); - if (LIST_FIRST(&sc->sc_mii.mii_phys) == NULL) { - printf("%s: no PHY found!\n", sc->sc_dev.dv_xname); - ifmedia_add(&sc->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL); - ifmedia_set(&sc->sc_media, IFM_ETHER|IFM_MANUAL); - } else - ifmedia_set(&sc->sc_media, IFM_ETHER|IFM_AUTO); + if (sc->sc_fixed_media == 0) { + mii_attach(self, &sc->sc_mii, 0x, sc->sc_phyloc, + (sc->sc_phyloc == MII_PHY_ANY) ? 0
dwge(4) fixed-link support
On the Banana Pi R1 (aka Lamobo R1), the dwge interface on the soc is connected to a broadcom switch chip. It looks like this in the device tree: &gmac { pinctrl-names = "default"; pinctrl-0 = <&gmac_rgmii_pins>; phy-mode = "rgmii"; phy-supply = <&reg_gmac_3v3>; status = "okay"; fixed-link { speed = <1000>; full-duplex; }; mdio { ... } }; This diff makes the fixed-link part work, setting the interface's link state to up and the media type to IFM_1000_T|IFM_FDX instead of trying to attach a phy. After setting the media type, we need to call mii_statchg() to configure the MAC appropriately. ok? Index: if_dwge.c === RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v retrieving revision 1.16 diff -u -p -r1.16 if_dwge.c --- if_dwge.c 25 Jun 2023 22:36:09 - 1.16 +++ if_dwge.c 5 Jul 2023 09:16:41 - @@ -271,6 +271,7 @@ struct dwge_softc { #define sc_lladdr sc_ac.ac_enaddr struct mii_data sc_mii; #define sc_media sc_mii.mii_media + uint64_tsc_fixed_media; int sc_link; int sc_phyloc; int sc_force_thresh_dma_mode; @@ -386,7 +387,7 @@ dwge_attach(struct device *parent, struc { struct dwge_softc *sc = (void *)self; struct fdt_attach_args *faa = aux; - struct ifnet *ifp; + struct ifnet *ifp = &sc->sc_ac.ac_if; uint32_t phy, phy_supply; uint32_t axi_config; uint32_t mode, pbl; @@ -403,16 +404,6 @@ dwge_attach(struct device *parent, struc } sc->sc_dmat = faa->fa_dmat; - /* Lookup PHY. */ - phy = OF_getpropint(faa->fa_node, "phy", 0); - if (phy == 0) - phy = OF_getpropint(faa->fa_node, "phy-handle", 0); - node = OF_getnodebyphandle(phy); - if (node) - sc->sc_phyloc = OF_getpropint(node, "reg", MII_PHY_ANY); - else - sc->sc_phyloc = MII_PHY_ANY; - pinctrl_byname(faa->fa_node, "default"); /* Enable clocks. */ @@ -449,13 +440,48 @@ dwge_attach(struct device *parent, struc if (OF_is_compatible(faa->fa_node, "starfive,jh7100-gmac")) sc->sc_defrag = 1; - /* Power up PHY. 
*/ - phy_supply = OF_getpropint(faa->fa_node, "phy-supply", 0); - if (phy_supply) - regulator_enable(phy_supply); + node = OF_getnodebyname(faa->fa_node, "fixed-link"); + if (node == 0) { + /* Lookup PHY. */ + phy = OF_getpropint(faa->fa_node, "phy", 0); + if (phy == 0) + phy = OF_getpropint(faa->fa_node, "phy-handle", 0); + node = OF_getnodebyphandle(phy); + if (node) + sc->sc_phyloc = OF_getpropint(node, "reg", MII_PHY_ANY); + else + sc->sc_phyloc = MII_PHY_ANY; + + /* Power up PHY. */ + phy_supply = OF_getpropint(faa->fa_node, "phy-supply", 0); + if (phy_supply) + regulator_enable(phy_supply); - /* Reset PHY */ - dwge_reset_phy(sc); + /* Reset PHY */ + dwge_reset_phy(sc); + } else { + ifp->if_baudrate = IF_Mbps(OF_getpropint(node, + "speed", 0)); + + switch (OF_getpropint(node, "speed", 0)) { + case 1000: + sc->sc_fixed_media = IFM_ETHER | IFM_1000_T; + break; + case 100: + sc->sc_fixed_media = IFM_ETHER | IFM_100_TX; + break; + default: + sc->sc_fixed_media = IFM_ETHER | IFM_AUTO; + break; + } + + if (OF_getpropbool(node, "full-duplex")) { + ifp->if_link_state = LINK_STATE_FULL_DUPLEX; + sc->sc_fixed_media |= IFM_FDX; + } else { + ifp->if_link_state = LINK_STATE_UP; + } + } sc->sc_clk = clock_get_frequency(faa->fa_node, "stmmaceth"); if (sc->sc_clk > 25000) @@ -479,7 +505,6 @@ dwge_attach(struct device *parent, struc timeout_set(&sc->sc_tick, dwge_tick, sc); timeout_set(&sc->sc_rxto, dwge_rxtick, sc); - ifp = &sc->sc_ac.ac_if; ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_xflags = IFXF_MPSAFE; @@ -576,14 +601,21 @@ dwge_attach(struct device *parent, struc dwge_write(sc, GMAC_AXI_BUS_MODE, mode); } - mii_attach(self, &sc->sc_mii, 0x, sc->sc_phyloc, - (sc->sc_phyloc == MII_PHY_ANY) ? 0 : MII_OFFSET_ANY, 0); - if (LIST_FIRST(&sc->sc_mii.
Re: cksum remove redundant code
ok jmatthew@ On Tue, Jul 04, 2023 at 12:20:32PM +0300, Alexander Bluhm wrote: > anyone? > > On Fri, May 26, 2023 at 06:44:25PM +0200, Alexander Bluhm wrote: > > Hi, > > > > in_ifcap_cksum() checks ifp == NULL > > in_hdr_cksum_out() sets ip_sum = 0 > > in_proto_cksum_out() and in6_proto_cksum_out() always write > > th_sum if M_TCP_CSUM_OUT is set and proto is IPPROTO_TCP. > > > > ok? > > > > bluhm > > > > Index: netinet/ip_output.c > > === > > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v > > retrieving revision 1.388 > > diff -u -p -r1.388 ip_output.c > > --- netinet/ip_output.c 22 May 2023 16:08:34 - 1.388 > > +++ netinet/ip_output.c 26 May 2023 11:55:49 - > > @@ -1801,7 +1801,7 @@ in_hdr_cksum_out(struct mbuf *m, struct > > struct ip *ip = mtod(m, struct ip *); > > > > ip->ip_sum = 0; > > - if (ifp && in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) { > > + if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) { > > SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); > > } else { > > ipstat_inc(ips_outswcsum); > > Index: netinet/tcp_output.c > > === > > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v > > retrieving revision 1.138 > > diff -u -p -r1.138 tcp_output.c > > --- netinet/tcp_output.c15 May 2023 16:34:56 - 1.138 > > +++ netinet/tcp_output.c26 May 2023 15:19:12 - > > @@ -1295,7 +1295,6 @@ tcp_chopper(struct mbuf *m0, struct mbuf > > > > /* copy and adjust IP header, calculate checksum */ > > SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); > > - mhth->th_sum = 0; > > if (ip) { > > struct ip *mhip; > > > > @@ -1328,10 +1327,8 @@ tcp_chopper(struct mbuf *m0, struct mbuf > > } > > /* adjust IP header, calculate checksum */ > > SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); > > - th->th_sum = 0; > > if (ip) { > > ip->ip_len = htons(m0->m_pkthdr.len); > > - ip->ip_sum = 0; > > in_hdr_cksum_out(m0, ifp); > > in_proto_cksum_out(m0, ifp); > > } >
bge(4) kstats
This adds kstats for the hardware counters available in bge(4) devices, BCM5705 and newer. The main complication is that some of the counters are already used in bge_stats_update_regs() as part of a hardware bug workaround, some are affected by hardware bugs themselves, and some are read to update interface counters. I decided to leave that as-is as much as possible. The main changes to bge_stats_update_regs() are to always read the outgoing ucast/mcast/bcast packet counters (instead of just when we're working around the RDMA bug) and to accumulate any counters read into the kstat buffer, so bge_kstat_read() doesn't have to touch them. All the hardware counters reset on read, so avoiding double handling keeps things simple. This means bge_stats_update_regs() also has to be called with bge_kstat_mtx held, so to decrease the number of '#if NKSTAT > 0' the mutex is compiled in even in kernels without kstat. On a lightly used machine that sees a lot of multicast and broadcast due to being near Windows desktops, the stats look like this: ok? 
bge0:0:bge-stats:0 out octets: 738725 bytes collisions: 0 xon sent: 0 xoff sent: 0 xmit errors: 0 coll frames: 0 packets multicoll frame: 0 packets deferred xmit: 0 excess coll: 0 late coll: 0 out ucast pkts: 1495 packets out mcast pkts: 0 packets out bcast pkts: 5 packets in octets: 10192782 bytes fragments: 0 in ucast pkts: 1736 packets in mcast pkts: 27251 packets in bcast pkts: 42984 packets FCS errors: 0 align errors: 0 xon rcvd: 0 xoff rcvd: 0 ctrlframes rcvd: 0 xoff entered: 0 too long frames: 0 jabbers: 0 too short pkts: 0 DMA RQ full: 0 DMA HPRQ full: 0 SDC queue full: 0 sendprod set: 0 stats updated: 0 irqs: 0 avoided irqs: 0 tx thresh hit: 0 filtdrop: 0 DMA WRQ full: 0 DMA HPWRQ full: 0 out of BDs: 10 if in drops: 0 if in errors: 0 rx thresh hit: 0 Index: if_bge.c === RCS file: /cvs/src/sys/dev/pci/if_bge.c,v retrieving revision 1.400 diff -u -p -u -p -r1.400 if_bge.c --- if_bge.c18 Jan 2023 23:31:37 - 1.400 +++ if_bge.c3 Jul 2023 06:09:42 - @@ -74,6 +74,7 @@ #include "bpfilter.h" #include "vlan.h" +#include "kstat.h" #include #include @@ -85,6 +86,7 @@ #include #include #include +#include #include #include @@ -203,6 +205,58 @@ void bge_ape_unlock(struct bge_softc *, void bge_ape_send_event(struct bge_softc *, uint32_t); void bge_ape_driver_state_change(struct bge_softc *, int); +#if NKSTAT > 0 +void bge_kstat_attach(struct bge_softc *); + +enum { + bge_stat_out_octets = 0, + bge_stat_collisions, + bge_stat_xon_sent, + bge_stat_xoff_sent, + bge_stat_xmit_errors, + bge_stat_coll_frames, + bge_stat_multicoll_frames, + bge_stat_deferred_xmit, + bge_stat_excess_coll, + bge_stat_late_coll, + bge_stat_out_ucast_pkt, + bge_stat_out_mcast_pkt, + bge_stat_out_bcast_pkt, + bge_stat_in_octets, + bge_stat_fragments, + bge_stat_in_ucast_pkt, + bge_stat_in_mcast_pkt, + bge_stat_in_bcast_pkt, + bge_stat_fcs_errors, + bge_stat_align_errors, + bge_stat_xon_rcvd, + bge_stat_xoff_rcvd, + bge_stat_ctrl_frame_rcvd, + bge_stat_xoff_entered, + bge_stat_too_long_frames, + 
bge_stat_jabbers, + bge_stat_too_short_pkts, + + bge_stat_dma_rq_full, + bge_stat_dma_hprq_full, + bge_stat_sdc_queue_full, + bge_stat_nic_sendprod_set, + bge_stat_status_updated, + bge_stat_irqs, + bge_stat_avoided_irqs, + bge_stat_tx_thresh_hit, + + bge_stat_filtdrop, + bge_stat_dma_wrq_full, + bge_stat_dma_hpwrq_full, + bge_stat_out_of_bds, + bge_stat_if_in_drops, + bge_stat_if_in_errors, + bge_stat_rx_thresh_hit, +}; + +#endif + #ifdef BGE_DEBUG #define DPRINTF(x) do { if (bgedebug) printf x; } while (0) #define DPRINTFN(n,x) do { if (bgedebug >= (n)) printf x; } while (0) @@ -2993,6 +3047,12 @@ bge_attach(struct device *parent, struct else sc->bge_return_ring_cnt = BGE_RETURN_RING_CNT_5705; + mtx_init(&sc->bge_kstat_mtx, IPL_SOFTCLOCK); +#if NKSTAT > 0 + if (BGE_IS_5705_PLUS(sc)) + bge_kstat_attach(sc); +#endif + /* Set up ifnet structure */ ifp = &sc->arpcom.ac_if; ifp->if_softc = sc; @@ -3767,9 +3827,11 @@ bge_tick(void *xsc) s = splnet(); - if (BGE_IS_5705_PLUS(sc)) + if (BGE_IS_5705_PLUS(sc)) { + mtx_enter(&sc->bge_kstat_mtx); bge_stats_update_regs(sc); - else + mtx_leave(&sc->bge_kstat_mtx); + } else bge_stats_update(sc); if (sc->bge_flags & BGE_FIBER_TBI) { @@ -37
dwge(4) kstats
This adds kstats for hardware counters available in (some) dwge devices. If the counters are not present, as in Allwinner A20 devices among others, they just read 0. On an RK3399 device, they do exist, and they yield something like this: rp64$ kstat dwge0::dwge-stats: dwge0:0:dwge-stats:0 tx octets total: 6152530 bytes tx frames total: 7897 packets tx underflow: 0 packets tx carrier err: 0 packets tx good octets: 6152530 bytes tx good frames: 7897 packets rx frames total: 7685 packets rx octets total: 960726 bytes rx good octets: 960726 bytes rx good mcast: 0 packets rx crc errors: 0 packets rx len errors: 0 packets rx fifo err: 0 packets The counters are all 32 bit, so to avoid overflow, we set them to reset on read, accumulate the values read into 64 bit counters, and read all the counters when an MMC interrupt occurs, which happens when one or more counters reach 0x800. Writing all ones to GMAC_MMC_RX_INT_MSK and GMAC_MMC_TX_INT_MSK masks the MMC interrupts, so the diff removes those. ok? 
Index: if_dwge.c === RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v retrieving revision 1.15 diff -u -p -r1.15 if_dwge.c --- if_dwge.c 26 Feb 2023 13:28:12 - 1.15 +++ if_dwge.c 16 Jun 2023 06:50:58 - @@ -21,6 +21,7 @@ */ #include "bpfilter.h" +#include "kstat.h" #include #include @@ -54,6 +55,10 @@ #include #endif +#if NKSTAT > 0 +#include +#endif + #include #include @@ -97,8 +102,24 @@ #define GMAC_INT_MASK_RIM (1 << 0) #define GMAC_MAC_ADDR0_HI 0x0040 #define GMAC_MAC_ADDR0_LO 0x0044 +#define GMAC_MAC_MMC_CTRL 0x0100 +#define GMAC_MAC_MMC_CTRL_ROR (1 << 2) +#define GMAC_MAC_MMC_CTRL_CR (1 << 0) #define GMAC_MMC_RX_INT_MSK0x010c #define GMAC_MMC_TX_INT_MSK0x0110 +#define GMAC_MMC_TXOCTETCNT_GB 0x0114 +#define GMAC_MMC_TXFRMCNT_GB 0x0118 +#define GMAC_MMC_TXUNDFLWERR 0x0148 +#define GMAC_MMC_TXCARERR 0x0160 +#define GMAC_MMC_TXOCTETCNT_G 0x0164 +#define GMAC_MMC_TXFRMCNT_G0x0168 +#define GMAC_MMC_RXFRMCNT_GB 0x0180 +#define GMAC_MMC_RXOCTETCNT_GB 0x0184 +#define GMAC_MMC_RXOCTETCNT_G 0x0188 +#define GMAC_MMC_RXMCFRMCNT_G 0x0190 +#define GMAC_MMC_RXCRCERR 0x0194 +#define GMAC_MMC_RXLENERR 0x01c8 +#define GMAC_MMC_RXFIFOOVRFLW 0x01d4 #define GMAC_MMC_IPC_INT_MSK 0x0200 #define GMAC_BUS_MODE 0x1000 #define GMAC_BUS_MODE_8XPBL (1 << 24) @@ -113,6 +134,7 @@ #define GMAC_RX_DESC_LIST_ADDR 0x100c #define GMAC_TX_DESC_LIST_ADDR 0x1010 #define GMAC_STATUS0x1014 +#define GMAC_STATUS_MMC (1 << 27) #define GMAC_STATUS_RI(1 << 6) #define GMAC_STATUS_TU(1 << 2) #define GMAC_STATUS_TI(1 << 0) @@ -277,6 +299,11 @@ struct dwge_softc { uint32_tsc_clk_sel_125; uint32_tsc_clk_sel_25; uint32_tsc_clk_sel_2_5; + +#if NKSTAT > 0 + struct mutexsc_kstat_mtx; + struct kstat*sc_kstat; +#endif }; #define DEVNAME(_s)((_s)->sc_dev.dv_xname) @@ -334,6 +361,11 @@ void dwge_dmamem_free(struct dwge_softc struct mbuf *dwge_alloc_mbuf(struct dwge_softc *, bus_dmamap_t); void dwge_fill_rx_ring(struct dwge_softc *); +#if NKSTAT > 0 +intdwge_kstat_read(struct kstat *); +void dwge_kstat_attach(struct 
dwge_softc *); +#endif + int dwge_match(struct device *parent, void *cfdata, void *aux) { @@ -555,13 +587,14 @@ dwge_attach(struct device *parent, struc if_attach(ifp); ether_ifattach(ifp); +#if NKSTAT > 0 + dwge_kstat_attach(sc); +#endif /* Disable interrupts. */ dwge_write(sc, GMAC_INT_ENA, 0); dwge_write(sc, GMAC_INT_MASK, GMAC_INT_MASK_LPIIM | GMAC_INT_MASK_PIM | GMAC_INT_MASK_RIM); - dwge_write(sc, GMAC_MMC_RX_INT_MSK, 0xffffffff); - dwge_write(sc, GMAC_MMC_TX_INT_MSK, 0xffffffff); dwge_write(sc, GMAC_MMC_IPC_INT_MSK, 0xffffffff); sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE, @@ -921,6 +954,14 @@ dwge_intr(void *arg) reg & GMAC_STATUS_TU) dwge_tx_proc(sc); +#if NKSTAT > 0 + if (reg & GMAC_STATUS_MMC) { + mtx_enter(&sc->sc_kstat_mtx); + dwge_kstat_read(sc->sc_kstat); + mtx_leave(&sc->sc_kstat_mtx); + } +#endif + return (1); } @@ -1660,3 +1701,77 @@ dwge_mii_statchg_rockchip(struct device regmap_write_4(rm, sc->sc_clk_sel, gmac_clk_sel); } + +#if NKSTAT > 0 + +struct dwge_counter { + const char *c_name; + enum kstat_kv_unit c_unit; + uint32_tc_reg; +}; + +const struct dwge_counter dwge_counters[] = { + { "tx octets total", KSTAT_KV_U_BYTES, GMAC_MMC_TXOCTETCNT_GB }, + { "tx frames total", KSTAT_KV_U_PACKET
ypldap: try servers until one succeeds
We sometimes run into situations where one of the three servers a ypldap can talk to will accept a TCP connection but won't do TLS properly, or won't perform LDAP searches. ypldap currently only tries servers until one accepts the connection, so when this happens, it is less successful at updating than it could be. The diff below adjusts the ldap update code so it tries servers until it either successfully queries one or it runs out of addresses to try. If a server breaks after returning partial results, the ldap process will still send what it got to the main process. If the ldap process then gets full results from another server, those will overwrite the partial results, and if it doesn't, the main process will discard the partial results when it gets a 'trash update' message from the ldap process. While here, the diff also adds the server address to log messages about servers not working, so it's easier to figure out what's going wrong. ok? Index: ldapclient.c === RCS file: /cvs/src/usr.sbin/ypldap/ldapclient.c,v retrieving revision 1.46 diff -u -p -r1.46 ldapclient.c --- ldapclient.c13 Oct 2022 04:55:33 - 1.46 +++ ldapclient.c3 Feb 2023 03:58:17 - @@ -53,50 +53,10 @@ int client_build_req(struct idm *, struc int, int); intclient_search_idm(struct env *, struct idm *, struct aldap *, char **, char *, int, int, enum imsg_type); -intclient_try_idm(struct env *, struct idm *); +intclient_try_idm(struct env *, struct idm *, struct ypldap_addr *); void client_addr_init(struct idm *); intclient_addr_free(struct idm *); -struct aldap *client_aldap_open(struct ypldap_addr_list *); - -/* - * dummy wrapper to provide aldap_init with its fd's. 
- */ -struct aldap * -client_aldap_open(struct ypldap_addr_list *addr) -{ - int fd = -1; - struct ypldap_addr *p; - struct aldap*al; - - TAILQ_FOREACH(p, addr, next) { - char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; - struct sockaddr *sa = (struct sockaddr *)&p->ss; - - if (getnameinfo(sa, SA_LEN(sa), hbuf, sizeof(hbuf), sbuf, - sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV)) - errx(1, "could not get numeric hostname"); - - if ((fd = socket(sa->sa_family, SOCK_STREAM, 0)) == -1) - return NULL; - - if (connect(fd, sa, SA_LEN(sa)) == 0) - break; - - log_warn("connect to %s port %s failed", hbuf, sbuf); - close(fd); - fd = -1; - } - - if (fd == -1) - return NULL; - - al = aldap_init(fd); - if (al == NULL) - close(fd); - return al; -} - void client_addr_init(struct idm *idm) { @@ -241,8 +201,12 @@ client_dispatch_dns(int fd, short events } TAILQ_FOREACH(idm, &env->sc_idms, idm_entry) { - if (client_try_idm(env, idm) == -1) - idm->idm_state = STATE_LDAP_FAIL; + TAILQ_FOREACH(h, &idm->idm_addr, next) { + if (client_try_idm(env, idm, h) == -1) + idm->idm_state = STATE_LDAP_FAIL; + else + break; + } if (idm->idm_state < STATE_LDAP_DONE) wait_cnt++; @@ -585,17 +549,36 @@ fail: } int -client_try_idm(struct env *env, struct idm *idm) +client_try_idm(struct env *env, struct idm *idm, struct ypldap_addr *addr) { const char *where; + char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; char*attrs[ATTR_MAX+1]; + int fd = -1; int i, j; + struct sockaddr *sa = (struct sockaddr *)&addr->ss; struct aldap_message*m; struct aldap*al; + if (getnameinfo(sa, SA_LEN(sa), hbuf, sizeof(hbuf), sbuf, + sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV)) + errx(1, "could not get numeric hostname"); + where = "connect"; - if ((al = client_aldap_open(&idm->idm_addr)) == NULL) + if ((fd = socket(sa->sa_family, SOCK_STREAM, 0)) == -1) + return (-1); + + if (connect(fd, sa, SA_LEN(sa)) != 0) { + log_warn("connect to %s port %s failed", hbuf, sbuf); + close(fd); + return (-1); + } + + al = aldap_init(fd); + if (al == 
NULL) { + close(fd); return (-1); + } if (idm->idm_flags & F_STARTTLS) { log_debug("requesting starttls"); @@ -625,8 +608,8 @@ client_try_idm(struct env *env, struct i if (aldap_tls(al, idm->idm_tls_config, idm->i
mvsw phy-mode support
On the Turris Omnia, the host can't transmit over the interface linked to the switch unless mvsw applies phy-mode settings to the port on its side, specifically the rgmii delay settings. ok? Index: mvsw.c === RCS file: /cvs/src/sys/dev/fdt/mvsw.c,v retrieving revision 1.5 diff -u -p -r1.5 mvsw.c --- mvsw.c 6 Apr 2022 18:59:28 - 1.5 +++ mvsw.c 9 Apr 2023 04:19:21 - @@ -52,6 +52,10 @@ #define MVSW_PORT(x) (0x10 + (x)) #define MVSW_G20x1c +#define MVSW_PORT_MAC_CTL 0x01 +#define MVSW_PORT_MAC_CTL_RGMII_RXID 0x8000 +#define MVSW_PORT_MAC_CTL_RGMII_TXID 0x4000 +#define MVSW_PORT_MAC_CTL_RGMII_MASK 0xc000 #define MVSW_PORT_SWITCHID 0x03 #define MVSW_PORT_SWITCHID_PROD_MASK 0xfff0 #define MVSW_PORT_SWITCHID_PROD_88E6141 0x3400 @@ -70,6 +74,17 @@ /* XXX #include */ #define MDIO_MMD_PHYXS 4 +const struct { +const char *name; +uint16_tmac_ctl; +} mvsw_phy_modes[] = { +{ "rgmii", 0 }, +{ "rgmii-id", MVSW_PORT_MAC_CTL_RGMII_TXID | +MVSW_PORT_MAC_CTL_RGMII_RXID }, +{ "rgmii-rxid", MVSW_PORT_MAC_CTL_RGMII_RXID }, +{ "rgmii-txid", MVSW_PORT_MAC_CTL_RGMII_TXID } +}; + struct mvsw_softc { struct device sc_dev; @@ -310,12 +325,27 @@ mvsw_serdes_write(struct mvsw_softc *sc, void mvsw_port_enable(struct mvsw_softc *sc, int node) { + char phy_mode[16] = { 0 }; uint16_t val; - int port; + int port, i; port = OF_getpropint(node, "reg", -1); if (port == -1) return; + + OF_getprop(node, "phy-mode", phy_mode, sizeof(phy_mode)); + for (i = 0; i < nitems(mvsw_phy_modes); i++) { + if (strcmp(phy_mode, mvsw_phy_modes[i].name) == 0) { + val = mvsw_smi_read(sc, MVSW_PORT(port), + MVSW_PORT_MAC_CTL); + val &= ~MVSW_PORT_MAC_CTL_RGMII_MASK; + val |= mvsw_phy_modes[i].mac_ctl; + mvsw_smi_write(sc, MVSW_PORT(port), + MVSW_PORT_MAC_CTL, val); + + break; + } + } /* Enable port. */ val = mvsw_smi_read(sc, MVSW_PORT(port), MVSW_PORT_CTRL);
ypldap: reduce imsg traffic
On systems where we pull in around 100k users from ldap, ypldap uses a fair bit of memory (over 300MB peak) moving data from the ldapclient process to the main process. The ldapclient process sends each user and group record to the parent process in instances of struct idm_req, which includes a 1kB buffer for the user/group details. It currently sends the full struct through imsg, but only sending the used portion of the 1kB buffer reduces peak memory usage to around 100MB, and it turns out it's pretty easy, as in the diff below. ok? Index: ldapclient.c === RCS file: /cvs/src/usr.sbin/ypldap/ldapclient.c,v retrieving revision 1.46 diff -u -p -r1.46 ldapclient.c --- ldapclient.c13 Oct 2022 04:55:33 - 1.46 +++ ldapclient.c27 Mar 2023 04:19:53 - @@ -567,7 +567,8 @@ client_search_idm(struct env *env, struc if (client_build_req(idm, &ir, m, min_attr, max_attr) == 0) imsg_compose_event(env->sc_iev, type, 0, 0, -1, - &ir, sizeof(ir)); + &ir, sizeof(ir.ir_key) + + strlen(ir.ir_line) + 1); aldap_freemsg(m); } Index: ypldap.c === RCS file: /cvs/src/usr.sbin/ypldap/ypldap.c,v retrieving revision 1.23 diff -u -p -r1.23 ypldap.c --- ypldap.c22 Aug 2022 08:02:02 - 1.23 +++ ypldap.c27 Mar 2023 04:19:53 - @@ -392,7 +392,7 @@ main_dispatch_client(int fd, short event if (env->update_trashed) break; - (void)memcpy(&ir, imsg.data, sizeof(ir)); + (void)memcpy(&ir, imsg.data, n - IMSG_HEADER_SIZE); if ((ue = calloc(1, sizeof(*ue))) == NULL || (ue->ue_line = strdup(ir.ir_line)) == NULL) { /* @@ -418,7 +418,7 @@ main_dispatch_client(int fd, short event if (env->update_trashed) break; - (void)memcpy(&ir, imsg.data, sizeof(ir)); + (void)memcpy(&ir, imsg.data, n - IMSG_HEADER_SIZE); if ((ge = calloc(1, sizeof(*ge))) == NULL || (ge->ge_line = strdup(ir.ir_line)) == NULL) { /*
Re: vmm: mask WAITPKG cpuid feature to hide TPAUSE
On Sun, Jan 08, 2023 at 03:09:44PM -0500, Dave Voutila wrote: > > Philip Guenther writes: > > > On Sat, Jan 7, 2023 at 11:04 AM Dave Voutila wrote: > > > > Bringing this to tech@ to increase my chance of someone testing my > > diff. > > > > As reported in this thread on misc@ [1], I believe newer Intel hardware > > may be experiencing issues hosting Linux guests under vmm/vmd. It looks > > like there are some newer instructions Intel added (TPAUSE specifically) > > that also involve some new MSR(s). > > > > I don't have 12th gen Intel hardware to test this on (I think that's > > Alder Lake). I'd like to mask this feature from vmm guests since it's > > related to an MSR we don't yet pass through or emulate and has to do > > with the TSC (which has its own challenges in vmm). > > > > For someone testing, you should be able to grab an Alpine Linux iso > > (-virt flavor) and boot it with vmd with the diff. (Without it should > > "hang" and spike CPU or just die.) Also check that WAITPKG shows up in > > your dmesg on the cpu feature output. > > > > This seems like it'll obviously work, but I guess it seems to me that this > > "opt-out" approach is generally > > unsafe/unstable and vmd should consider actively switching to "opt-in" on > > all these CPUID feature bits. I mean, > > what bits are defined in the SEFF first-leaf EDX that _do_ work with vmd? > > > > Great point (I think you mean ECX). Here's an updated diff that flips it > to a whitelist so Intel/AMD don't burn me with these new bits in the > future. This better? I tried this out on this cpu: cpu0: 12th Gen Intel(R) Core(TM) i7-12700, 4789.57 MHz, 06-97-02 and it works as advertised, alpine-virt-3.17.0-x86_64.iso boots up to the login prompt, which it doesn't do without the diff.
> > > diff refs/heads/master refs/heads/vmm-tsleep > commit - bfce157fda90a812e1a99aa179a4c42f12ebfa24 > commit + 5b434c89250e1901340c11c8f9c380dc18d0ae91 > blob - 001a437045be145322be30288c1f47d63fb07634 > blob + 0bd908e273a1c0e6324e1bc9f8c8ca921555c86f > --- sys/arch/amd64/amd64/identcpu.c > +++ sys/arch/amd64/amd64/identcpu.c > @@ -208,6 +208,7 @@ const struct { > { SEFF0ECX_AVX512VBMI, "AVX512VBMI" }, > { SEFF0ECX_UMIP,"UMIP" }, > { SEFF0ECX_PKU, "PKU" }, > + { SEFF0ECX_WAITPKG, "WAITPKG" }, > }, cpu_seff0_edxfeatures[] = { > { SEFF0EDX_AVX512_4FNNIW, "AVX512FNNIW" }, > { SEFF0EDX_AVX512_4FMAPS, "AVX512FMAPS" }, > blob - cbde6cf9b02fc882a8ed17aa6adb5c43249e0302 > blob + b26bd32e2d9ea7386b1f58960dea40b787d6a341 > --- sys/arch/amd64/include/specialreg.h > +++ sys/arch/amd64/include/specialreg.h > @@ -201,6 +201,7 @@ > #define SEFF0ECX_AVX512VBMI 0x0002 /* AVX-512 vector bit inst */ > #define SEFF0ECX_UMIP0x0004 /* UMIP support */ > #define SEFF0ECX_PKU 0x0008 /* Page prot keys for user mode */ > +#define SEFF0ECX_WAITPKG 0x0010 /* UMONITOR/UMWAIT/TPAUSE insns */ > /* SEFF EDX bits */ > #define SEFF0EDX_AVX512_4FNNIW 0x0004 /* AVX-512 neural network > insns */ > #define SEFF0EDX_AVX512_4FMAPS 0x0008 /* AVX-512 mult accum single > prec */ > blob - 6b4802abf4b508495cdbc961bd799d3fa83b9c36 > blob + 032444b05e19d7fbec96a0d11b5b340f668c0917 > --- sys/arch/amd64/include/vmmvar.h > +++ sys/arch/amd64/include/vmmvar.h > @@ -672,8 +672,10 @@ struct vm_mprotect_ept_params { > SEFF0EBX_AVX512IFMA | SEFF0EBX_AVX512PF | \ > SEFF0EBX_AVX512ER | SEFF0EBX_AVX512CD | \ > SEFF0EBX_AVX512BW | SEFF0EBX_AVX512VL) > -#define VMM_SEFF0ECX_MASK ~(SEFF0ECX_AVX512VBMI) > > +/* ECX mask contains the bits to include */ > +#define VMM_SEFF0ECX_MASK (SEFF0ECX_PREFETCHWT1 | SEFF0ECX_UMIP | > SEFF0ECX_PKU) > + > /* EDX mask contains the bits to include */ > #define VMM_SEFF0EDX_MASK (SEFF0EDX_MD_CLEAR) > > blob - 310208ac4cdb262aaedfa9b78d869fd5911607b2 > blob + 
ccf1164fd658a69dc383e1602ae0ce1f269de4e4 > --- sys/arch/i386/i386/machdep.c > +++ sys/arch/i386/i386/machdep.c > @@ -1038,6 +1038,7 @@ const struct cpu_cpuid_feature cpu_seff0_ecxfeatures[] > { SEFF0ECX_UMIP,"UMIP" }, > { SEFF0ECX_AVX512VBMI, "AVX512VBMI" }, > { SEFF0ECX_PKU, "PKU" }, > + { SEFF0ECX_WAITPKG, "WAITPKG" }, > }; > > const struct cpu_cpuid_feature cpu_seff0_edxfeatures[] = { > blob - 392b4ff412e2dd3c4c48ed6c9c84aa2358721c6a > blob + 7ce77ca3fdc6bd1a51571dd0b5dbf5afc311a138 > --- sys/arch/i386/include/specialreg.h > +++ sys/arch/i386/include/specialreg.h > @@ -190,6 +190,7 @@ > #define SEFF0ECX_AVX512VBMI 0x0002 /* AVX-512 vector bit inst */ > #define SEFF0ECX_UMIP0x0004 /* UMIP support */ > #define SEFF0ECX_PKU 0x0008 /* Page prot keys for user mode */ > +#define SEFF0ECX_WAITPKG 0x0010 /* UMONITOR/UMWAIT/TPAUSE insns */ > /* SEFF EDX bits */ > #define SEFF0EDX_AVX512_4FNNIW 0x0004 /* AVX-512 neural network > insns */ > #define SEFF0EDX_AVX512_4FMAPS
Re: Fix evcount_percpu() after evcount_init_percpu() (plus bits for mips64)
On Sun, Dec 04, 2022 at 02:31:41PM +, Visa Hankala wrote: > Do not re-insert the event counter to evcount_list in evcount_percpu(). > Otherwise the list becomes corrupt when evcount_percpu() is called > after evcount_init_percpu(). > > OK? clearly I never managed to test that path. oops. ok jmatthew@ > > As an extra, use percpu counters with mips64 clock and ipi interrupts. > > Index: kern/subr_evcount.c > === > RCS file: src/sys/kern/subr_evcount.c,v > retrieving revision 1.14 > diff -u -p -r1.14 subr_evcount.c > --- kern/subr_evcount.c 10 Nov 2022 07:05:41 - 1.14 > +++ kern/subr_evcount.c 4 Dec 2022 14:17:59 - > @@ -56,7 +56,6 @@ evcount_percpu(struct evcount *ec) > TAILQ_INSERT_TAIL(&evcount_percpu_init_list, ec, next); > } else { > ec->ec_percpu = counters_alloc(1); > - TAILQ_INSERT_TAIL(&evcount_list, ec, next); > } > } > > Index: arch/mips64/mips64/clock.c > === > RCS file: src/sys/arch/mips64/mips64/clock.c,v > retrieving revision 1.48 > diff -u -p -r1.48 clock.c > --- arch/mips64/mips64/clock.c19 Nov 2022 16:23:48 - 1.48 > +++ arch/mips64/mips64/clock.c4 Dec 2022 14:17:58 - > @@ -37,7 +37,6 @@ > #include > #include > #include > -#include > #include > #include > #include > @@ -100,6 +99,7 @@ clockattach(struct device *parent, struc >*/ > set_intr(INTPRI_CLOCK, CR_INT_5, cp0_int5); > evcount_attach(&cp0_clock_count, "clock", &cp0_clock_irq); > + evcount_percpu(&cp0_clock_count); > > /* try to avoid getting clock interrupts early */ > cp0_set_compare(cp0_get_count() - 1); > @@ -121,7 +121,7 @@ cp0_int5(uint32_t mask, struct trapframe > struct cpu_info *ci = curcpu(); > int s; > > - atomic_inc_long((unsigned long *)&cp0_clock_count.ec_count); > + evcount_inc(&cp0_clock_count); > > cp0_set_compare(cp0_get_count() - 1); /* clear INT5 */ > > Index: arch/mips64/mips64/ipifuncs.c > === > RCS file: src/sys/arch/mips64/mips64/ipifuncs.c,v > retrieving revision 1.25 > diff -u -p -r1.25 ipifuncs.c > --- arch/mips64/mips64/ipifuncs.c 10 Apr 2022 13:23:14 - 1.25 > +++ 
arch/mips64/mips64/ipifuncs.c 4 Dec 2022 14:17:58 - > @@ -84,6 +84,7 @@ mips64_ipi_init(void) > if (!cpuid) { > mtx_init(&smp_rv_mtx, IPL_HIGH); > evcount_attach(&ipi_count, "ipi", &ipi_irq); > + evcount_percpu(&ipi_count); > } > > hw_ipi_intr_clear(cpuid); > @@ -113,8 +114,7 @@ mips64_ipi_intr(void *arg) > for (bit = 0; bit < MIPS64_NIPIS; bit++) { > if (pending_ipis & (1UL << bit)) { > (*ipifuncs[bit])(); > - atomic_inc_long( > - (unsigned long *)&ipi_count.ec_count); > + evcount_inc(&ipi_count); > } > } > } >
acpimadt: ignore OEM-reserved apic structures
On a Dell R6515, acpimadt(4) prints this 512 times during boot: acpimadt0: unknown apic structure type 80 Previous generations of machines had a few of these, and they were easy enough to ignore, but 512 is a bit excessive. On further inspection, it seems types 0x80 through 0xFF are reserved for OEM specific uses, which we're never going to be able to work with, so complaining about it seems pointless. If we encounter a non-OEM type we don't know about, we should still report that though. ok? Index: acpimadt.c === RCS file: /cvs/src/sys/dev/acpi/acpimadt.c,v retrieving revision 1.38 diff -u -p -r1.38 acpimadt.c --- acpimadt.c 6 Apr 2022 18:59:27 - 1.38 +++ acpimadt.c 22 Nov 2022 03:58:00 - @@ -418,8 +418,11 @@ acpimadt_attach(struct device *parent, s break; default: - printf("%s: unknown apic structure type %x\n", - self->dv_xname, entry->madt_lapic.apic_type); + if (entry->madt_lapic.apic_type < ACPI_MADT_OEM_RSVD) { + printf("%s: unknown apic structure type %x\n", + self->dv_xname, + entry->madt_lapic.apic_type); + } } addr += entry->madt_lapic.length; Index: acpireg.h === RCS file: /cvs/src/sys/dev/acpi/acpireg.h,v retrieving revision 1.58 diff -u -p -r1.58 acpireg.h --- acpireg.h 9 Jan 2022 05:42:37 - 1.58 +++ acpireg.h 22 Nov 2022 03:58:01 - @@ -352,6 +352,8 @@ struct acpi_madt_x2apic_nmi { uint8_t reserved[3]; } __packed; +#define ACPI_MADT_OEM_RSVD 128 + union acpi_madt_entry { struct acpi_madt_lapic madt_lapic; struct acpi_madt_ioapic madt_ioapic;
ypldap TLS by default
While working on ypconnect(2), Theo suggested that ypldap(8) should not default to plaintext LDAP connections, since the data it's dealing with is pretty important to the security of the system. Here's a straightforward diff implementing that, defaulting to what was previously called 'tls' (STARTTLS on port 389), and adding a 'notls' option for plaintext. ok? other opinions on what this option should be called? Index: parse.y === RCS file: /cvs/src/usr.sbin/ypldap/parse.y,v retrieving revision 1.36 diff -u -p -u -p -r1.36 parse.y --- parse.y 13 Oct 2022 04:55:33 - 1.36 +++ parse.y 13 Oct 2022 08:13:21 - @@ -107,7 +107,7 @@ typedef struct { %token SERVER FILTER ATTRIBUTE BASEDN BINDDN GROUPDN BINDCRED MAPS CHANGE DOMAIN PROVIDE %token USER GROUP TO EXPIRE HOME SHELL GECOS UID GID INTERVAL %token PASSWD NAME FIXED LIST GROUPNAME GROUPPASSWD GROUPGID MAP -%token INCLUDE DIRECTORY CLASS PORT ERROR GROUPMEMBERS LDAPS TLS CAFILE +%token INCLUDE DIRECTORY CLASS PORT ERROR GROUPMEMBERS LDAPS TLS NOTLS CAFILE %token BIND LOCAL PORTMAP BINDEXT CERTFILE KEYFILE %token STRING %token NUMBER @@ -366,9 +366,10 @@ diropt : BINDDN STRING { } ; -ssl: /* empty */ { $$ = 0; } +ssl: /* empty */ { $$ = F_STARTTLS; } | LDAPS { $$ = F_SSL; } | TLS { $$ = F_STARTTLS; } + | NOTLS { $$ = 0; } ; directory : DIRECTORY STRING port ssl { @@ -556,6 +557,7 @@ lookup(char *s) { "map",MAP }, { "maps", MAPS }, { "name", NAME }, + { "notls", NOTLS }, { "passwd", PASSWD }, { "port", PORT }, { "portmap",PORTMAP }, Index: ypldap.conf.5 === RCS file: /cvs/src/usr.sbin/ypldap/ypldap.conf.5,v retrieving revision 1.28 diff -u -p -u -p -r1.28 ypldap.conf.5 --- ypldap.conf.5 13 Oct 2022 04:55:33 - 1.28 +++ ypldap.conf.5 13 Oct 2022 08:13:21 - @@ -119,15 +119,19 @@ directory are used to construct YP map e .Bl -tag -width Ds .It Ic directory Ar hostname Oo Ic port Ar port Oc Oo tls Oc Brq ... Defines a directory by hostname and optionally port number. 
-If the +The .Ar tls -argument is not specified, no transport-level security will be used. +argument specifies the transport-level security used for the connection. Valid options are: .Bl -tag -width Ds .It Ic tls -Use STARTTLS to negotiate TLS, by default on port 389. +Use STARTTLS to negotiate TLS on port 389 unless an alternate port is +specified. +This is the default. .It Ic ldaps -Connect with TLS enabled, by default on port 636. +Connect with TLS enabled on port 636 unless an alternate port is specified. +.It Ic notls +Connect with no transport-level security. .El .El .Pp
Re: memory barrier in counters_zero
On Sat, Sep 17, 2022 at 04:28:15PM +0200, Alexander Bluhm wrote: > Hi, > > Inspired by Taylor's talk at EuroBSDCon I think a memory barrier > in counters_zero() is missing. Reading uses two consumer barriers, > so writing should also have two. Will slides or notes from this talk be available at some point? I went looking but didn't find anything. > > Following code would have no barrier between writing generation > number and writing counters. > > counters_leave(); > counters_zero(); > > counters_leave() writes to generation number at the end, so > counters_zero() needs a barrier at the start. > > ok? This seems reasonable, and I don't see a reason not to add the barrier here after release. counters_zero() is currently unused, so it's a bit hard to reason about, but I think using it sensibly would involve a memory barrier between the call to counters_zero() and any other updates, either through a lock or a timeout/interrupt style barrier. > > bluhm > > Index: kern/subr_percpu.c > === > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/subr_percpu.c,v > retrieving revision 1.9 > diff -u -p -r1.9 subr_percpu.c > --- kern/subr_percpu.c10 Mar 2021 10:21:47 - 1.9 > +++ kern/subr_percpu.c17 Sep 2022 14:17:34 - > @@ -213,6 +213,7 @@ counters_zero(struct cpumem *cm, unsigne > unsigned int i; > > counters = cpumem_first(&cmi, cm); > + membar_producer(); > do { > for (i = 0; i < n; i++) > counters[i] = 0; >
ypldap client cert authentication
This adds client certificate authentication to ypldap(8). libtls makes the actual certificate part of this straightforward (I would still like it reviewed, though), but there are some LDAP complications. Depending on your LDAP server and how you connect to it (LDAPS on port 636 or LDAP+TLS on port 389), a client presenting a certificate might automatically be bound as the subject of the certificate, or it might not. If it's not, the client can do an LDAP bind operation using the SASL EXTERNAL mechanism to bind as the cert subject, and it can optionally specify an identity, which means the bind will fail if the cert subject doesn't match that identity. If the client didn't present a certificate, the bind will also fail (one would hope). For reference, with Active Directory, SASL EXTERNAL bind is required when using LDAP+TLS, but not when using LDAPS, and the client identity can be specified in the form of "dn:" followed by the expected cert subject DN. OpenLDAP doesn't seem to do automatic bind at all, so SASL EXTERNAL would always be required there, and it doesn't appear to support specifying the expected identity with the bind. The diff adds 'certfile' and 'keyfile' config directives for specifying the certificate to use, and a 'bindext' directive for enabling SASL EXTERNAL bind, optionally including the identity string. SASL EXTERNAL bind doesn't get enabled implicitly when you configure a client cert, because ypldap can't tell if it's required or supported by the server. It's also not an error to enable SASL EXTERNAL bind without a client cert, since you could be connecting through stunnel or something. To configure this in ypldap.conf, you'd do something like this: directory "ldap.example.com" tls { bindext "dn:CN=ypldap,OU=Accounts,DC=example,DC=com" certfile "/etc/ssl/ypldap-cert.pem" keyfile "/etc/ssl/private/ypldap-key.pem" ... } ok?
Index: aldap.c === RCS file: /cvs/src/usr.sbin/ypldap/aldap.c,v retrieving revision 1.48 diff -u -p -r1.48 aldap.c --- aldap.c 31 Mar 2022 09:06:55 - 1.48 +++ aldap.c 19 Sep 2022 11:47:13 - @@ -220,6 +220,40 @@ fail: } int +aldap_bind_sasl_external(struct aldap *ldap, char *bindid) +{ + struct ber_element *root = NULL, *elm; + + if ((root = ober_add_sequence(NULL)) == NULL) + goto fail; + + elm = ober_printf_elements(root, "d{tds{ts", ++ldap->msgid, + BER_CLASS_APP, LDAP_REQ_BIND, VERSION, "", + BER_CLASS_CONTEXT, LDAP_AUTH_SASL, LDAP_SASL_MECH_EXTERNAL); + if (bindid == NULL) + elm = ober_add_null(elm); + else + elm = ober_add_string(elm, bindid); + + if (elm == NULL) + goto fail; + + LDAP_DEBUG("aldap_bind_sasl_external", root); + + if (aldap_send(ldap, root) == -1) { + root = NULL; + goto fail; + } + return (ldap->msgid); +fail: + if (root != NULL) + ober_free_elements(root); + + ldap->err = ALDAP_ERR_OPERATION_FAILED; + return (-1); +} + +int aldap_unbind(struct aldap *ldap) { struct ber_element *root = NULL, *elm; Index: aldap.h === RCS file: /cvs/src/usr.sbin/ypldap/aldap.h,v retrieving revision 1.14 diff -u -p -r1.14 aldap.h --- aldap.h 11 May 2019 17:46:02 - 1.14 +++ aldap.h 19 Sep 2022 11:47:13 - @@ -32,6 +32,8 @@ #define LDAP_PAGED_OID "1.2.840.113556.1.4.319" #define LDAP_STARTTLS_OID "1.3.6.1.4.1.1466.20037" +#define LDAP_SASL_MECH_EXTERNAL"EXTERNAL" + struct aldap { #define ALDAP_ERR_SUCCESS 0 #define ALDAP_ERR_PARSER_ERROR 1 @@ -137,6 +139,7 @@ enum deref_aliases { enum authentication_choice { LDAP_AUTH_SIMPLE= 0, + LDAP_AUTH_SASL = 3, }; enum scope { @@ -222,6 +225,7 @@ void aldap_freemsg(struct aldap_messa int aldap_req_starttls(struct aldap *); int aldap_bind(struct aldap *, char *, char *); +int aldap_bind_sasl_external(struct aldap *, char *); int aldap_unbind(struct aldap *); int aldap_search(struct aldap *, char *, enum scope, char *, char **, int, int, int, struct aldap_page_control *); int aldap_get_errno(struct aldap *, const char **); 
Index: ldapclient.c === RCS file: /cvs/src/usr.sbin/ypldap/ldapclient.c,v retrieving revision 1.45 diff -u -p -r1.45 ldapclient.c --- ldapclient.c22 Aug 2022 10:10:59 - 1.45 +++ ldapclient.c19 Sep 2022 11:47:13 - @@ -635,7 +635,11 @@ client_try_idm(struct env *env, struct i int rc; where = "binding"; - if (aldap_bind(al, idm->idm_binddn, idm->idm_bindcred) == -1) + if (idm->idm_bindext !=
Re: ure(4): add support for RTL8156B
On Thu, Mar 31, 2022 at 09:41:09PM +0800, Kevin Lo wrote: > Hi, > > > > This diff adds preliminary support for RTL8156B to ure(4) and > bug fixes for RTL8153/RTL8156. > > Tested: > ure0 at uhub0 port 12 configuration 1 interface 0 "Realtek USB 10/100/1G/2.5G > LAN" rev 3.20/31.00 addr 3 > ure0: RTL8156B (0x7410), address 00:e0:4c:xx:xx:xx Works OK here: ure0 at uhub0 port 2 configuration 1 interface 0 "Realtek USB 10/100 LAN" rev 2.10/20.00 addr 2 ure0: RTL8152 (0x4c00), address 00:e0:4c:xx:xx:xx rlphy0 at ure0 phy 0: RTL8201E 10/100 PHY, rev. 2 Regarding this part: > @@ -1914,7 +2026,7 @@ ure_rxeof(struct usbd_xfer *xfer, void * > total_len -= roundup(pktlen, URE_RX_BUF_ALIGN); > buf += sizeof(rxhdr); > > - m = m_devget(buf, pktlen, ETHER_ALIGN); > + m = m_devget(buf, pktlen - ETHER_CRC_LEN, ETHER_ALIGN); > if (m == NULL) { > DPRINTF(("unable to allocate mbuf for next packet\n")); > ifp->if_ierrors++; We tried this earlier (r1.22 of if_ure.c) and had to back it out because it didn't work on some devices. Have we worked out what the problem was there?
Re: fix very small ntpd leak
On Wed, Mar 23, 2022 at 04:59:06PM +0100, Otto Moerbeek wrote: > On Wed, Mar 23, 2022 at 09:09:01PM +1000, Jonathan Matthew wrote: > > > We noticed that the ntpd engine process was getting a bit big on some boxes > > that we'd accidentally cut off from the ntp servers (routing is hard). > > Reading through the code, I noticed the 'query' member of struct ntp_peer > > is never freed, which seems to account for the leak. > > > > If you have a server pool in ntpd.conf and it resolves, but ntpd is unable > > to talk to the servers, it will re-resolve periodically, freeing the old > > list > > of peers and creating new ones. > > > > To show how slow the leak is, here's the leak report from MALLOC_OPTIONS=D > > after running for about two hours with four servers from two pools. > > > > without diff: > > > > Leak report > > f sum #avg > >0x09392128 73 > > 0x889878b920b 512 1512 > > 0x889878bc8e14096 4 1024 > > 0x889878bd065 128 2 64 > > 0x88bc91f0b4b 18280 1 18280 > > 0x88bc926a9ed 65536 1 65536 > > > > > > with diff: > > > > Leak report > > f sum #avg > >0x06064 16379 > > 0xbee1253320b 512 1512 > > 0xbf0265f4b4b 18280 1 18280 > > 0xbf02666e9ed 65536 1 65536 > > > > ok? > > > > Index: ntp.c > > === > > RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v > > retrieving revision 1.168 > > diff -u -p -r1.168 ntp.c > > --- ntp.c 24 Oct 2021 21:24:19 - 1.168 > > +++ ntp.c 23 Mar 2022 10:43:59 - > > @@ -686,6 +686,7 @@ void > > peer_remove(struct ntp_peer *p) > > { > > TAILQ_REMOVE(&conf->ntp_peers, p, entry); > > + free(p->query); > > free(p); > > peer_cnt--; > > } > > > > This is a bug that dlg reported last week. Serendipity or not? :-) We found it together looking at systems we run at work, so not really. > > This is my diff that uses an approach I like a little bit better. I agree. I wasn't sure if there was a reason the query was allocated separately, so I went with the more straightforward diff to start with.
> > -Otto > > Index: client.c > === > RCS file: /cvs/src/usr.sbin/ntpd/client.c,v > retrieving revision 1.116 > diff -u -p -r1.116 client.c > --- client.c 21 Apr 2021 09:38:11 - 1.116 > +++ client.c 21 Mar 2022 07:31:54 - > @@ -51,10 +51,9 @@ set_deadline(struct ntp_peer *p, time_t > int > client_peer_init(struct ntp_peer *p) > { > - if ((p->query = calloc(1, sizeof(struct ntp_query))) == NULL) > - fatal("client_peer_init calloc"); > - p->query->fd = -1; > - p->query->msg.status = MODE_CLIENT | (NTP_VERSION << 3); > + p->query.fd = -1; > + p->query.msg.status = MODE_CLIENT | (NTP_VERSION << 3); > + p->query.xmttime = 0; > p->state = STATE_NONE; > p->shift = 0; > p->trustlevel = TRUSTLEVEL_PATHETIC; > @@ -91,7 +90,7 @@ client_addr_init(struct ntp_peer *p) > } > } > > - p->query->fd = -1; > + p->query.fd = -1; > set_next(p, 0); > > return (0); > @@ -100,9 +99,9 @@ client_addr_init(struct ntp_peer *p) > int > client_nextaddr(struct ntp_peer *p) > { > - if (p->query->fd != -1) { > - close(p->query->fd); > - p->query->fd = -1; > + if (p->query.fd != -1) { > + close(p->query.fd); > + p->query.fd = -1; > } > > if (p->state == STATE_DNS_INPROGRESS) > @@ -148,26 +147,26 @@ client_query(struct ntp_peer *p) > if (p->state < STATE_DNS_DONE || p->addr == NULL) > return (-1); > > - if (p->query->fd == -1) { > + if (p->query.fd == -1) { > struct sockaddr *sa = (struct sockaddr *)&p->addr->ss; > struct sockaddr *qa4 = (struct sockaddr *)&p->query_addr4; > struct sockaddr *qa6 = (struct sockaddr *)&p->query_addr6; > > - if ((p->query->fd = socket(p->addr->ss.ss_family, SOCK_DGRAM, > + if ((p->query.fd = socket(p->addr->ss.ss_family, SOCK_DGRAM, &
fix very small ntpd leak
We noticed that the ntpd engine process was getting a bit big on some boxes that we'd accidentally cut off from the ntp servers (routing is hard). Reading through the code, I noticed the 'query' member of struct ntp_peer is never freed, which seems to account for the leak. If you have a server pool in ntpd.conf and it resolves, but ntpd is unable to talk to the servers, it will re-resolve periodically, freeing the old list of peers and creating new ones. To show how slow the leak is, here's the leak report from MALLOC_OPTIONS=D after running for about two hours with four servers from two pools. without diff: Leak report f sum #avg 0x09392128 73 0x889878b920b 512 1512 0x889878bc8e14096 4 1024 0x889878bd065 128 2 64 0x88bc91f0b4b 18280 1 18280 0x88bc926a9ed 65536 1 65536 with diff: Leak report f sum #avg 0x06064 16379 0xbee1253320b 512 1512 0xbf0265f4b4b 18280 1 18280 0xbf02666e9ed 65536 1 65536 ok? Index: ntp.c === RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v retrieving revision 1.168 diff -u -p -r1.168 ntp.c --- ntp.c 24 Oct 2021 21:24:19 - 1.168 +++ ntp.c 23 Mar 2022 10:43:59 - @@ -686,6 +686,7 @@ void peer_remove(struct ntp_peer *p) { TAILQ_REMOVE(&conf->ntp_peers, p, entry); + free(p->query); free(p); peer_cnt--; }
Re: ping icmp ident collisions
On Fri, Feb 18, 2022 at 04:03:28PM +0100, Florian Obser wrote: > On 2022-02-18 12:17 +10, Jonathan Matthew wrote: > > The only thing ping uses to determine whether a received icmp echo reply > > packet is a > > response to one of its requests is the 16 bit icmp ident field. If you > > ping enough > > stuff at the same time, eventually you'll have two concurrent pings using > > the same ident, > > and they will both see each other's replies. Since we do tricky MAC stuff > > on the ping > > payload, this results in signature mismatches that look like this: > > > > PING 172.23.94.210 (172.23.94.210): 56 data bytes > > 64 bytes from 172.23.94.210: icmp_seq=0 ttl=253 time=0.820 ms > > 64 bytes from 172.23.94.210: icmp_seq=1 ttl=253 time=0.419 ms > > 64 bytes from 172.23.94.210: icmp_seq=2 ttl=253 time=0.369 ms > > signature mismatch! > > 64 bytes from 172.23.94.210: icmp_seq=3 ttl=253 time=0.273 ms > > > > --- 172.23.94.210 ping statistics --- > > 4 packets transmitted, 5 packets received, -- somebody's duplicating > > packets! > > round-trip min/avg/max/std-dev = 0.273/0.376/0.820/0.265 ms > > > > ping is counting the packet with the signature mismatch as a reply it > > received, and it > > prints a misleading message about duplicated packets because it got more > > replies than > > the number of requests it sent. > > > > I think it would be more helpful not to count signature mismatch packets as > > replies. > > If you're actually getting corrupted replies, I'd say that's more like > > packet loss > > than normal operation. If you're getting extra replies due to ident > > collisions, this > > will result in ping sending and receiving the expected number of packets. > > > > Printing the source address and sequence number on signature mismatches > > would also help. > > I would have figured this out much quicker had ping told me the mismatch > > packets were > > from a completely different source. 
For example: > > > > PING 172.23.94.210 (172.23.94.210): 56 data bytes > > 64 bytes from 172.23.94.210: icmp_seq=0 ttl=253 time=2.645 ms > > 64 bytes from 172.23.94.210: icmp_seq=1 ttl=253 time=1.360 ms > > 64 bytes from 172.23.94.210: icmp_seq=2 ttl=253 time=0.506 ms > > 64 bytes from 172.23.94.210: icmp_seq=3 ttl=253 time=0.615 ms > > signature mismatch from 10.138.79.45: icmp_seq=0 > > 64 bytes from 172.23.94.210: icmp_seq=4 ttl=253 time=0.431 ms > > > > --- 172.23.94.210 ping statistics --- > > 5 packets transmitted, 5 packets received, 0.0% packet loss > > round-trip min/avg/max/std-dev = 0.431/1.111/2.645/0.835 ms > > > > ok? > > OK florian > > I think we can go further and also check the from address in the echo > reply case, like this. > > If something on the path is so confused as to answer to our pings with > the wrong source address I think it's tcpdump time... > > Feel free to put this in at the same time if you agree. I considered doing this, but I think I'd rather have ping print out anything it sees with the same ident, as long as it doesn't get confused and mess up its statistics. > > diff --git sbin/ping/ping.c sbin/ping/ping.c > index 6fa634bca3e..e47baa8912c 100644 > --- sbin/ping/ping.c > +++ sbin/ping/ping.c > @@ -181,6 +181,9 @@ char *hostname; > int ident; /* random number to identify our packets */ > int v6flag; /* are we ping6? 
*/ > > +struct sockaddr_in dst4; > +struct sockaddr_in6 dst6; > + > /* counters */ > int64_t npackets;/* max packets to transmit */ > int64_t nreceived; /* # of packets we got back */ > @@ -243,8 +246,8 @@ main(int argc, char *argv[]) > struct addrinfo hints, *res; > struct itimerval itimer; > struct sockaddr *from, *dst; > - struct sockaddr_in from4, dst4; > - struct sockaddr_in6 from6, dst6; > + struct sockaddr_in from4; > + struct sockaddr_in6 from6; > struct cmsghdr *scmsg = NULL; > struct in6_pktinfo *pktinfo = NULL; > struct icmp6_filter filt; > @@ -1285,6 +1288,13 @@ pr_pack(u_char *buf, int cc, struct msghdr *mhdr) > } > > if (echo_reply) { > + if (v6flag) { > + if (memcmp(&dst6, from, sizeof(dst6)) != 0) > + return; /* 'Twas not our ECHO */ > + } else { > + if (memcmp(&dst4, from, sizeof(dst4)) != 0) > + return; /* 'Twas not our
ping icmp ident collisions
The only thing ping uses to determine whether a received icmp echo reply packet is a response to one of its requests is the 16 bit icmp ident field. If you ping enough stuff at the same time, eventually you'll have two concurrent pings using the same ident, and they will both see each other's replies. Since we do tricky MAC stuff on the ping payload, this results in signature mismatches that look like this: PING 172.23.94.210 (172.23.94.210): 56 data bytes 64 bytes from 172.23.94.210: icmp_seq=0 ttl=253 time=0.820 ms 64 bytes from 172.23.94.210: icmp_seq=1 ttl=253 time=0.419 ms 64 bytes from 172.23.94.210: icmp_seq=2 ttl=253 time=0.369 ms signature mismatch! 64 bytes from 172.23.94.210: icmp_seq=3 ttl=253 time=0.273 ms --- 172.23.94.210 ping statistics --- 4 packets transmitted, 5 packets received, -- somebody's duplicating packets! round-trip min/avg/max/std-dev = 0.273/0.376/0.820/0.265 ms ping is counting the packet with the signature mismatch as a reply it received, and it prints a misleading message about duplicated packets because it got more replies than the number of requests it sent. I think it would be more helpful not to count signature mismatch packets as replies. If you're actually getting corrupted replies, I'd say that's more like packet loss than normal operation. If you're getting extra replies due to ident collisions, this will result in ping sending and receiving the expected number of packets. Printing the source address and sequence number on signature mismatches would also help. I would have figured this out much quicker had ping told me the mismatch packets were from a completely different source. 
For example: PING 172.23.94.210 (172.23.94.210): 56 data bytes 64 bytes from 172.23.94.210: icmp_seq=0 ttl=253 time=2.645 ms 64 bytes from 172.23.94.210: icmp_seq=1 ttl=253 time=1.360 ms 64 bytes from 172.23.94.210: icmp_seq=2 ttl=253 time=0.506 ms 64 bytes from 172.23.94.210: icmp_seq=3 ttl=253 time=0.615 ms signature mismatch from 10.138.79.45: icmp_seq=0 64 bytes from 172.23.94.210: icmp_seq=4 ttl=253 time=0.431 ms --- 172.23.94.210 ping statistics --- 5 packets transmitted, 5 packets received, 0.0% packet loss round-trip min/avg/max/std-dev = 0.431/1.111/2.645/0.835 ms ok? Index: ping.c === RCS file: /cvs/src/sbin/ping/ping.c,v retrieving revision 1.245 diff -u -p -r1.245 ping.c --- ping.c 12 Jul 2021 15:09:19 - 1.245 +++ ping.c 18 Feb 2022 01:52:22 - @@ -1302,7 +1302,10 @@ pr_pack(u_char *buf, int cc, struct msgh if (timingsafe_memcmp(mac, &payload.mac, sizeof(mac)) != 0) { - printf("signature mismatch!\n"); + printf("signature mismatch from %s: " + "icmp_seq=%u\n", pr_addr(from, fromlen), + ntohs(seq)); + --nreceived; return; } timinginfo=1;
mpsafe dwxe(4)
This is almost identical to the changes I made to dwge(4) recently, since these drivers are very closely related. Unfortunately the only machine I have with dwxe(4) in it is armv7, so I can't test this properly, but it does still work there. Could someone with an arm64 allwinner board try this out more extensively? Index: if_dwxe.c === RCS file: /cvs/src/sys/dev/fdt/if_dwxe.c,v retrieving revision 1.19 diff -u -p -r1.19 if_dwxe.c --- if_dwxe.c 24 Oct 2021 17:52:26 - 1.19 +++ if_dwxe.c 3 Jan 2022 11:21:19 - @@ -275,6 +275,7 @@ struct dwxe_softc { bus_space_tag_t sc_iot; bus_space_handle_t sc_ioh; bus_dma_tag_t sc_dmat; + void*sc_ih; struct arpcom sc_ac; #define sc_lladdr sc_ac.ac_enaddr @@ -287,7 +288,6 @@ struct dwxe_softc { struct dwxe_buf *sc_txbuf; struct dwxe_desc*sc_txdesc; int sc_tx_prod; - int sc_tx_cnt; int sc_tx_cons; struct dwxe_dmamem *sc_rxring; @@ -322,7 +322,7 @@ uint32_t dwxe_read(struct dwxe_softc *, void dwxe_write(struct dwxe_softc *, bus_addr_t, uint32_t); intdwxe_ioctl(struct ifnet *, u_long, caddr_t); -void dwxe_start(struct ifnet *); +void dwxe_start(struct ifqueue *); void dwxe_watchdog(struct ifnet *); intdwxe_media_change(struct ifnet *); @@ -345,7 +345,7 @@ voiddwxe_rx_proc(struct dwxe_softc *); void dwxe_up(struct dwxe_softc *); void dwxe_down(struct dwxe_softc *); void dwxe_iff(struct dwxe_softc *); -intdwxe_encap(struct dwxe_softc *, struct mbuf *, int *); +intdwxe_encap(struct dwxe_softc *, struct mbuf *, int *, int *); void dwxe_reset(struct dwxe_softc *); void dwxe_stop_dma(struct dwxe_softc *); @@ -431,8 +431,9 @@ dwxe_attach(struct device *parent, struc ifp = &sc->sc_ac.ac_if; ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_xflags = IFXF_MPSAFE; ifp->if_ioctl = dwxe_ioctl; - ifp->if_start = dwxe_start; + ifp->if_qstart = dwxe_start; ifp->if_watchdog = dwxe_watchdog; ifq_set_maxlen(&ifp->if_snd, DWXE_NTXDESC - 1); bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ); @@ -460,8 +461,10 @@ 
dwxe_attach(struct device *parent, struc if_attach(ifp); ether_ifattach(ifp); - fdt_intr_establish(faa->fa_node, IPL_NET, dwxe_intr, sc, - sc->sc_dev.dv_xname); + sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE, + dwxe_intr, sc, sc->sc_dev.dv_xname); + if (sc->sc_ih == NULL) + printf("%s: can't establish interrupt\n", sc->sc_dev.dv_xname); } void @@ -584,11 +587,12 @@ dwxe_lladdr_write(struct dwxe_softc *sc) } void -dwxe_start(struct ifnet *ifp) +dwxe_start(struct ifqueue *ifq) { + struct ifnet *ifp = ifq->ifq_if; struct dwxe_softc *sc = ifp->if_softc; struct mbuf *m; - int error, idx; + int error, idx, left, used; if (!(ifp->if_flags & IFF_RUNNING)) return; @@ -600,27 +604,29 @@ dwxe_start(struct ifnet *ifp) return; idx = sc->sc_tx_prod; - while ((sc->sc_txdesc[idx].sd_status & DWXE_TX_DESC_CTL) == 0) { - m = ifq_deq_begin(&ifp->if_snd); - if (m == NULL) + left = sc->sc_tx_cons; + if (left <= idx) + left += DWXE_NTXDESC; + left -= idx; + used = 0; + + for (;;) { + if (used + DWXE_NTXSEGS + 1 > left) { + ifq_set_oactive(ifq); break; + } - error = dwxe_encap(sc, m, &idx); - if (error == ENOBUFS) { - ifq_deq_rollback(&ifp->if_snd, m); - ifq_set_oactive(&ifp->if_snd); + m = ifq_dequeue(ifq); + if (m == NULL) break; - } + + error = dwxe_encap(sc, m, &idx, &used); if (error == EFBIG) { - ifq_deq_commit(&ifp->if_snd, m); m_freem(m); /* give up: drop it */ ifp->if_oerrors++; continue; } - /* Now we are committed to transmit the packet. */ - ifq_deq_commit(&ifp->if_snd, m); - #if NBPFILTER > 0 if (ifp->if_bpf) bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT); @@ -632,6 +638,9 @@ dwxe_start(struct ifnet *ifp) /* Set a timeout in case the chip goes out to lunch. */ ifp->if_timer = 5; + + dwxe_write(sc, DWXE_TX_CTL1, dwxe_read(sc, +DWXE_TX_CTL1) | DWXE_TX_CTL1_TX_DMA_START); } }
Re: fix ldapd bug when removing last attribute
On Sun, Dec 19, 2021 at 01:31:24PM +0100, Claudio Jeker wrote: > In LDAP there are two ways to remove an attribute. > One can remove an attribute by just naming the attribute but it is also > possible to remove a specific attribute: value combo. > > In ldapd the latter is broken if the last attribute is removed because > the result of ldap_del_values() is an invalid encoding (empty sequence) > and with that the modification fails because validate_entry() fails. > The error is LDAP_INVALID_SYNTAX and I have noticed that in tools like > shelldap multiple times but never really connected the dots until now. > > This is the minimal way of solving this. If ldap_del_values() > removes the last element use ldap_del_attribute() to remove the attribute > but to make this work the ober_scanf_elements() format has to be relaxed > since what we remove no longer parses with "{s(". Is this an acceptable > solution? I think so, ldapd doesn't seem to check the attribute values consistently except where it actually has to look at them, so relaxing the check here seems fine to me. 
one note below, but ok jmatthew@ > > -- > :wq Claudio > > Index: attributes.c > === > RCS file: /cvs/src/usr.sbin/ldapd/attributes.c,v > retrieving revision 1.6 > diff -u -p -r1.6 attributes.c > --- attributes.c 24 Oct 2019 12:39:26 - 1.6 > +++ attributes.c 19 Dec 2021 12:12:48 - > @@ -181,7 +181,7 @@ ldap_del_attribute(struct ber_element *e > > attr = entry->be_sub; > while (attr) { > - if (ober_scanf_elements(attr, "{s(", &s) != 0) { > + if (ober_scanf_elements(attr, "{s", &s) != 0) { > log_warnx("failed to parse attribute"); > return -1; > } > @@ -240,6 +240,9 @@ ldap_del_values(struct ber_element *elm, > prev = v; > } > } > + > + if (old_vals->be_sub == NULL) > + return 1; > > return 0; > } > Index: modify.c > === > RCS file: /cvs/src/usr.sbin/ldapd/modify.c,v > retrieving revision 1.23 > diff -u -p -r1.23 modify.c > --- modify.c 24 Oct 2019 12:39:26 - 1.23 > +++ modify.c 19 Dec 2021 12:20:19 - > @@ -334,7 +334,8 @@ ldap_modify(struct request *req) >*/ > if (vals->be_sub && > vals->be_sub->be_type == BER_TYPE_OCTETSTRING) { > - ldap_del_values(a, vals); > + if (ldap_del_values(a, vals) == 1) > + ldap_del_attribute(entry, attr); > } else { > ldap_del_attribute(entry, attr); > } > Index: validate.c > === > RCS file: /cvs/src/usr.sbin/ldapd/validate.c,v > retrieving revision 1.12 > diff -u -p -r1.12 validate.c > --- validate.c24 Oct 2019 12:39:26 - 1.12 > +++ validate.c19 Dec 2021 11:42:48 - > @@ -313,6 +313,7 @@ validate_entry(const char *dn, struct be > objclass = objclass->be_next; /* skip attribute description */ > for (a = objclass->be_sub; a != NULL; a = a->be_next) { > if (ober_get_string(a, &s) != 0) { > + log_debug("bad ObjectClass encoding"); > rc = LDAP_INVALID_SYNTAX; > goto done; > } > @@ -396,6 +397,7 @@ validate_entry(const char *dn, struct be >*/ > for (a = entry->be_sub; a != NULL; a = a->be_next) { > if (ober_scanf_elements(a, "{se{", &s, &vals) != 0) { > + log_debug("bad attribue encoding"); misspelled 'attribute' here. 
> rc = LDAP_INVALID_SYNTAX; > goto done; > } >
fix ldapd unveil
ldapd currently can't reopen its database files, because it always passes O_CREAT to open() when reopening (see ldapd_open_request()), which means it needs the unveil 'c' flag. This may have been missed when ldapd was unveiled because 'ldapctl compact' was broken (see other diff). ok? Index: ldapd.c === RCS file: /cvs/src/usr.sbin/ldapd/ldapd.c,v retrieving revision 1.29 diff -u -p -r1.29 ldapd.c --- ldapd.c 14 Jul 2021 13:33:57 - 1.29 +++ ldapd.c 15 Dec 2021 03:42:04 - @@ -243,7 +243,7 @@ main(int argc, char *argv[]) err(1, "unveil %s.db", _PATH_LOGIN_CONF); if (unveil(_PATH_AUTHPROGDIR, "x") == -1) err(1, "unveil %s", _PATH_AUTHPROGDIR); - if (unveil(datadir, "rw") == -1) + if (unveil(datadir, "rwc") == -1) err(1, "unveil %s", datadir); if (unveil(NULL, NULL) == -1) err(1, "unveil");
fix ldapctl compact and index operations
r1.5 of ldapctl.c accidentally inverted the conditionals meant to skip compacting or indexing namespaces with referrals. ok? Index: ldapctl.c === RCS file: /cvs/src/usr.sbin/ldapctl/ldapctl.c,v retrieving revision 1.15 diff -u -p -u -p -r1.15 ldapctl.c --- ldapctl.c 15 Jan 2021 18:57:04 - 1.15 +++ ldapctl.c 15 Dec 2021 03:29:36 - @@ -128,8 +128,8 @@ compact_namespaces(const char *datadir) struct namespace*ns; TAILQ_FOREACH(ns, &conf->namespaces, next) { - if (SLIST_EMPTY(&ns->referrals)) - continue; + if (!SLIST_EMPTY(&ns->referrals)) + continue; if (compact_namespace(ns, datadir) != 0) return -1; } @@ -224,7 +224,7 @@ index_namespaces(const char *datadir) struct namespace*ns; TAILQ_FOREACH(ns, &conf->namespaces, next) { - if (SLIST_EMPTY(&ns->referrals)) + if (!SLIST_EMPTY(&ns->referrals)) continue; if (index_namespace(ns, datadir) != 0) return -1;
mpsafe dwge(4)
This applies our normal strategies for making network drivers mpsafe, and also writes to GMAC_TX_POLL_DEMAND once per call to dwge_start() rather than once per packet, and returns rx slots once per interrupt rather than once per packet. I've tested this on a rockpro64, where it makes tcpbench etc. a bit faster. I think I have an armv7 board with dwge(4) somewhere, but I haven't tested it there yet. ok? Index: if_dwge.c === RCS file: /cvs/src/sys/dev/fdt/if_dwge.c,v retrieving revision 1.12 diff -u -p -r1.12 if_dwge.c --- if_dwge.c 24 Oct 2021 17:52:26 - 1.12 +++ if_dwge.c 28 Nov 2021 09:36:56 - @@ -234,6 +234,7 @@ struct dwge_softc { bus_space_tag_t sc_iot; bus_space_handle_t sc_ioh; bus_dma_tag_t sc_dmat; + void*sc_ih; struct arpcom sc_ac; #define sc_lladdr sc_ac.ac_enaddr @@ -247,7 +248,6 @@ struct dwge_softc { struct dwge_buf *sc_txbuf; struct dwge_desc*sc_txdesc; int sc_tx_prod; - int sc_tx_cnt; int sc_tx_cons; struct dwge_dmamem *sc_rxring; @@ -289,7 +289,7 @@ uint32_t dwge_read(struct dwge_softc *, void dwge_write(struct dwge_softc *, bus_addr_t, uint32_t); intdwge_ioctl(struct ifnet *, u_long, caddr_t); -void dwge_start(struct ifnet *); +void dwge_start(struct ifqueue *); void dwge_watchdog(struct ifnet *); intdwge_media_change(struct ifnet *); @@ -312,7 +312,7 @@ voiddwge_rx_proc(struct dwge_softc *); void dwge_up(struct dwge_softc *); void dwge_down(struct dwge_softc *); void dwge_iff(struct dwge_softc *); -intdwge_encap(struct dwge_softc *, struct mbuf *, int *); +intdwge_encap(struct dwge_softc *, struct mbuf *, int *, int *); void dwge_reset(struct dwge_softc *); void dwge_stop_dma(struct dwge_softc *); @@ -422,8 +422,9 @@ dwge_attach(struct device *parent, struc ifp = &sc->sc_ac.ac_if; ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_xflags = IFXF_MPSAFE; ifp->if_ioctl = dwge_ioctl; - ifp->if_start = dwge_start; + ifp->if_qstart = dwge_start; ifp->if_watchdog = dwge_watchdog; ifq_set_maxlen(&ifp->if_snd, 
DWGE_NTXDESC - 1); bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ); @@ -535,8 +536,10 @@ dwge_attach(struct device *parent, struc dwge_write(sc, GMAC_MMC_TX_INT_MSK, 0x); dwge_write(sc, GMAC_MMC_IPC_INT_MSK, 0x); - fdt_intr_establish(faa->fa_node, IPL_NET, dwge_intr, sc, - sc->sc_dev.dv_xname); + sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE, + dwge_intr, sc, sc->sc_dev.dv_xname); + if (sc->sc_ih == NULL) + printf("%s: can't establish interrupt\n", sc->sc_dev.dv_xname); } void @@ -612,11 +615,12 @@ dwge_lladdr_write(struct dwge_softc *sc) } void -dwge_start(struct ifnet *ifp) +dwge_start(struct ifqueue *ifq) { + struct ifnet *ifp = ifq->ifq_if; struct dwge_softc *sc = ifp->if_softc; struct mbuf *m; - int error, idx; + int error, idx, left, used; if (!(ifp->if_flags & IFF_RUNNING)) return; @@ -628,27 +632,29 @@ dwge_start(struct ifnet *ifp) return; idx = sc->sc_tx_prod; - while ((sc->sc_txdesc[idx].sd_status & TDES0_OWN) == 0) { - m = ifq_deq_begin(&ifp->if_snd); - if (m == NULL) + left = sc->sc_tx_cons; + if (left <= idx) + left += DWGE_NTXDESC; + left -= idx; + used = 0; + + for (;;) { + if (used + DWGE_NTXSEGS + 1 > left) { + ifq_set_oactive(ifq); break; + } - error = dwge_encap(sc, m, &idx); - if (error == ENOBUFS) { - ifq_deq_rollback(&ifp->if_snd, m); - ifq_set_oactive(&ifp->if_snd); + m = ifq_dequeue(ifq); + if (m == NULL) break; - } + + error = dwge_encap(sc, m, &idx, &used); if (error == EFBIG) { - ifq_deq_commit(&ifp->if_snd, m); m_freem(m); /* give up: drop it */ ifp->if_oerrors++; continue; } - /* Now we are committed to transmit the packet. */ - ifq_deq_commit(&ifp->if_snd, m); - #if NBPFILTER > 0 if (ifp->if_bpf) bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT); @@ -660,6 +666,8 @@ dwge_start(struct ifnet *ifp) /* Set a timeout in case the chip goes out to lunch. */ ifp->if_timer =
Re: ixl(4): add rx/tx checksum offloading
On Tue, Oct 26, 2021 at 02:52:10PM +0200, Jan Klemkow wrote: > On Tue, Oct 26, 2021 at 05:17:55PM +1000, Jonathan Matthew wrote: > > First of all, thanks for looking at this, I forgot we hadn't done offloads > > for ixl(4) yet. > > You're welcome. > > > In the case of ixl(4), the driver has to tell the nic the length of each of > > the > > packet headers, so it should also be tested with vlan interfaces. > > > > I think ixl_tx_setup_offload() needs to account for outgoing vlan-tagged > > packets. > > Yes, it should. I just want to keep this diff small for now. I plan to > implement handling of vlan tags in a later diff. The code just stops > processing the offload and returns, if the stack tries to send out a > vlan taged ethernet frame in the switch-statement at the beginning. > > So, with vlan tags we just don't offload checksumming at the moment. and it turns out vlan interfaces don't allow checksum offload unless the parent interface does vlan tagging too, so this doesn't matter until that's implemented. I think I forget this every time. > > I also tested this scenario. > > > It currently assumes the ethernet header is ETHER_HDR_LEN bytes long, which > > isn't > > always true. See ixgbe_tx_ctx_setup() (sys/dev/pci/if_ix.c) for an example > > of > > a driver that takes this into account. > > I already looked at this code and will adapt vlan tagging later, if this > is OK for you? 
It'd probably be simpler to do vlan tagging first, so checksum offload could be done all at once, but since we're here already, ok jmatthew@ > > Thanks, > Jan > > > > Index: dev/pci/if_ixl.c > > > === > > > RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v > > > retrieving revision 1.75 > > > diff -u -p -r1.75 if_ixl.c > > > --- dev/pci/if_ixl.c 23 Jul 2021 00:29:14 - 1.75 > > > +++ dev/pci/if_ixl.c 25 Oct 2021 15:11:46 - > > > @@ -82,6 +82,10 @@ > > > #endif > > > > > > #include > > > +#include > > > +#include > > > +#include > > > +#include > > > #include > > > > > > #include > > > @@ -1388,6 +1392,7 @@ static int ixl_rxeof(struct ixl_softc *, > > > static void ixl_rxfill(struct ixl_softc *, struct ixl_rx_ring *); > > > static void ixl_rxrefill(void *); > > > static int ixl_rxrinfo(struct ixl_softc *, struct if_rxrinfo *); > > > +static void ixl_rx_checksum(struct mbuf *, uint64_t); > > > > > > #if NKSTAT > 0 > > > static void ixl_kstat_attach(struct ixl_softc *); > > > @@ -1942,9 +1947,9 @@ ixl_attach(struct device *parent, struct > > > ifp->if_capabilities = IFCAP_VLAN_MTU; > > > #if 0 > > > ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; > > > - ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 | > > > - IFCAP_CSUM_UDPv4; > > > #endif > > > + ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 | > > > + IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; > > > > > > ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status); > > > > > > @@ -2772,6 +2777,69 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm > > > } > > > > > > static void > > > +ixl_tx_setup_offload(struct mbuf *mp, uint64_t *cmd) > > > +{ > > > + uint64_t ip_hdr_len; > > > + int ipoff = ETHER_HDR_LEN; > > > + uint8_t ipproto; > > > + struct ip *ip; > > > +#ifdef INET6 > > > + struct ip6_hdr *ip6; > > > +#endif > > > + struct tcphdr *th; > > > + struct mbuf *m; > > > + > > > + switch (ntohs(mtod(mp, struct ether_header *)->ether_type)) { > > > + case ETHERTYPE_IP: > > > 
+ if (mp->m_pkthdr.len < ETHER_HDR_LEN + sizeof(*ip)) > > > + return; > > > + m = m_getptr(mp, ETHER_HDR_LEN, &ipoff); > > > + KASSERT(m != NULL && m->m_len - ipoff >= sizeof(*ip)); > > > + ip = (struct ip *)(m->m_data + ipoff); > > > + > > > + if (mp->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT) > > > + *cmd |= IXL_TX_DESC_CMD_IIPT_IPV4_CSUM; > > > + else > > > + *cmd |= IXL_TX_DESC_CMD_IIPT_IPV4; > > > + > > > +
Re: ixl(4): add rx/tx checksum offloading
Hi Jan, First of all, thanks for looking at this, I forgot we hadn't done offloads for ixl(4) yet. On Mon, Oct 25, 2021 at 05:27:28PM +0200, Jan Klemkow wrote: > On Fri, Oct 22, 2021 at 03:39:01PM +0200, Hrvoje Popovski wrote: > > On 22.10.2021. 13:39, Jan Klemkow wrote: > > > Thats because, you only see this flags, if the checksum offloading is > > > enabled for "sending". I'm still working/debugging on the sending side. > > > Thus, I just send a diff with the receiving part for now. > > > > > > You can see if its working for your card with the netstat(8) statistics. > > > > > > # netstat -s | grep software-checksummed > > > > > > These counters should not raise much on the receive side if you put some > > > traffic over the interface. > > > > Thank you for explanation... > > > > I'm sending 8 tcp streams with iperf3 from some box to openbsd ixl box > > and here are results: > > > > without diff > > smc24# netstat -s | grep software-checksummed > > 5039250 input datagrams software-checksummed > > 2592718 output datagrams software-checksummed > > 2592709 packets software-checksummed > > 5039250 packets software-checksummed > > 0 input packets software-checksummed > > 0 output packets software-checksummed > > > > cca 6.12 Gbits/sec > > > > > > > > with diff > > smc24# netstat -s | grep software-checksummed > > 0 input datagrams software-checksummed > > 2956546 output datagrams software-checksummed > > 2956537 packets software-checksummed > > 0 packets software-checksummed > > 0 input packets software-checksummed > > 0 output packets software-checksummed > > > > cca 6.70 Gbits/sec > > > > are result like those expected? > > > > is forwarding testing any good for checksum offload diffs? > > Hi Hrvoje, > > Thanks a lot for you big testing efforts! > > In case of forwarding the forwarding box just checks the IPv4 header > checksum and ignores the UDP/TCP header. Your setup from one Box to > another is fine. 
> > Here is a new diff, which also includes send checksum offloading. > Thus, all software-checksummed numbers should stay low in both > directions. > > Could you test this diff with your ospf{6}d and NFS tests? > If you see IPv4 fragments in the ospf and NFS traffic within tcpdump(8), > your test should find the bugs pointed out by deraadt@ and claudio@. In the case of ixl(4), the driver has to tell the nic the length of each of the packet headers, so it should also be tested with vlan interfaces. I think ixl_tx_setup_offload() needs to account for outgoing vlan-tagged packets. It currently assumes the ethernet header is ETHER_HDR_LEN bytes long, which isn't always true. See ixgbe_tx_ctx_setup() (sys/dev/pci/if_ix.c) for an example of a driver that takes this into account. > > You can provoke large NFS packets with the following options on your NFS > mount point. > > server:/export /mnt nfs ro,intr,-r65536,-w65536 > > Thanks, > Jan > > Index: dev/pci/if_ixl.c > === > RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v > retrieving revision 1.75 > diff -u -p -r1.75 if_ixl.c > --- dev/pci/if_ixl.c 23 Jul 2021 00:29:14 - 1.75 > +++ dev/pci/if_ixl.c 25 Oct 2021 15:11:46 - > @@ -82,6 +82,10 @@ > #endif > > #include > +#include > +#include > +#include > +#include > #include > > #include > @@ -1388,6 +1392,7 @@ static int ixl_rxeof(struct ixl_softc *, > static void ixl_rxfill(struct ixl_softc *, struct ixl_rx_ring *); > static void ixl_rxrefill(void *); > static int ixl_rxrinfo(struct ixl_softc *, struct if_rxrinfo *); > +static void ixl_rx_checksum(struct mbuf *, uint64_t); > > #if NKSTAT > 0 > static void ixl_kstat_attach(struct ixl_softc *); > @@ -1942,9 +1947,9 @@ ixl_attach(struct device *parent, struct > ifp->if_capabilities = IFCAP_VLAN_MTU; > #if 0 > ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; > - ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 | > - IFCAP_CSUM_UDPv4; > #endif > + ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 | > + 
IFCAP_CSUM_UDPv4 | IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; > > ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status); > > @@ -2772,6 +2777,69 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm > } > > static void > +ixl_tx_setup_offload(struct mbuf *mp, uint64_t *cmd) > +{ > + uint64_t ip_hdr_len; > + int ipoff = ETHER_HDR_LEN; > + uint8_t ipproto; > + struct ip *ip; > +#ifdef INET6 > + struct ip6_hdr *ip6; > +#endif > + struct tcphdr *th; > + struct mbuf *m; > + > + switch (ntohs(mtod(mp, struct ether_header *)->ether_type)) { > + case ETHERTYPE_IP: > + if (mp->m_pkthdr.len < ETHER_HDR_LEN + sizeof(*ip)) > + return; >
uaq(4): aquantia usb ethernet driver
Here's a driver for the Aquantia USB ethernet devices I just added to usbdevs. These are somewhat interesting because they theoretically go up to 5GbE and support jumbo frames (not implemented yet). While working on this I noticed that it doesn't receive 15-25% of the packets it should, even at very low packet rates, when connected to ehci(4) controllers. No such packet loss occurs with an xhci(4) controller. I'm not sure if this is a problem with our ehci driver or a poor hardware interaction. ok? Index: files.usb === RCS file: /cvs/src/sys/dev/usb/files.usb,v retrieving revision 1.145 diff -u -p -u -r1.145 files.usb --- files.usb 4 Feb 2021 16:25:39 - 1.145 +++ files.usb 31 Aug 2021 23:41:35 - @@ -295,6 +295,10 @@ device ure: ether, ifnet, mii, ifmedia attach ure at uhub file dev/usb/if_ure.cure +# Aquantia AQC111 +device uaq: ether, ifnet, ifmedia +attach uaq at uhub +file dev/usb/if_uaq.cuaq # Serial drivers # Modems Index: if_uaq.c === RCS file: if_uaq.c diff -N if_uaq.c --- /dev/null 1 Jan 1970 00:00:00 - +++ if_uaq.c31 Aug 2021 23:41:35 - @@ -0,0 +1,1397 @@ +/* $OpenBSD$ */ +/*- + * Copyright (c) 2021 Jonathan Matthew + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *notice, this list of conditions and the following disclaimer in the + *documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "bpfilter.h" +#include "vlan.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#if NBPFILTER > 0 +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include + +#ifdef UAQ_DEBUG +#define DPRINTF(x) do { if (uaqdebug) printf x; } while (0) +#define DPRINTFN(n,x) do { if (uaqdebug >= (n)) printf x; } while (0) +intuaqdebug = 0; +#else +#define DPRINTF(x) +#define DPRINTFN(n,x) +#endif + +#define UAQ_ENDPT_RX 0 +#define UAQ_ENDPT_TX 1 +#define UAQ_ENDPT_INTR 2 +#define UAQ_ENDPT_MAX 3 + +#define UAQ_TX_LIST_CNT1 +#define UAQ_RX_LIST_CNT1 +#define UAQ_TX_BUF_ALIGN 8 +#define UAQ_RX_BUF_ALIGN 8 + +#define UAQ_TX_BUFSZ 16384 +#define UAQ_RX_BUFSZ 32768 + +#define UAQ_CTL_READ 1 +#define UAQ_CTL_WRITE 2 + +#define UAQ_MCAST_FILTER_SIZE 8 + +/* control commands */ +#define UAQ_CMD_ACCESS_MAC 0x01 +#define UAQ_CMD_FLASH_PARAM0x20 +#define UAQ_CMD_PHY_POWER 0x31 +#define UAQ_CMD_WOL_CFG0x60 +#define UAQ_CMD_PHY_OPS0x61 + +/* SFR registers */ +#define UAQ_SFR_GENERAL_STATUS 0x03 +#define UAQ_SFR_CHIP_STATUS0x05 +#define UAQ_SFR_RX_CTL 0x0B +#define UAQ_SFR_RX_CTL_STOP 0x +#define UAQ_SFR_RX_CTL_PRO0x0001 +#define UAQ_SFR_RX_CTL_AMALL 0x0002 +#define UAQ_SFR_RX_CTL_AB 0x0008 +#define UAQ_SFR_RX_CTL_AM 0x0010 +#define UAQ_SFR_RX_CTL_START 0x0080 +#define UAQ_SFR_RX_CTL_IPE0x0200 +#define UAQ_SFR_IPG_0 0x0D +#define UAQ_SFR_NODE_ID0x10 
+#define UAQ_SFR_MCAST_FILTER 0x16 +#define UAQ_SFR_MEDIUM_STATUS_MODE 0x22 +#define UAQ_SFR_MEDIUM_XGMIIMODE 0x0001 +#define UAQ_SFR_MEDIUM_FULL_DUPLEX0x0002 +#define UAQ_SFR_MEDIUM_RXFLOW_CTRLEN 0x0010 +#define UAQ_SFR_MEDIUM_TXFLOW_CTRLEN 0x0020 +#define UAQ_SFR_MEDIUM_JUMBO_EN 0x0040 +#define UAQ_SFR_MEDIUM_RECEIVE_EN 0x0100 +#define UAQ_SFR_MONITOR_MODE 0x24 +#define UAQ_SFR_MONITOR_MODE_EPHYRW 0x01 +#define UAQ_SFR_MONITOR_MODE_RWLC 0x02 +#define UAQ_SFR_MONITOR_MOD
Re: snmpd: allow sending traps with SNMPv3
On Tue, Aug 10, 2021 at 12:58:05PM +0200, Martijn van Duren wrote: > On Mon, 2021-08-09 at 21:44 +0200, Martijn van Duren wrote: > > On Tue, 2021-07-27 at 21:28 +0200, Martijn van Duren wrote: > > > This diff allows sending traps in SNMPv3 messages. > > > It defaults to the global seclevel, but it can be specified on a per > > > rule basis. > > > > > > Diff requires both previous setting engineid and ober_dup diff. > > > > > > Tested with netsnmp's snmptrapd and my WIP diff. > > > > > > The other 2 outstanding diffs are for receiving SNMPv3 traps. > > > > > > OK? > > > > > > martijn@ > > > > > Resending now that the engineid diff is in. > > > > Still awaiting the commit of ober_dup diff[0]. > > > > OK once that one goes in? > > > > Also, rereading the diff, splitting the trap receiver in two might be a > > bit clutch. Once again invoking the manpage gurus. > > > > martijn@ > > > > [0] https://marc.info/?l=openbsd-tech&m=162698527126249&w=2 > > > The listen on diff committed this morning broke this patch. > Updated version I think my only concern with this is that the config syntax changes incompatibly, since you now have to specify 'snmpv2c' for v2c trap receivers. I can think of a few alternatives, but none of them are great. What you've done here seems to be the cleanest option both in terms of what the config looks like and the code for processing it, so if we're prepared to change the config syntax, I'm happy with it.
usb: don't pass USBD_EXCLUSIVE_USE to usbd_open_pipe_intr()
While working on a new driver, I noticed we have a few places where we pass USBD_EXCLUSIVE_USE as the flags parameter to usbd_open_pipe_intr(), which is wrong. The interrupt pipe is always opened exclusively, and the flags parameter is actually passed to usbd_setup_xfer(), where it means USBD_NO_COPY, so any data written by the transfer is not copied to the buffer where the driver expects it. I don't have hardware supported by any these drivers, but most of them don't look at the transferred data, and in a couple of them, the interrupt pipe code is #if 0'd out, so I think there is little chance this changes anything. ok? Index: if_aue.c === RCS file: /cvs/src/sys/dev/usb/if_aue.c,v retrieving revision 1.111 diff -u -p -u -p -r1.111 if_aue.c --- if_aue.c31 Jul 2020 10:49:32 - 1.111 +++ if_aue.c8 Aug 2021 03:25:19 - @@ -1355,7 +1355,7 @@ aue_openpipes(struct aue_softc *sc) return (EIO); } err = usbd_open_pipe_intr(sc->aue_iface, sc->aue_ed[AUE_ENDPT_INTR], - USBD_EXCLUSIVE_USE, &sc->aue_ep[AUE_ENDPT_INTR], sc, + 0, &sc->aue_ep[AUE_ENDPT_INTR], sc, &sc->aue_cdata.aue_ibuf, AUE_INTR_PKTLEN, aue_intr, AUE_INTR_INTERVAL); if (err) { Index: if_udav.c === RCS file: /cvs/src/sys/dev/usb/if_udav.c,v retrieving revision 1.84 diff -u -p -u -p -r1.84 if_udav.c --- if_udav.c 31 Jul 2020 10:49:32 - 1.84 +++ if_udav.c 8 Aug 2021 03:25:19 - @@ -769,7 +769,7 @@ udav_openpipes(struct udav_softc *sc) /* XXX: interrupt endpoint is not yet supported */ /* Open Interrupt pipe */ err = usbd_open_pipe_intr(sc->sc_ctl_iface, sc->sc_intrin_no, - USBD_EXCLUSIVE_USE, &sc->sc_pipe_intr, sc, + 0, &sc->sc_pipe_intr, sc, &sc->sc_cdata.udav_ibuf, UDAV_INTR_PKGLEN, udav_intr, UDAV_INTR_INTERVAL); if (err) { Index: if_ugl.c === RCS file: /cvs/src/sys/dev/usb/if_ugl.c,v retrieving revision 1.26 diff -u -p -u -p -r1.26 if_ugl.c --- if_ugl.c31 Jul 2020 10:49:32 - 1.26 +++ if_ugl.c8 Aug 2021 03:25:20 - @@ -681,7 +681,7 @@ ugl_openpipes(struct ugl_softc *sc) return (EIO); } err = 
usbd_open_pipe_intr(sc->sc_iface, sc->sc_ed[UGL_ENDPT_INTR], - USBD_EXCLUSIVE_USE, &sc->sc_ep[UGL_ENDPT_INTR], sc, + 0, &sc->sc_ep[UGL_ENDPT_INTR], sc, sc->sc_ibuf, UGL_INTR_PKTLEN, ugl_intr, UGL_INTR_INTERVAL); if (err) { Index: if_upl.c === RCS file: /cvs/src/sys/dev/usb/if_upl.c,v retrieving revision 1.78 diff -u -p -u -p -r1.78 if_upl.c --- if_upl.c31 Jul 2020 10:49:32 - 1.78 +++ if_upl.c8 Aug 2021 03:25:20 - @@ -661,7 +661,7 @@ upl_openpipes(struct upl_softc *sc) return (EIO); } err = usbd_open_pipe_intr(sc->sc_iface, sc->sc_ed[UPL_ENDPT_INTR], - USBD_EXCLUSIVE_USE, &sc->sc_ep[UPL_ENDPT_INTR], sc, + 0, &sc->sc_ep[UPL_ENDPT_INTR], sc, &sc->sc_ibuf, UPL_INTR_PKTLEN, upl_intr, UPL_INTR_INTERVAL); if (err) { Index: if_url.c === RCS file: /cvs/src/sys/dev/usb/if_url.c,v retrieving revision 1.88 diff -u -p -u -p -r1.88 if_url.c --- if_url.c31 Jul 2020 10:49:33 - 1.88 +++ if_url.c8 Aug 2021 03:25:20 - @@ -635,7 +635,7 @@ url_openpipes(struct url_softc *sc) /* XXX: interrupt endpoint is not yet supported */ /* Open Interrupt pipe */ err = usbd_open_pipe_intr(sc->sc_ctl_iface, sc->sc_intrin_no, - USBD_EXCLUSIVE_USE, &sc->sc_pipe_intr, sc, + 0, &sc->sc_pipe_intr, sc, &sc->sc_cdata.url_ibuf, URL_INTR_PKGLEN, url_intr, URL_INTR_INTERVAL); if (err) { Index: if_wi_usb.c === RCS file: /cvs/src/sys/dev/usb/if_wi_usb.c,v retrieving revision 1.73 diff -u -p -u -p -r1.73 if_wi_usb.c --- if_wi_usb.c 31 Jul 2020 10:49:33 - 1.73 +++ if_wi_usb.c 8 Aug 2021 03:25:21 - @@ -1233,7 +1233,7 @@ wi_usb_open_pipes(struct wi_usb_softc *s /* is this used? */ err = usbd_open_pipe_intr(sc->wi_usb_iface, - sc->wi_usb_ed[WI_USB_ENDPT_INTR], USBD_EXCLUSIVE_USE, + sc->wi_usb_ed[WI_USB_ENDPT_INTR], 0, &sc->wi_usb_ep[WI_USB_ENDPT_INTR], sc, &sc->wi_usb_ibuf, WI_USB_INTR_PKTLEN, wi_usb_intr, WI_USB_INTR_INTERVAL); if (err) {
Re: libutil/ber: add ober_dup(3)
On Thu, Jul 22, 2021 at 10:19:59PM +0200, Martijn van Duren wrote: > I'm currently working on adding SNMPv3 support to traps in snmpd(8). > For sending traps we loop over sc_trapreceivers and can send each trap > to 0 or more receivers. > > I want to high-jack snmpe_response() to do the heavy lifting for doing > the snmp/usm encoding, but this interface frees the varbindlist in > snmp_msgfree(), which means I need to rebuild the varbindlist for every > iteration. To keep this simple I suggest adding ober_dup, which > duplicates a full ber_element chain. > > Sending this prior to any of my snmpd(8) work, since it requires a > library version bump. > > OK? > Any additional coordination needed for this diff? ok jmatthew@ I don't think we need to worry about clashing with existing symbols in ports, since we renamed ber_* to ober_* because no one was using it. > > martijn@ > > Index: Symbols.map > === > RCS file: /cvs/src/lib/libutil/Symbols.map,v > retrieving revision 1.3 > diff -u -p -r1.3 Symbols.map > --- Symbols.map 24 Oct 2019 12:39:26 - 1.3 > +++ Symbols.map 22 Jul 2021 20:18:35 - > @@ -65,6 +65,7 @@ > ober_add_set; > ober_add_string; > ober_calc_len; > + ober_dup; > ober_free; > ober_free_element; > ober_free_elements; > Index: ber.c > === > RCS file: /cvs/src/lib/libutil/ber.c,v > retrieving revision 1.21 > diff -u -p -r1.21 ber.c > --- ber.c 22 Feb 2021 17:15:02 - 1.21 > +++ ber.c 22 Jul 2021 20:18:35 - > @@ -926,6 +926,43 @@ ober_getpos(struct ber_element *elm) > return elm->be_offs; > } > > +struct ber_element * > +ober_dup(struct ber_element *orig) > +{ > + struct ber_element *new; > + > + if ((new = malloc(sizeof(*new))) == NULL) > + return NULL; > + memcpy(new, orig, sizeof(*new)); > + new->be_next = NULL; > + new->be_sub = NULL; > + > + if (orig->be_next != NULL) { > + if ((new->be_next = ober_dup(orig->be_next)) == NULL) > + goto fail; > + } > + if (orig->be_encoding == BER_TYPE_SEQUENCE || > + orig->be_encoding == BER_TYPE_SET) { > + if (orig->be_sub 
!= NULL) { > + if ((new->be_sub = ober_dup(orig->be_sub)) == NULL) > + goto fail; > + } > + } else if (orig->be_encoding == BER_TYPE_OCTETSTRING || > + orig->be_encoding == BER_TYPE_BITSTRING || > + orig->be_encoding == BER_TYPE_OBJECT) { > + if (orig->be_val != NULL) { > + if ((new->be_val = malloc(orig->be_len)) == NULL) > + goto fail; > + memcpy(new->be_val, orig->be_val, orig->be_len); > + } > + } else > + new->be_numeric = orig->be_numeric; > + return new; > + fail: > + ober_free_elements(new); > + return NULL; > +} > + > void > ober_free_element(struct ber_element *root) > { > Index: ber.h > === > RCS file: /cvs/src/lib/libutil/ber.h,v > retrieving revision 1.3 > diff -u -p -r1.3 ber.h > --- ber.h 31 Dec 2019 10:34:14 - 1.3 > +++ ber.h 22 Jul 2021 20:18:35 - > @@ -137,6 +137,7 @@ ssize_tober_write_elements(struct be > void ober_set_readbuf(struct ber *, void *, size_t); > struct ber_element *ober_read_elements(struct ber *, struct ber_element *); > off_t ober_getpos(struct ber_element *); > +struct ber_element *ober_dup(struct ber_element *); > void ober_free_element(struct ber_element *); > void ober_free_elements(struct ber_element *); > size_tober_calc_len(struct ber_element *); > Index: ober_set_header.3 > === > RCS file: /cvs/src/lib/libutil/ober_set_header.3,v > retrieving revision 1.2 > diff -u -p -r1.2 ober_set_header.3 > --- ober_set_header.3 12 Mar 2021 05:18:01 - 1.2 > +++ ober_set_header.3 22 Jul 2021 20:18:35 - > @@ -23,6 +23,7 @@ > .Nm ober_set_writecallback , > .Nm ober_link_elements , > .Nm ober_replace_elements , > +.Nm ober_dup , > .Nm ober_unlink_elements , > .Nm ober_free_element , > .Nm ober_free_elements > @@ -45,6 +46,8 @@ > .Ft "void" > .Fn "ober_replace_elements" "struct ber_element *prev" "struct ber_element > *elm" > .Ft "struct ber_element *" > +.Ft "struct ber_element *" > +.Fn "ober_dup" "struct ber_element *orig" > .Fn "ober_unlink_elements" "struct ber_element *prev" > .Ft "void" > .Fn "ober_free_element" "struct ber_element 
*root" > @@ -101,6 +104,9 @@ with > and frees any dynamically allocated storage associated with > .Fa prev . > .
Re: snmpd(8): set smi_application in usm_decrypt
On Thu, Jul 22, 2021 at 10:27:44PM +0200, Martijn van Duren wrote: > Not an issue with read requests, but will set requests if they contain > snmp application elements such as timeticks. > > Definitely needed for upcoming SNMPv3 trap support. > > OK? ok jmatthew@ > > martijn@ > > Index: usm.c > === > RCS file: /cvs/src/usr.sbin/snmpd/usm.c,v > retrieving revision 1.20 > diff -u -p -r1.20 usm.c > --- usm.c 20 Jun 2021 19:55:48 - 1.20 > +++ usm.c 22 Jul 2021 20:27:01 - > @@ -630,6 +630,7 @@ usm_decrypt(struct snmp_message *msg, st > return NULL; > > bzero(&ber, sizeof(ber)); > + ober_set_application(&ber, smi_application); > ober_set_readbuf(&ber, buf, scoped_pdu_len); > scoped_pdu = ober_read_elements(&ber, NULL); > > >
Re: snmpd(8): fix trapv2 on correct protocol detection
On Thu, Jul 22, 2021 at 10:31:53PM +0200, Martijn van Duren wrote: > This typo snuck in when merging traphandler into snmpe. > Not a big deal since it's there just for ASN1/SMI strictness, but it > breaks when introducing SNMPv3 support. > > OK? ok jmatthew@ > > martijn@ > > Index: snmpe.c > === > RCS file: /cvs/src/usr.sbin/snmpd/snmpe.c,v > retrieving revision 1.72 > diff -u -p -r1.72 snmpe.c > --- snmpe.c 20 Jun 2021 19:55:48 - 1.72 > +++ snmpe.c 22 Jul 2021 20:31:31 - > @@ -381,7 +381,7 @@ badversion: > case SNMP_C_TRAPV2: > if (msg->sm_pdutype == SNMP_C_TRAPV2 && > !(msg->sm_version == SNMP_V2 || > - msg->sm_version != SNMP_V3)) { > + msg->sm_version == SNMP_V3)) { > msg->sm_errstr = "trapv2 request on !SNMPv2C or " > "!SNMPv3 message"; > goto parsefail; > >
Re: snmpd(8): Allow setting engineid
On Tue, Jul 27, 2021 at 08:43:20PM +0200, Martijn van Duren wrote: > Previous diff failed to set the initial bit when not defining engineid > in the config. > > On Fri, 2021-07-23 at 15:41 +0200, Martijn van Duren wrote: > > This diff introduces setting the engineid for snmpd(8). > > Although this diff might seem quite excessive at first glance, there's > > a valid reason to do so. > > > > The following things are in effect when sending an SNMPv3 trap: > > - SNMP trap packets are unacknowledged; meaning that we don't get a > > response -, nor report message. > > - SNMPv3 packets with a trap contain the engineid of the sender. > > - The key used in auth and priv are derived from the password and the > > engineid. > > - users are linked to an engineid > > > > So if we're sending messages in SNMPv3 format we can't generate a random > > engineid on each boot as we do now, or the trap receiver can't find the > > correct user. Since I want to keep the default config as empty as > > possible I've choosen to use the first 27 bytes (maximum length that > > fits in the engineid) of the sha256 hash of the hostname(3). This should > > give us the biggest confidence in having a consistent name that won't > > clash with other agents. If someone has a better idea though, please > > speak up now. This seems reasonable to me. Another option would be to generate a random ID once and store it on disk, like the SOII key. Seems like an awkward thing to do when there's also a config file that the information could be in, though, so I don't think this is really a good option. > > > > As for allowing to set the engineid: When receiving a trap admins will > > need to be able to specify the engineid of the remote agent, or there > > will be problems with the key generation of that user. > > Given this requirement it's a small step to allow the same yacc rules > > to be used for setting the global engineid and gives a little more > > control to the admin. 
The global engineid just happens to be more > > convenient to implement first. > > > > OK? If no one has any better ideas for generating a default engine ID, ok jmatthew@
Re: ix(4): fix Rx hash type
On Wed, Jul 14, 2021 at 01:46:37PM +0800, Kevin Lo wrote: > Hi, > > The diff below fixes Rx desc RSS type. This matches what Linux and FreeBSD > do. > ok? ok jmatthew@ > > Index: sys/dev/pci/if_ix.c > === > RCS file: /cvs/src/sys/dev/pci/if_ix.c,v > retrieving revision 1.178 > diff -u -p -u -p -r1.178 if_ix.c > --- sys/dev/pci/if_ix.c 22 Dec 2020 23:25:37 - 1.178 > +++ sys/dev/pci/if_ix.c 14 Jul 2021 05:41:08 - > @@ -3071,7 +3071,8 @@ ixgbe_rxeof(struct rx_ring *rxr) > > i = rxr->next_to_check; > while (if_rxr_inuse(&rxr->rx_ring) > 0) { > - uint32_t hash, hashtype; > + uint32_t hash; > + uint16_t hashtype; > > bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, > dsize * i, dsize, BUS_DMASYNC_POSTREAD); > @@ -3101,7 +3102,8 @@ ixgbe_rxeof(struct rx_ring *rxr) > vtag = letoh16(rxdesc->wb.upper.vlan); > eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0); > hash = lemtoh32(&rxdesc->wb.lower.hi_dword.rss); > - hashtype = lemtoh32(&rxdesc->wb.lower.lo_dword.data) & > + hashtype = > + lemtoh16(&rxdesc->wb.lower.lo_dword.hs_rss.pkt_info) & > IXGBE_RXDADV_RSSTYPE_MASK; > > if (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) { >
Re: ahci(4): Add support for JMicron JMB585 chipset
On Thu, Jul 22, 2021 at 10:45:17PM -0400, Ashton Fagg wrote: > I have two devices here based on the JMicron JMB585 chipset. This diff > adds the required pcidev IDs and disables native command queuing in > the driver. FreeBSD does something similar for this device: > > https://github.com/freebsd/freebsd-src/commit/16b766eed443043f4216d50e40ba283e74f992c2 Can you explain how you came to the conclusion that you'd need to disable NCQ? The FreeBSD commit you link to doesn't appear to do that as they're not applying the AHCI_Q_NONCQ flag to these devices. Does it not work with NCQ enabled?
Re: ix(4)/riscv64: Make ix(4) work when MSI-X interrupts aren't available
On Tue, Jul 20, 2021 at 02:21:39PM +0200, Mark Kettenis wrote: > > Date: Tue, 20 Jul 2021 21:55:56 +1000 > > From: Jonathan Matthew > > > > On Mon, Jul 19, 2021 at 07:37:10PM -0400, Ashton Fagg wrote: > > > I have an Intel 82599 10 gigabit ethernet card I wanted to get working > > > on my SiFive Unmatched board. > > > > > > I found the ix(4) driver has some weirdness around MSI-X > > > interrupts. While the driver supports operating both with and without > > > MSI-X support, it's hard-coded via a flag rather than dynamically checking > > > if it's available. If the flag is set (which it always is right now), > > > but MSI-X isn't available, the driver will throw an error and the device > > > won't work: > > > > > > ix0 at pci7 dev 0 function 0 "Intel 82599" rev 0x01ixgbe_allocate_msix: > > > pci_intr_map_msix vec 0 failed > > > > > > The root cause is this call failing in if_ix.c: > > > > > > if (pci_intr_map_msix(pa, i, &ih)) { > > > printf("ixgbe_allocate_msix: " > > > "pci_intr_map_msix vec %d failed\n", i); > > > error = ENOMEM; > > > goto fail; > > > } > > > > > > > > > Because in _pci_intr_map_msix (in sys/arch/riscv64/dev/pci_machdep.c): > > > > > > if ((pa->pa_flags & PCI_FLAGS_MSI_ENABLED) == 0 || > > > pci_get_capability(pc, tag, PCI_CAP_MSI, NULL, NULL) == 0) > > > return -1; > > > > > > The PCI attach flags would not have PCI_FLAGS_MSI_ENABLED set. > > > > > > The following diff remedies that by checking if PCI_FLAGS_MSI_ENABLED is > > > actually set, rather than just trying and failing because the hard-coded > > > flag says so. It also enables ix(4) in the kernel config for > > > riscv64. Effectively, the driver will now only try to use MSI-X if the > > > machine is advertising it to be available. > > > > I'd rather not have to do this in every driver. 
We otherwise check that > > flag > > inside the pci interrupt functions rather than in the driver code, so we > > should do so in pci_intr_msix_count() too, since that's what we call in > > multi-queue nic drivers to decide whether to use MSI-X. Drivers that only > > want a single vector will just call pci_intr_map_msix() and fall back to MSI > > or legacy interrupts if that fails. > > > > I posted the alternate version of this diff to misc@ a few days ago, > > which repeats the checks used to set PCI_FLAGS_MSI_ENABLED in > > pci_intr_msix_count(), rather than passing in struct > > pci_attach_args, in case we prefer to do it that way. > > I don't really read misc@, so don't post your patches there. Right, it was just there for testing. > > > Mark, what do you think? > > Yeah, making pci_intr_msix_count() should return 0 if MSIs are not > supported. A bit strange though to pass both pa and pa->pa_tag. I'd > change the function to only take pa as an argument. Yes, on second look that makes sense. Here's a better diff with that change, and that also doesn't break arches without __HAVE_PCI_MSIX. ok? 
Index: if_bnxt.c === RCS file: /cvs/src/sys/dev/pci/if_bnxt.c,v retrieving revision 1.32 diff -u -p -u -p -r1.32 if_bnxt.c --- if_bnxt.c 24 Apr 2021 09:37:46 - 1.32 +++ if_bnxt.c 21 Jul 2021 03:24:44 - @@ -537,7 +537,7 @@ bnxt_attach(struct device *parent, struc sc->sc_flags |= BNXT_FLAG_MSIX; intrstr = pci_intr_string(sc->sc_pc, ih); - nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag); + nmsix = pci_intr_msix_count(pa); if (nmsix > 1) { sc->sc_ih = pci_intr_establish(sc->sc_pc, ih, IPL_NET | IPL_MPSAFE, bnxt_admin_intr, sc, DEVNAME(sc)); Index: if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.178 diff -u -p -u -p -r1.178 if_ix.c --- if_ix.c 22 Dec 2020 23:25:37 - 1.178 +++ if_ix.c 21 Jul 2021 03:24:44 - @@ -1783,7 +1783,7 @@ ixgbe_setup_msix(struct ix_softc *sc) if (!ixgbe_enable_msix) return; - nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag); + nmsix = pci_intr_msix_count(pa); if (nmsix <= 1) return; Index: if_ixl.c ==
Re: ix(4)/riscv64: Make ix(4) work when MSI-X interrupts aren't available
On Mon, Jul 19, 2021 at 07:37:10PM -0400, Ashton Fagg wrote: > I have an Intel 82599 10 gigabit ethernet card I wanted to get working > on my SiFive Unmatched board. > > I found the ix(4) driver has some weirdness around MSI-X > interrupts. While the driver supports operating both with and without > MSI-X support, it's hard-coded via a flag rather than dynamically checking > if it's available. If the flag is set (which it always is right now), > but MSI-X isn't available, the driver will throw an error and the device > won't work: > > ix0 at pci7 dev 0 function 0 "Intel 82599" rev 0x01ixgbe_allocate_msix: > pci_intr_map_msix vec 0 failed > > The root cause is this call failing in if_ix.c: > > if (pci_intr_map_msix(pa, i, &ih)) { > printf("ixgbe_allocate_msix: " > "pci_intr_map_msix vec %d failed\n", i); > error = ENOMEM; > goto fail; > } > > > Because in _pci_intr_map_msix (in sys/arch/riscv64/dev/pci_machdep.c): > > if ((pa->pa_flags & PCI_FLAGS_MSI_ENABLED) == 0 || > pci_get_capability(pc, tag, PCI_CAP_MSI, NULL, NULL) == 0) > return -1; > > The PCI attach flags would not have PCI_FLAGS_MSI_ENABLED set. > > The following diff remedies that by checking if PCI_FLAGS_MSI_ENABLED is > actually set, rather than just trying and failing because the hard-coded > flag says so. It also enables ix(4) in the kernel config for > riscv64. Effectively, the driver will now only try to use MSI-X if the > machine is advertising it to be available. I'd rather not have to do this in every driver. We otherwise check that flag inside the pci interrupt functions rather than in the driver code, so we should do so in pci_intr_msix_count() too, since that's what we call in multi-queue nic drivers to decide whether to use MSI-X. Drivers that only want a single vector will just call pci_intr_map_msix() and fall back to MSI or legacy interrupts if that fails. 
I posted the alternate version of this diff to misc@ a few days ago, which repeats the checks used to set PCI_FLAGS_MSI_ENABLED in pci_intr_msix_count(), rather than passing in struct pci_attach_args, in case we prefer to do it that way. Mark, what do you think? Index: if_bnxt.c === RCS file: /cvs/src/sys/dev/pci/if_bnxt.c,v retrieving revision 1.32 diff -u -p -u -p -r1.32 if_bnxt.c --- if_bnxt.c 24 Apr 2021 09:37:46 - 1.32 +++ if_bnxt.c 20 Jul 2021 11:23:22 - @@ -537,7 +537,7 @@ bnxt_attach(struct device *parent, struc sc->sc_flags |= BNXT_FLAG_MSIX; intrstr = pci_intr_string(sc->sc_pc, ih); - nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag); + nmsix = pci_intr_msix_count(pa, pa->pa_tag); if (nmsix > 1) { sc->sc_ih = pci_intr_establish(sc->sc_pc, ih, IPL_NET | IPL_MPSAFE, bnxt_admin_intr, sc, DEVNAME(sc)); Index: if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.178 diff -u -p -u -p -r1.178 if_ix.c --- if_ix.c 22 Dec 2020 23:25:37 - 1.178 +++ if_ix.c 20 Jul 2021 11:23:22 - @@ -1783,7 +1783,7 @@ ixgbe_setup_msix(struct ix_softc *sc) if (!ixgbe_enable_msix) return; - nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag); + nmsix = pci_intr_msix_count(pa, pa->pa_tag); if (nmsix <= 1) return; Index: if_ixl.c === RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.74 diff -u -p -u -p -r1.74 if_ixl.c --- if_ixl.c26 Mar 2021 08:02:34 - 1.74 +++ if_ixl.c20 Jul 2021 11:23:22 - @@ -1795,7 +1795,7 @@ ixl_attach(struct device *parent, struct } if (pci_intr_map_msix(pa, 0, &sc->sc_ih) == 0) { - int nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag); + int nmsix = pci_intr_msix_count(pa, pa->pa_tag); if (nmsix > 1) { /* we used 1 (the 0th) for the adminq */ nmsix--; Index: if_mcx.c === RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v retrieving revision 1.101 diff -u -p -u -p -r1.101 if_mcx.c --- if_mcx.c2 Jun 2021 19:16:11 - 1.101 +++ if_mcx.c20 Jul 2021 11:23:22 - @@ -2831,7 +2831,7 @@ mcx_attach(struct device *parent, struct goto teardown; } - 
msix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag); + msix = pci_intr_msix_count(pa, pa->pa_tag); if (msix < 2) { printf(": not enough msi-x vectors\n"); goto teardown; Index: if_vmx.c === RCS file: /cvs/src/sys/dev
Re: uao references & uao_swap_off() cleanup
On Wed, Jun 23, 2021 at 09:37:10AM +0200, Martin Pieuchot wrote: > On 16/06/21(Wed) 11:26, Martin Pieuchot wrote: > > Diff below does two things: > > > > - Use atomic operations for incrementing/decrementing references of > > anonymous objects. This allows us to manipulate them without holding > > the KERNEL_LOCK(). > > > > - Rewrite the loop from uao_swap_off() to only keep a reference to the > > next item in the list. This is imported from NetBSD and is necessary > > to introduce locking around uao_pagein(). > > > > ok? > > Anyone? uao_reference_locked() and uao_detach_locked() are prototyped in uvm_extern.h, so they should be removed here too. It doesn't look like uao_detach() is safe to call without the kernel lock; it calls uao_dropswap() for each page, which calls uao_set_swslot(), which includes a KERNEL_ASSERT_LOCKED(). Should we keep the KERNEL_ASSERT_LOCKED() in uao_detach()? ok jmatthew@ otherwise > > > > > Index: uvm/uvm_aobj.c > > === > > RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v > > retrieving revision 1.98 > > diff -u -p -r1.98 uvm_aobj.c > > --- uvm/uvm_aobj.c 16 Jun 2021 09:02:21 - 1.98 > > +++ uvm/uvm_aobj.c 16 Jun 2021 09:20:26 - > > @@ -779,19 +779,11 @@ uao_init(void) > > void > > uao_reference(struct uvm_object *uobj) > > { > > - KERNEL_ASSERT_LOCKED(); > > - uao_reference_locked(uobj); > > -} > > - > > -void > > -uao_reference_locked(struct uvm_object *uobj) > > -{ > > - > > /* Kernel object is persistent. */ > > if (UVM_OBJ_IS_KERN_OBJECT(uobj)) > > return; > > > > - uobj->uo_refs++; > > + atomic_inc_int(&uobj->uo_refs); > > } > > > > > > @@ -801,34 +793,19 @@ uao_reference_locked(struct uvm_object * > > void > > uao_detach(struct uvm_object *uobj) > > { > > - KERNEL_ASSERT_LOCKED(); > > - uao_detach_locked(uobj); > > -} > > - > > - > > -/* > > - * uao_detach_locked: drop a reference to an aobj > > - * > > - * => aobj may freed upon return. 
> > - */ > > -void > > -uao_detach_locked(struct uvm_object *uobj) > > -{ > > struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; > > struct vm_page *pg; > > > > /* > > * Detaching from kernel_object is a NOP. > > */ > > - if (UVM_OBJ_IS_KERN_OBJECT(uobj)) { > > + if (UVM_OBJ_IS_KERN_OBJECT(uobj)) > > return; > > - } > > > > /* > > * Drop the reference. If it was the last one, destroy the object. > > */ > > - uobj->uo_refs--; > > - if (uobj->uo_refs) { > > + if (atomic_dec_int_nv(&uobj->uo_refs) > 0) { > > return; > > } > > > > @@ -1265,68 +1242,54 @@ uao_dropswap(struct uvm_object *uobj, in > > boolean_t > > uao_swap_off(int startslot, int endslot) > > { > > - struct uvm_aobj *aobj, *nextaobj, *prevaobj = NULL; > > + struct uvm_aobj *aobj; > > > > /* > > -* Walk the list of all anonymous UVM objects. > > +* Walk the list of all anonymous UVM objects. Grab the first. > > */ > > mtx_enter(&uao_list_lock); > > + if ((aobj = LIST_FIRST(&uao_list)) == NULL) { > > + mtx_leave(&uao_list_lock); > > + return FALSE; > > + } > > + uao_reference(&aobj->u_obj); > > > > - for (aobj = LIST_FIRST(&uao_list); > > -aobj != NULL; > > -aobj = nextaobj) { > > + do { > > + struct uvm_aobj *nextaobj; > > boolean_t rv; > > > > /* > > -* add a ref to the aobj so it doesn't disappear > > -* while we're working. > > -*/ > > - uao_reference_locked(&aobj->u_obj); > > - > > - /* > > -* now it's safe to unlock the uao list. > > -* note that lock interleaving is alright with IPL_NONE mutexes. > > +* Prefetch the next object and immediately hold a reference > > +* on it, so neither the current nor the next entry could > > +* disappear while we are iterating. > > */ > > - mtx_leave(&uao_list_lock); > > - > > - if (prevaobj) { > > - uao_detach_locked(&prevaobj->u_obj); > > - prevaobj = NULL; > > + if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL) { > > + uao_reference(&nextaobj->u_obj); > > } > > + mtx_leave(&uao_list_lock); > > > > /* > > -* page in any pages in the swslot range. 
> > -* if there's an error, abort and return the error. > > +* Page in all pages in the swap slot range. > > */ > > rv = uao_pagein(aobj, startslot, endslot); > > + > > + /* Drop the reference of the current object. */ > > + uao_detach(&aobj->u_obj); > > if (rv) { > > - uao_detach_locked(&aobj->u_obj); > > + if (nextaobj) { > > + uao_detach(&nextaobj->u_obj); > > +
Re: Reaper & amaps
On Mon, Jun 14, 2021 at 05:35:07PM +0200, Mark Kettenis wrote: > > Date: Mon, 14 Jun 2021 11:50:24 +0200 > > From: Martin Pieuchot > > > > Now that operations on amaps are serialized using a per-map rwlock > > the KERNEL_LOCK() shouldn't be necessary to call amap_unref(). The > > diff below allows the reaper to do this operation before grabbing it. > > > > I haven't seen any relevant contention on the reaper in my profilings, > > so I don't expect any visible change related to this change. However > > this reflects the current state of locking in UVM and helps me shrink > > my diff. > > > > ok? > > This means we no longer call uvm_pause() for these, but I believe the > main reason for calling uvm_pause() is to prevent us from holding the > kernel lock for too long. So I think that's fine. > > ok kettenis@ And I guess to allow something else to run if we're on a single processor system, which I don't think is a huge concern. ok jmatthew@ > > > > Index: uvm/uvm_map.c > > === > > RCS file: /cvs/src/sys/uvm/uvm_map.c,v > > retrieving revision 1.275 > > diff -u -p -r1.275 uvm_map.c > > --- uvm/uvm_map.c 22 May 2021 08:38:29 - 1.275 > > +++ uvm/uvm_map.c 14 Jun 2021 09:32:04 - > > @@ -1571,10 +1571,16 @@ uvm_unmap_detach(struct uvm_map_deadq *d > > > > TAILQ_FOREACH_SAFE(entry, deadq, dfree.deadq, tmp) { > > /* Skip entries for which we have to grab the kernel lock. */ > > - if (entry->aref.ar_amap || UVM_ET_ISSUBMAP(entry) || > > - UVM_ET_ISOBJ(entry)) > > + if (UVM_ET_ISSUBMAP(entry) || UVM_ET_ISOBJ(entry)) > > continue; > > > > + /* Drop reference to amap, if we've got one. 
*/ > > + if (entry->aref.ar_amap) > > + amap_unref(entry->aref.ar_amap, > > + entry->aref.ar_pageoff, > > + atop(entry->end - entry->start), > > + flags & AMAP_REFALL); > > + > > TAILQ_REMOVE(deadq, entry, dfree.deadq); > > uvm_mapent_free(entry); > > } > > @@ -1586,12 +1592,6 @@ uvm_unmap_detach(struct uvm_map_deadq *d > > while ((entry = TAILQ_FIRST(deadq)) != NULL) { > > if (waitok) > > uvm_pause(); > > - /* Drop reference to amap, if we've got one. */ > > - if (entry->aref.ar_amap) > > - amap_unref(entry->aref.ar_amap, > > - entry->aref.ar_pageoff, > > - atop(entry->end - entry->start), > > - flags & AMAP_REFALL); > > > > /* Drop reference to our backing object, if we've got one. */ > > if (UVM_ET_ISSUBMAP(entry)) { > > > > >
Re: nvme(4): fix prpl sync length
On Tue, Jun 01, 2021 at 08:24:10AM +1000, David Gwynne wrote: > > > > On 1 Jun 2021, at 04:17, Patrick Wildt wrote: > > > > Hi, > > > > this call to sync the DMA mem wants to sync N - 1 number of prpl > > entries, as the first segment is configured regularly, while the > > addresses for the following segments (if more than 2), are in a > > special DMA memory. > > > > The code currently removes a single byte, instead of an entry. > > This just means that it is syncing more than it should. > > nice. > > > ok? > > ok. ok by me too. > > > > > Patrick > > > > diff --git a/sys/dev/ic/nvme.c b/sys/dev/ic/nvme.c > > index 62b8e40c626..6db25260ef0 100644 > > --- a/sys/dev/ic/nvme.c > > +++ b/sys/dev/ic/nvme.c > > @@ -629,7 +629,7 @@ nvme_scsi_io(struct scsi_xfer *xs, int dir) > > bus_dmamap_sync(sc->sc_dmat, > > NVME_DMA_MAP(sc->sc_ccb_prpls), > > ccb->ccb_prpl_off, > > - sizeof(*ccb->ccb_prpl) * dmap->dm_nsegs - 1, > > + sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1), > > BUS_DMASYNC_PREWRITE); > > } > > > > @@ -691,7 +691,7 @@ nvme_scsi_io_done(struct nvme_softc *sc, struct > > nvme_ccb *ccb, > > bus_dmamap_sync(sc->sc_dmat, > > NVME_DMA_MAP(sc->sc_ccb_prpls), > > ccb->ccb_prpl_off, > > - sizeof(*ccb->ccb_prpl) * dmap->dm_nsegs - 1, > > + sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1), > > BUS_DMASYNC_POSTWRITE); > > } > > > > >
Re: mcx(4): sync only received length on RX
On Tue, Jun 01, 2021 at 08:20:43AM +1000, David Gwynne wrote: > > > > On 1 Jun 2021, at 04:15, Patrick Wildt wrote: > > > > Hi, > > > > mcx(4) seems to sync the whole mapsize on processing a received packet. > > As far as I know, we usually only sync the actual size that we have > > received. Noticed this when doing bounce buffer tests, seeing that > > it copied a lot more data than is necessary. > > > > That's because the RX buffer size is maximum supported MTU, which is > > about 9500 bytes or so. For small packets, or regular 1500 bytes, > > this adds overhead. > > > > This change should not change anything for ARM machines that have a > > cache coherent PCIe bus or x86. > > > > ok? > > ok. ok by me too. > > > > > Patrick > > > > diff --git a/sys/dev/pci/if_mcx.c b/sys/dev/pci/if_mcx.c > > index 38437e54897..065855d46d3 100644 > > --- a/sys/dev/pci/if_mcx.c > > +++ b/sys/dev/pci/if_mcx.c > > @@ -6800,20 +6800,20 @@ mcx_process_rx(struct mcx_softc *sc, struct mcx_rx > > *rx, > > { > > struct mcx_slot *ms; > > struct mbuf *m; > > - uint32_t flags; > > + uint32_t flags, len; > > int slot; > > > > + len = bemtoh32(&cqe->cq_byte_cnt); > > slot = betoh16(cqe->cq_wqe_count) % (1 << MCX_LOG_RQ_SIZE); > > > > ms = &rx->rx_slots[slot]; > > - bus_dmamap_sync(sc->sc_dmat, ms->ms_map, 0, ms->ms_map->dm_mapsize, > > - BUS_DMASYNC_POSTREAD); > > + bus_dmamap_sync(sc->sc_dmat, ms->ms_map, 0, len, BUS_DMASYNC_POSTREAD); > > bus_dmamap_unload(sc->sc_dmat, ms->ms_map); > > > > m = ms->ms_m; > > ms->ms_m = NULL; > > > > - m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt); > > + m->m_pkthdr.len = m->m_len = len; > > > > if (cqe->cq_rx_hash_type) { > > m->m_pkthdr.ph_flowid = betoh32(cqe->cq_rx_hash); > > >
Re: [External] : arp mbuf queue
On Sun, Apr 25, 2021 at 09:44:16AM +0200, Alexandr Nedvedicky wrote: > Hello, > > I think this should go in as-is. Though I have one question/idea > to share at the moment. > > > > @@ -672,20 +666,18 @@ arpcache(struct ifnet *ifp, struct ether > > > > la->la_asked = 0; > > la->la_refreshed = 0; > > - while ((len = ml_len(&la->la_ml)) != 0) { > > - struct mbuf *mh; > > + while ((m = mq_dequeue(&la->la_mq)) != NULL) { > > + unsigned int len; > > > > - mh = ml_dequeue(&la->la_ml); > > - la_hold_total--; > > + atomic_dec_int(&la_hold_total); > > + len = mq_len(&la->la_mq); > > > > - ifp->if_output(ifp, mh, rt_key(rt), rt); > > + ifp->if_output(ifp, m, rt_key(rt), rt); > > > > - if (ml_len(&la->la_ml) == len) { > > + /* XXXSMP we discard if other CPU enqueues */ > > + if (mq_len(&la->la_mq) > len) { > > /* mbuf is back in queue. Discard. */ > > - while ((mh = ml_dequeue(&la->la_ml)) != NULL) { > > - la_hold_total--; > > - m_freem(mh); > > - } > > + atomic_sub_int(&la_hold_total, mq_purge(&la->la_mq)); > > break; > > } > > would it make sense to have let's say > > mq_move2mlist(struct mbuf_queue *, struct mbuf_list *) This already exists, it's called mq_delist() > > This would allow us to move whole globally visible la->la_mq > into mbuf list, which will be a local variable. This way we won't > need to jump on la->la_mq's mutex with every loop iteration. > > If it makes sense, we can do it as a follow up change. We'd need some other way to do the 'mbuf is back in queue' detection, but I agree this seems like a sensible thing to do. > > > thanks and > regards > sashan >
Re: rge(4): move tx/rx descriptors into their own structs
On Thu, Mar 25, 2021 at 05:21:38PM +0800, Kevin Lo wrote: > Hi, > > The diff below moves tx/rx descriptors into their own structs. > This is a first step toward making rge work with multiple queues and > interrupts. > Only one queue is currently used. > > While here, update the RTL8125B microcode. I can't really comment on the magic numbers, but the struct reorganisation looks good to me, ok jmatthew@ > > Index: sys/dev/pci/if_rge.c > === > RCS file: /cvs/src/sys/dev/pci/if_rge.c,v > retrieving revision 1.12 > diff -u -p -u -p -r1.12 if_rge.c > --- sys/dev/pci/if_rge.c 11 Feb 2021 16:22:06 - 1.12 > +++ sys/dev/pci/if_rge.c 25 Mar 2021 09:14:17 - > @@ -61,7 +61,7 @@ int rge_match(struct device *, void *, > void rge_attach(struct device *, struct device *, void *); > int rge_activate(struct device *, int); > int rge_intr(void *); > -int rge_encap(struct rge_softc *, struct mbuf *, int); > +int rge_encap(struct rge_queues *, struct mbuf *, int); > int rge_ioctl(struct ifnet *, u_long, caddr_t); > void rge_start(struct ifqueue *); > void rge_watchdog(struct ifnet *); > @@ -70,13 +70,13 @@ void rge_stop(struct ifnet *); > int rge_ifmedia_upd(struct ifnet *); > void rge_ifmedia_sts(struct ifnet *, struct ifmediareq *); > int rge_allocmem(struct rge_softc *); > -int rge_newbuf(struct rge_softc *); > -void rge_discard_rxbuf(struct rge_softc *, int); > -void rge_rx_list_init(struct rge_softc *); > -void rge_tx_list_init(struct rge_softc *); > -void rge_fill_rx_ring(struct rge_softc *); > -int rge_rxeof(struct rge_softc *); > -int rge_txeof(struct rge_softc *); > +int rge_newbuf(struct rge_queues *); > +void rge_discard_rxbuf(struct rge_queues *, int); > +void rge_rx_list_init(struct rge_queues *); > +void rge_tx_list_init(struct rge_queues *); > +void rge_fill_rx_ring(struct rge_queues *); > +int rge_rxeof(struct rge_queues *); > +int rge_txeof(struct rge_queues *); > void rge_reset(struct rge_softc *); > void rge_iff(struct rge_softc *); > void rge_set_phy_power(struct 
rge_softc *, int); > @@ -159,6 +159,7 @@ rge_attach(struct device *parent, struct > pci_intr_handle_t ih; > const char *intrstr = NULL; > struct ifnet *ifp; > + struct rge_queues *q; > pcireg_t reg; > uint32_t hwrev; > uint8_t eaddr[ETHER_ADDR_LEN]; > @@ -184,6 +185,17 @@ rge_attach(struct device *parent, struct > } > } > > + q = malloc(sizeof(struct rge_queues), M_DEVBUF, M_NOWAIT | M_ZERO); > + if (q == NULL) { > + printf(": unable to allocate queue memory\n"); > + return; > + } > + q->q_sc = sc; > + q->q_index = 0; > + > + sc->sc_queues = q; > + sc->sc_nqueues = 1; > + > /* >* Allocate interrupt. >*/ > @@ -323,9 +335,10 @@ int > rge_intr(void *arg) > { > struct rge_softc *sc = arg; > + struct rge_queues *q = sc->sc_queues; > struct ifnet *ifp = &sc->sc_arpcom.ac_if; > uint32_t status; > - int claimed = 0, rx, tx; > + int claimed = 0, rv; > > if (!(ifp->if_flags & IFF_RUNNING)) > return (0); > @@ -345,29 +358,21 @@ rge_intr(void *arg) > if (status & RGE_ISR_PCS_TIMEOUT) > claimed = 1; > > - rx = tx = 0; > + rv = 0; > if (status & sc->rge_intrs) { > - if (status & > - (sc->rge_rx_ack | RGE_ISR_RX_ERR | RGE_ISR_RX_FIFO_OFLOW)) { > - rx |= rge_rxeof(sc); > - claimed = 1; > - } > - > - if (status & (sc->rge_tx_ack | RGE_ISR_TX_ERR)) { > - tx |= rge_txeof(sc); > - claimed = 1; > - } > + rv |= rge_rxeof(q); > + rv |= rge_txeof(q); > > if (status & RGE_ISR_SYSTEM_ERR) { > KERNEL_LOCK(); > rge_init(ifp); > KERNEL_UNLOCK(); > - claimed = 1; > } > + claimed = 1; > } > > if (sc->rge_timerintr) { > - if ((tx | rx) == 0) { > + if (!rv) { > /* >* Nothing needs to be processed, fallback >* to use TX/RX interrupts. > @@ -379,11 +384,11 @@ rge_intr(void *arg) >* race introduced by changing interrupt >* masks. >*/ > - rge_rxeof(sc); > - rge_txeof(sc); > + rge_rxeof(q); > + rge_txeof(q); > } else > RGE_WRI
Re: btrace: add dry run mode
On Fri, Mar 19, 2021 at 08:24:12AM -0600, Todd C. Miller wrote: > On Fri, 19 Mar 2021 13:22:35 +0100, Klemens Nanni wrote: > > > I argue it should be `-n' like all the daemons, e.g. vmd(8) and other > > parsers such as pfctl(8) do. > > Yes, please. I was about to make the same point. Fair enough. I started with -d because that's what bpftrace has, but changing to -n for consistency makes sense to me. Index: btrace.8 === RCS file: /cvs/src/usr.sbin/btrace/btrace.8,v retrieving revision 1.2 diff -u -p -u -p -r1.2 btrace.8 --- btrace.811 Sep 2020 08:16:15 - 1.2 +++ btrace.820 Mar 2021 04:38:55 - @@ -46,6 +46,9 @@ Execute .Ar program . .It Fl l List all available probes. +.It Fl n +No action. +Parse the program and then exit. .It Fl p Ar pid Enable tracing on the indicated process ID (only one .Fl p Index: btrace.c === RCS file: /cvs/src/usr.sbin/btrace/btrace.c,v retrieving revision 1.29 diff -u -p -u -p -r1.29 btrace.c --- btrace.c8 Feb 2021 09:46:45 - 1.29 +++ btrace.c20 Mar 2021 04:38:56 - @@ -124,7 +124,7 @@ main(int argc, char *argv[]) int fd = -1, ch, error = 0; const char *filename = NULL, *btscript = NULL; const char *errstr; - int showprobes = 0, tracepid = -1; + int showprobes = 0, tracepid = -1, noaction = 0; setlocale(LC_ALL, ""); @@ -133,7 +133,7 @@ main(int argc, char *argv[]) err(1, "pledge"); #endif - while ((ch = getopt(argc, argv, "e:lp:v")) != -1) { + while ((ch = getopt(argc, argv, "e:lnp:v")) != -1) { switch (ch) { case 'e': btscript = optarg; @@ -141,6 +141,9 @@ main(int argc, char *argv[]) case 'l': showprobes = 1; break; + case 'n': + noaction = 1; + break; case 'p': if (tracepid != -1) usage(); @@ -178,6 +181,9 @@ main(int argc, char *argv[]) return error; } + if (noaction) + return error; + if (showprobes || g_nprobes > 0) { fd = open(__PATH_DEVDT, O_RDONLY); if (fd == -1) @@ -201,7 +207,7 @@ main(int argc, char *argv[]) __dead void usage(void) { - fprintf(stderr, "usage: %s [-lv] [-p pid] [-e program|file]\n", + fprintf(stderr, "usage: %s 
[-lnv] [-p pid] [-e program | file]\n", getprogname()); exit(1); }
btrace: add dry run mode
I'd like to add some regress tests for the btrace(8) parser. To do that, it would help to have a dry-run mode where it just parses the file and exits. The way I've implemented it here, it exits immediately after parsing, so it won't open /dev/dt and try to find the probes the program uses. This means it doesn't require privileges and can be run on kernels without dt(4) compiled in, but it can't catch typos in probe names. ok? Index: btrace.8 === RCS file: /cvs/src/usr.sbin/btrace/btrace.8,v retrieving revision 1.2 diff -u -p -u -p -r1.2 btrace.8 --- btrace.811 Sep 2020 08:16:15 - 1.2 +++ btrace.819 Mar 2021 11:22:41 - @@ -41,6 +41,9 @@ in .Pp The options are as follows: .Bl -tag -width Ds +.It Fl d +Dry run. +Parse the program and then exit. .It Fl e Ar program Execute .Ar program . Index: btrace.c === RCS file: /cvs/src/usr.sbin/btrace/btrace.c,v retrieving revision 1.26 diff -u -p -u -p -r1.26 btrace.c --- btrace.c7 Dec 2020 18:28:09 - 1.26 +++ btrace.c19 Mar 2021 11:22:41 - @@ -123,7 +123,7 @@ main(int argc, char *argv[]) int fd = -1, ch, error = 0; const char *filename = NULL, *btscript = NULL; const char *errstr; - int showprobes = 0, tracepid = -1; + int showprobes = 0, tracepid = -1, dryrun = 0; setlocale(LC_ALL, ""); @@ -132,8 +132,11 @@ main(int argc, char *argv[]) err(1, "pledge"); #endif - while ((ch = getopt(argc, argv, "e:lp:v")) != -1) { + while ((ch = getopt(argc, argv, "de:lp:v")) != -1) { switch (ch) { + case 'd': + dryrun = 1; + break; case 'e': btscript = optarg; break; @@ -177,6 +180,9 @@ main(int argc, char *argv[]) return error; } + if (dryrun) + return error; + if (showprobes || g_nprobes > 0) { fd = open(__PATH_DEVDT, O_RDONLY); if (fd == -1) @@ -200,7 +206,7 @@ main(int argc, char *argv[]) __dead void usage(void) { - fprintf(stderr, "usage: %s [-lv] [-p pid] [-e program|file]\n", + fprintf(stderr, "usage: %s [-dlv] [-p pid] [-e program|file]\n", getprogname()); exit(1); }
relayd check script memory explosion
It's fairly easy to accidentally configure relayd to try to run check scripts faster than they finish, for example if you have a check interval of one second and the check script makes a tcp connection to a host that doesn't exist any more. In this situation, the hce process will keep writing messages to its imsg buffer to the parent process asking it to run checks, which causes its memory usage to grow without bounds. If the check script starts working again (or if you change it to just 'exit 0') the parent works its way through the backlog and memory usage goes back to normal, but ideally relayd would avoid doing this to itself. If we don't clear the F_CHECK_SENT and F_CHECK_DONE flags in hce_launch_checks(), check_script() can use them to figure out if the last check request it sent for the host has finished yet, so it can avoid building up a backlog of work for the parent. The ICMP and script check implementations clear these flags as they start checks, and the TCP check code doesn't use them at all, so this shouldn't affect anything else. ok? 
Index: check_script.c === RCS file: /cvs/src/usr.sbin/relayd/check_script.c,v retrieving revision 1.21 diff -u -p -u -p -r1.21 check_script.c --- check_script.c 28 May 2017 10:39:15 - 1.21 +++ check_script.c 15 Feb 2021 01:28:54 - @@ -38,6 +38,9 @@ check_script(struct relayd *env, struct struct ctl_scriptscr; struct table*table; + if ((host->flags & (F_CHECK_SENT|F_CHECK_DONE)) == F_CHECK_SENT) + return; + if ((table = table_find(env, host->conf.tableid)) == NULL) fatalx("%s: invalid table id", __func__); @@ -52,7 +55,9 @@ check_script(struct relayd *env, struct fatalx("invalid script path"); memcpy(&scr.timeout, &table->conf.timeout, sizeof(scr.timeout)); - proc_compose(env->sc_ps, PROC_PARENT, IMSG_SCRIPT, &scr, sizeof(scr)); + if (proc_compose(env->sc_ps, PROC_PARENT, IMSG_SCRIPT, &scr, + sizeof(scr)) == 0) + host->flags |= F_CHECK_SENT; } void Index: hce.c === RCS file: /cvs/src/usr.sbin/relayd/hce.c,v retrieving revision 1.79 diff -u -p -u -p -r1.79 hce.c --- hce.c 6 Aug 2018 17:31:31 - 1.79 +++ hce.c 15 Feb 2021 01:28:54 - @@ -139,7 +139,6 @@ hce_launch_checks(int fd, short event, v TAILQ_FOREACH(host, &table->hosts, entry) { if ((host->flags & F_CHECK_DONE) == 0) host->he = HCE_INTERVAL_TIMEOUT; - host->flags &= ~(F_CHECK_SENT|F_CHECK_DONE); if (event_initialized(&host->cte.ev)) { event_del(&host->cte.ev); close(host->cte.s);
Re: Uninitialized var in dev/pv/vmt.c
On Thu, Feb 11, 2021 at 11:41:24AM +, Ricardo Mestre wrote: > Hi, > > Uninitialized var and it's used in a condition != NULL a little bit > afterwards. > CID 1501713 > > OK? yes, ok jmatthew@ > > Index: vmt.c > === > RCS file: /cvs/src/sys/dev/pv/vmt.c,v > retrieving revision 1.22 > diff -u -p -u -r1.22 vmt.c > --- vmt.c 15 Jan 2021 06:14:41 - 1.22 > +++ vmt.c 11 Feb 2021 11:35:41 - > @@ -1289,7 +1289,7 @@ vmt_xdr_nic_info(char *data) > struct ifnet *iface; > struct vm_nicinfo_nic_list nl; > size_t total, nictotal; > - char *listdata; > + char *listdata = NULL; > int nics; > > NET_ASSERT_LOCKED();
Re: sleep_setup/finish simplification
On Fri, Jan 08, 2021 at 12:59:16PM -0600, Scott Cheloha wrote: > On Mon, Dec 28, 2020 at 11:41:52AM -0300, Martin Pieuchot wrote: > > On 08/12/20(Tue) 10:06, Martin Pieuchot wrote: > > > Diff below aims to simplify the API to put a thread on a sleep queue and > > > reduce it to the following: > > > > > > sleep_setup(); > > > /* check condition or release lock */ > > > sleep_finish(); > > > > > > It is motivated by my work to sleep the SCHED_LOCK() but might as well > > > prevent/fix some bugs. > > > > > > The tricky part of the current implementation is that sleep_setup_signal() > > > can already park/stop the current thread resulting in a context change. > > > Should any custom accounting / lock check happen before that? At least > > > two lock primitives do so currently: drm's schedule_timeout() and > > > rwlock's rw_enter(). > > > > > > As a result of this diff various states can be removed and sleep_finish() > > > contains the following magic: > > > > > > 1. check for signal/parking > > > 2. context switch or remove from sleep queue > > > 3. check for signal/parking > > > > > > Note that sleep_finish() could be simplified even further but I left > > > that for later to ease the review. > > > > > > Comments? Oks? > > > > Anyone? > > I really like this simplification. > > It also makes my forthcoming kclock changes to tsleep_nsec(9)/etc. > simpler, so it's doubly good for me. > > I was hoping someone would step forward and OK this but nobody did, at > least not publicly. > > I see claudio@ is trying to break off a piece of this for commit in a > different thread. Unsure if that means this is dead or just being cut > up and merged piecemeal. > > FWIW, ok cheloha@. Obviously you need more OKs. > > Even if this is dead, some other simplification in this vein would be > nice. I agree; I read through this, tried to puzzle my way through what would happen if sleep_setup_signal() slept and figured it was OK. 
I don't think I've written any code using this API so my opinion doesn't count for much, but I've always found it more complicated than I'd like, and particularly hard to determine if it's being used correctly, so simplifying it definitely sounds good to me.
Re: remove vmt(4) (superseded by open-vm-tools package)
On Fri, Jan 08, 2021 at 10:34:02PM +0100, Klemens Nanni wrote: > The report on bugs shows vmt(4) lagging behind and I sent a working > open-vm-tools port to ports@ yesterday. > > In case the port gets imported and there are no further regressions wrt. > the functionality vmt(4) already provides, here's a tentative diff to > remove the driver entirely. > > Not asking for OKs at this point because the port needs testing and > I have only tested with it anyway, but vmt(4) supports i386 as well. > > Thoughts? The reason I work on vmt(4) is so I don't have to run open-vm-tools, so I don't want to see it removed in favour of open-vm-tools.
btrace: fix parsing of profile:hz:
Anton's fix for parsing of syscall names that are also tokens in the btrace grammar broke parsing of 'profile:hz:number', because it forces 'hz' to be handled as a string rather than a token. I can't see how we'd ever end up with a syscall named 'hz', so one way we could fix this would be to exclude the HZ token from the lexer backdoor. ok? Index: bt_parse.y === RCS file: /cvs/src/usr.sbin/btrace/bt_parse.y,v retrieving revision 1.20 diff -u -p -r1.20 bt_parse.y --- bt_parse.y 11 Dec 2020 07:27:55 - 1.20 +++ bt_parse.y 8 Jan 2021 21:37:53 - @@ -792,10 +792,14 @@ again: /* * Probe lexer backdoor, interpret the token as a string * rather than a keyword. Otherwise, reserved keywords -* would conflict with syscall names. +* would conflict with syscall names. The exception to +* this is 'hz', which hopefully will never be a +* syscall. */ - yylval.v.string = kwp->word; - return STRING; + if (kwp->token != HZ) { + yylval.v.string = kwp->word; + return STRING; + } } yylval.v.i = kwp->type; return kwp->token;
convert i386 fix_f00f() uvm_km_zalloc
I don't have a real 586, but I can tell qemu to pretend to be one, which at least executes this code. Using kd_waitok here seems suspect, because if we're out of memory this early I can't see anything else freeing any up, but uvm_km_zalloc() will also sleep rather than return failure. Should this use kd_nowait and panic if the allocation fails instead? ok? Index: arch/i386/i386/machdep.c === RCS file: /cvs/src/sys/arch/i386/i386/machdep.c,v retrieving revision 1.642 diff -u -p -u -p -r1.642 machdep.c --- arch/i386/i386/machdep.c28 Dec 2020 14:02:07 - 1.642 +++ arch/i386/i386/machdep.c3 Jan 2021 23:01:34 - @@ -3100,7 +3100,7 @@ fix_f00f(void) void *p; /* Allocate two new pages */ - va = uvm_km_zalloc(kernel_map, NBPG*2); + va = (vaddr_t)km_alloc(NBPG*2, &kv_any, &kp_zero, &kd_waitok); p = (void *)(va + NBPG - 7*sizeof(*idt)); /* Copy over old IDT */
convert vga POST uvm_km_vallocs
This code is now only here for some unfortunate Intel graphics chips based on PowerVR, and I don't have a machine with one of those. vga_post_init() gets called from vga_attach() in any case, and vga_post_free() doesn't seem to be called at all. I've booted this on amd64 (real) and i386 (virtualized) with no problems. ok? diff --git sys/arch/amd64/pci/vga_post.c sys/arch/amd64/pci/vga_post.c index 32876649ddd..36596490d35 100644 --- sys/arch/amd64/pci/vga_post.c +++ sys/arch/amd64/pci/vga_post.c @@ -125,13 +125,15 @@ vga_post_init(int bus, int device, int function) vaddr_t sys_image, sys_bios_data; int err; - sys_bios_data = uvm_km_valloc(kernel_map, PAGE_SIZE); + sys_bios_data = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, + &kd_nowait); if (sys_bios_data == 0) return NULL; - sys_image = uvm_km_valloc(kernel_map, 1024 * 1024); + sys_image = (vaddr_t)km_alloc(1024 * 1024, &kv_any, &kp_none, + &kd_nowait); if (sys_image == 0) { - uvm_km_free(kernel_map, sys_bios_data, PAGE_SIZE); + km_free((void *)sys_bios_data, PAGE_SIZE, &kv_any, &kp_none); return NULL; } sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); @@ -140,7 +142,7 @@ vga_post_init(int bus, int device, int function) err = uvm_pglistalloc(BASE_MEMORY, 0, (paddr_t)-1, 0, 0, &sc->ram_backing, BASE_MEMORY/PAGE_SIZE, UVM_PLA_WAITOK); if (err) { - uvm_km_free(kernel_map, sc->sys_image, 1024 * 1024); + km_free((void *)sc->sys_image, 1024 * 1024, &kv_any, &kp_none); free(sc, M_DEVBUF, sizeof(*sc)); return NULL; } @@ -152,7 +154,7 @@ vga_post_init(int bus, int device, int function) pmap_update(pmap_kernel()); memcpy((void *)sc->bios_data, (void *)sys_bios_data, PAGE_SIZE); pmap_kremove(sys_bios_data, PAGE_SIZE); - uvm_km_free(kernel_map, sys_bios_data, PAGE_SIZE); + km_free((void *)sys_bios_data, PAGE_SIZE, &kv_any, &kp_none); iter = 0; TAILQ_FOREACH(pg, &sc->ram_backing, pageq) { @@ -209,7 +211,7 @@ vga_post_free(struct vga_post *sc) { uvm_pglistfree(&sc->ram_backing); pmap_kremove(sc->sys_image, 1024 * 1024); 
- uvm_km_free(kernel_map, sc->sys_image, 1024 * 1024); + km_free((void *)sc->sys_image, 1024 * 1024, &kv_any, &kp_none); pmap_update(pmap_kernel()); free(sc, M_DEVBUF, sizeof(*sc)); } diff --git sys/arch/i386/pci/vga_post.c sys/arch/i386/pci/vga_post.c index c85ee05dcdb..2464fd6019c 100644 --- sys/arch/i386/pci/vga_post.c +++ sys/arch/i386/pci/vga_post.c @@ -126,13 +126,15 @@ vga_post_init(int bus, int device, int function) vaddr_t sys_image, sys_bios_data; int err; - sys_bios_data = uvm_km_valloc(kernel_map, PAGE_SIZE); + sys_bios_data = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, + &kd_nowait); if (sys_bios_data == 0) return NULL; - sys_image = uvm_km_valloc(kernel_map, 1024 * 1024); + sys_image = (vaddr_t)km_alloc(1024 * 1024, &kv_any, &kp_none, + &kd_nowait); if (sys_image == 0) { - uvm_km_free(kernel_map, sys_bios_data, PAGE_SIZE); + km_free((void *)sys_bios_data, PAGE_SIZE, &kv_any, &kp_none); return NULL; } sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); @@ -141,7 +143,7 @@ vga_post_init(int bus, int device, int function) err = uvm_pglistalloc(BASE_MEMORY, 0, (paddr_t)-1, 0, 0, &sc->ram_backing, BASE_MEMORY/PAGE_SIZE, UVM_PLA_WAITOK); if (err) { - uvm_km_free(kernel_map, sc->sys_image, 1024 * 1024); + km_free((void *)sc->sys_image, 1024 * 1024, &kv_any, &kp_none); free(sc, M_DEVBUF, sizeof *sc); return NULL; } @@ -153,7 +155,7 @@ vga_post_init(int bus, int device, int function) pmap_update(pmap_kernel()); memcpy((void *)sc->bios_data, (void *)sys_bios_data, PAGE_SIZE); pmap_kremove(sys_bios_data, PAGE_SIZE); - uvm_km_free(kernel_map, sys_bios_data, PAGE_SIZE); + km_free((void *)sys_bios_data, PAGE_SIZE, &kv_any, &kp_none); iter = 0; TAILQ_FOREACH(pg, &sc->ram_backing, pageq) { @@ -211,7 +213,7 @@ vga_post_free(struct vga_post *sc) uvm_pglistfree(&sc->ram_backing); pmap_kremove(sc->sys_image, 1024 * 1024); - uvm_km_free(kernel_map, sc->sys_image, 1024 * 1024); + km_free((void *)sc->sys_image, 1024 * 1024, &kv_any, &kp_none);
pmap_update(pmap_kernel()); free(sc, M_DEVBUF, sizeof *sc); }
sparc64 cpu uvm_km_valloc()
Continuing to convert uvm_km_valloc() calls to km_alloc(), sparc64's struct cpu_info wants to be allocated on an 8 page boundary, so it needs a custom kmem_va_mode. My T5120 didn't blow up with this, so I think it works. ok? Index: arch/sparc64/sparc64/cpu.c === RCS file: /cvs/src/sys/arch/sparc64/sparc64/cpu.c,v retrieving revision 1.71 diff -u -p -u -p -r1.71 cpu.c --- arch/sparc64/sparc64/cpu.c 31 Jul 2020 11:19:12 - 1.71 +++ arch/sparc64/sparc64/cpu.c 21 Dec 2020 05:12:32 - @@ -113,6 +113,12 @@ void hummingbird_init(struct cpu_info *c #defineIU_IMPL(v) u_int64_t)(v))&VER_IMPL) >> VER_IMPL_SHIFT) #defineIU_VERS(v) u_int64_t)(v))&VER_MASK) >> VER_MASK_SHIFT) +/* virtual address allocation mode for struct cpu_info */ +struct kmem_va_mode kv_cpu_info = { + .kv_map = &kernel_map, + .kv_align = 8 * PAGE_SIZE +}; + struct cpu_info * alloc_cpuinfo(struct mainbus_attach_args *ma) { @@ -137,7 +143,7 @@ alloc_cpuinfo(struct mainbus_attach_args if (cpi->ci_upaid == portid) return cpi; - va = uvm_km_valloc_align(kernel_map, sz, 8 * PAGE_SIZE, 0); + va = (vaddr_t)km_alloc(sz, &kv_cpu_info, &kp_none, &kd_nowait); if (va == 0) panic("alloc_cpuinfo: no virtual space"); va0 = va;
mpbios: replace uvm_km_valloc() with km_alloc()
A few more km_alloc()s following the same pattern as acpi. I don't have any machines that actually need mpbios(4) but I've booted amd64 and i386 smp qemu vms with acpi disabled, which causes mpbios to attach instead. ok? Index: arch/amd64/amd64/mpbios.c === RCS file: /cvs/src/sys/arch/amd64/amd64/mpbios.c,v retrieving revision 1.29 diff -u -p -u -p -r1.29 mpbios.c --- arch/amd64/amd64/mpbios.c 7 Feb 2018 06:19:54 - 1.29 +++ arch/amd64/amd64/mpbios.c 19 Dec 2020 09:26:33 - @@ -240,7 +240,8 @@ mpbios_map(paddr_t pa, int len, struct m { paddr_t pgpa = trunc_page(pa); paddr_t endpa = round_page(pa + len); - vaddr_t va = uvm_km_valloc(kernel_map, endpa - pgpa); + vaddr_t va = (vaddr_t)km_alloc(endpa - pgpa, &kv_any, &kp_none, + &kd_nowait); vaddr_t retva = va + (pa & PGOFSET); handle->pa = pa; @@ -262,7 +263,7 @@ void mpbios_unmap(struct mp_map *handle) { pmap_kremove(handle->baseva, handle->vsize); - uvm_km_free(kernel_map, handle->baseva, handle->vsize); + km_free((void *)handle->baseva, handle->vsize, &kv_any, &kp_none); } /* Index: arch/i386/i386/mpbios.c === RCS file: /cvs/src/sys/arch/i386/i386/mpbios.c,v retrieving revision 1.41 diff -u -p -u -p -r1.41 mpbios.c --- arch/i386/i386/mpbios.c 7 Feb 2018 06:19:54 - 1.41 +++ arch/i386/i386/mpbios.c 19 Dec 2020 09:26:33 - @@ -253,7 +253,8 @@ mpbios_map(paddr_t pa, int len, struct m { paddr_t pgpa = trunc_page(pa); paddr_t endpa = round_page(pa + len); - vaddr_t va = uvm_km_valloc(kernel_map, endpa - pgpa); + vaddr_t va = (vaddr_t)km_alloc(endpa - pgpa, &kv_any, &kp_none, + &kd_nowait); vaddr_t retva = va + (pa & PGOFSET); handle->pa = pa; @@ -275,7 +276,7 @@ void mpbios_unmap(struct mp_map *handle) { pmap_kremove(handle->baseva, handle->vsize); - uvm_km_free(kernel_map, handle->baseva, handle->vsize); + km_free((void *)handle->baseva, handle->vsize, &kv_any, &kp_none); } /*
converting uvm_km_valloc to km_alloc
On Wed, Dec 16, 2020 at 12:00:38AM +0100, Mark Kettenis wrote: > > Date: Tue, 15 Dec 2020 21:21:37 +0100 > > From: Alexander Bluhm > > > > On Tue, Dec 15, 2020 at 06:57:03PM +0100, Mark Kettenis wrote: > > > Does the diff below fix this? > > > > I can reproduce the panic and your diff fixes it. > > > > Usually my regress machines do not trigger it as I do not install > > firmware. fw_update and reboot makes it crash. > > > > bluhm > > Thanks. This is committed now. However, there may be other case > where we use uvm_km_valloc() early on that will trip over the kernel > lock assertion that mpi@ added in uvm_km_pgremove(). Ideally we > should get rid of all the uvm_km_free() calls in the kernel. Here are a couple of relatively easy ones, applying changes from r1.86 of amd64's acpi_machdep.c to i386 and arm64. I've tested i386 but it turns out I don't have any arm64 machines with acpi. Index: arch/arm64/arm64/acpi_machdep.c === RCS file: /cvs/src/sys/arch/arm64/arm64/acpi_machdep.c,v retrieving revision 1.10 diff -u -p -u -p -r1.10 acpi_machdep.c --- arch/arm64/arm64/acpi_machdep.c 6 Dec 2020 21:19:55 - 1.10 +++ arch/arm64/arm64/acpi_machdep.c 18 Dec 2020 00:23:01 - @@ -74,7 +74,8 @@ acpi_map(paddr_t pa, size_t len, struct { paddr_t pgpa = trunc_page(pa); paddr_t endpa = round_page(pa + len); - vaddr_t va = uvm_km_valloc(kernel_map, endpa - pgpa); + vaddr_t va = (vaddr_t)km_alloc(endpa - pgpa, &kv_any, &kp_none, + &kd_nowait); if (va == 0) return (ENOMEM); @@ -97,7 +98,7 @@ void acpi_unmap(struct acpi_mem_map *handle) { pmap_kremove(handle->baseva, handle->vsize); - uvm_km_free(kernel_map, handle->baseva, handle->vsize); + km_free((void *)handle->baseva, handle->vsize, &kv_any, &kp_none); } int Index: arch/i386/i386/acpi_machdep.c === RCS file: /cvs/src/sys/arch/i386/i386/acpi_machdep.c,v retrieving revision 1.74 diff -u -p -u -p -r1.74 acpi_machdep.c --- arch/i386/i386/acpi_machdep.c 21 Jul 2020 03:48:06 - 1.74 +++ arch/i386/i386/acpi_machdep.c 18 Dec 2020 00:23:01 - 
@@ -117,7 +117,8 @@ acpi_map(paddr_t pa, size_t len, struct { paddr_t pgpa = trunc_page(pa); paddr_t endpa = round_page(pa + len); - vaddr_t va = uvm_km_valloc(kernel_map, endpa - pgpa); + vaddr_t va = (vaddr_t)km_alloc(endpa - pgpa, &kv_any, &kp_none, + &kd_nowait); if (va == 0) return (ENOMEM); @@ -140,7 +141,7 @@ void acpi_unmap(struct acpi_mem_map *handle) { pmap_kremove(handle->baseva, handle->vsize); - uvm_km_free(kernel_map, handle->baseva, handle->vsize); + km_free((void *)handle->baseva, handle->vsize, &kv_any, &kp_none); } int
Re: uvm_fault: entering swap code
On Sat, Dec 12, 2020 at 10:54:57PM +1000, Jonathan Matthew wrote: > On Thu, Dec 10, 2020 at 10:46:58AM -0300, Martin Pieuchot wrote: > > On 08/12/20(Tue) 22:55, Jonathan Matthew wrote: > > > On Mon, Dec 07, 2020 at 03:15:50PM -0300, Martin Pieuchot wrote: > > > > Getting a page from the fault handler might require poking at some > > > > swap-related states. > > > > > > > > These are not in the hot-path of the fault handler so for the moment > > > > just assert that the KERNEL_LOCK() is held or grab it if the function > > > > might be called from an future unlocked path. > > > > > > > > ok? > > > > > > Could you add 'K' to the list of locks in the comment above struct uvmexp > > > too? > > > > Updated diff below. > > > > > I went looking for other uses of swpgonly and saw that it's used under > > > uvm_map_teardown -> uvm_unmap_kill_entry -> uvm_km_pgremove, > > > and uvm_map_teardown ensures that the kernel lock is not held. > > > Not related to this diff exactly, but is this something we need to fix? > > > > I suppose that the problem can only occur if a kernel thread is exiting > > since this code is only executed for the kernel pmap. Anyway I added an > > assertion. > > Right, and as I understand it, kernel threads all share the proc0 vm space, > so its reference count won't ever reach 0, so the kernel map portions of > uvm_unmap_kill_entry() can't be reached from the reaper. Looks like this is > all safe, it just requires a bit more reading than I did the first time. > I'll see if I can find a way to make it more clear. 
And now that I've tested this out and checked that it doesn't blow up when you drive the machine into swap, ok jmatthew@ > > > > > Index: uvm/uvm_km.c > > === > > RCS file: /cvs/src/sys/uvm/uvm_km.c,v > > retrieving revision 1.137 > > diff -u -p -r1.137 uvm_km.c > > --- uvm/uvm_km.c23 May 2020 06:15:09 - 1.137 > > +++ uvm/uvm_km.c10 Dec 2020 13:33:49 - > > @@ -243,6 +243,7 @@ uvm_km_pgremove(struct uvm_object *uobj, > > voff_t curoff; > > int slot; > > > > + KERNEL_ASSERT_LOCKED(); > > KASSERT(uobj->pgops == &aobj_pager); > > > > for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) { > > Index: uvm/uvm_swap.c > > === > > RCS file: /cvs/src/sys/uvm/uvm_swap.c,v > > retrieving revision 1.147 > > diff -u -p -r1.147 uvm_swap.c > > --- uvm/uvm_swap.c 29 Sep 2020 11:47:41 - 1.147 > > +++ uvm/uvm_swap.c 10 Dec 2020 13:30:30 - > > @@ -1403,7 +1403,7 @@ uvm_swap_alloc(int *nslots, boolean_t le > > /* > > * lock data lock, convert slots into blocks, and enter loop > > */ > > - > > + KERNEL_ASSERT_LOCKED(); > > ReTry: /* XXXMRG */ > > LIST_FOREACH(spp, &swap_priority, spi_swappri) { > > TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { > > @@ -1449,8 +1449,10 @@ uvm_swapisfull(void) > > { > > int result; > > > > + KERNEL_LOCK(); > > KASSERT(uvmexp.swpgonly <= uvmexp.swpages); > > result = (uvmexp.swpgonly == uvmexp.swpages); > > + KERNEL_UNLOCK(); > > > > return result; > > } > > @@ -1465,6 +1467,7 @@ uvm_swap_markbad(int startslot, int nslo > > { > > struct swapdev *sdp; > > > > + KERNEL_LOCK(); > > sdp = swapdrum_getsdp(startslot); > > if (sdp != NULL) { > > /* > > @@ -1475,6 +1478,7 @@ uvm_swap_markbad(int startslot, int nslo > > */ > > sdp->swd_npgbad += nslots; > > } > > + KERNEL_UNLOCK(); > > } > > > > /* > > @@ -1501,7 +1505,7 @@ uvm_swap_free(int startslot, int nslots) > > * in the extent, and return. must hold pri lock to do > > * lookup and access the extent. 
> > */ > > - > > + KERNEL_LOCK(); > > sdp = swapdrum_getsdp(startslot); > > KASSERT(uvmexp.nswapdev >= 1); > > KASSERT(sdp != NULL); > > @@ -1533,6 +1537,7 @@ uvm_swap_free(int startslot, int nslots) > > } > > } > > #endif /* UVM_SWAP_ENCRYPT */ > > + KERNEL_UNLOCK(); > > } > > > > /* > > @@ -1567,6 +1572,7 @@ uvm_swap_get(struct vm_page *page, int s > > return VM_PAGER
Re: uvm_fault: entering swap code
On Thu, Dec 10, 2020 at 10:46:58AM -0300, Martin Pieuchot wrote: > On 08/12/20(Tue) 22:55, Jonathan Matthew wrote: > > On Mon, Dec 07, 2020 at 03:15:50PM -0300, Martin Pieuchot wrote: > > > Getting a page from the fault handler might require poking at some > > > swap-related states. > > > > > > These are not in the hot-path of the fault handler so for the moment > > > just assert that the KERNEL_LOCK() is held or grab it if the function > > > might be called from an future unlocked path. > > > > > > ok? > > > > Could you add 'K' to the list of locks in the comment above struct uvmexp > > too? > > Updated diff below. > > > I went looking for other uses of swpgonly and saw that it's used under > > uvm_map_teardown -> uvm_unmap_kill_entry -> uvm_km_pgremove, > > and uvm_map_teardown ensures that the kernel lock is not held. > > Not related to this diff exactly, but is this something we need to fix? > > I suppose that the problem can only occur if a kernel thread is exiting > since this code is only executed for the kernel pmap. Anyway I added an > assertion. Right, and as I understand it, kernel threads all share the proc0 vm space, so its reference count won't ever reach 0, so the kernel map portions of uvm_unmap_kill_entry() can't be reached from the reaper. Looks like this is all safe, it just requires a bit more reading than I did the first time. I'll see if I can find a way to make it more clear. 
> > Index: uvm/uvm_km.c > === > RCS file: /cvs/src/sys/uvm/uvm_km.c,v > retrieving revision 1.137 > diff -u -p -r1.137 uvm_km.c > --- uvm/uvm_km.c 23 May 2020 06:15:09 - 1.137 > +++ uvm/uvm_km.c 10 Dec 2020 13:33:49 - > @@ -243,6 +243,7 @@ uvm_km_pgremove(struct uvm_object *uobj, > voff_t curoff; > int slot; > > + KERNEL_ASSERT_LOCKED(); > KASSERT(uobj->pgops == &aobj_pager); > > for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) { > Index: uvm/uvm_swap.c > === > RCS file: /cvs/src/sys/uvm/uvm_swap.c,v > retrieving revision 1.147 > diff -u -p -r1.147 uvm_swap.c > --- uvm/uvm_swap.c29 Sep 2020 11:47:41 - 1.147 > +++ uvm/uvm_swap.c10 Dec 2020 13:30:30 - > @@ -1403,7 +1403,7 @@ uvm_swap_alloc(int *nslots, boolean_t le > /* >* lock data lock, convert slots into blocks, and enter loop >*/ > - > + KERNEL_ASSERT_LOCKED(); > ReTry: /* XXXMRG */ > LIST_FOREACH(spp, &swap_priority, spi_swappri) { > TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { > @@ -1449,8 +1449,10 @@ uvm_swapisfull(void) > { > int result; > > + KERNEL_LOCK(); > KASSERT(uvmexp.swpgonly <= uvmexp.swpages); > result = (uvmexp.swpgonly == uvmexp.swpages); > + KERNEL_UNLOCK(); > > return result; > } > @@ -1465,6 +1467,7 @@ uvm_swap_markbad(int startslot, int nslo > { > struct swapdev *sdp; > > + KERNEL_LOCK(); > sdp = swapdrum_getsdp(startslot); > if (sdp != NULL) { > /* > @@ -1475,6 +1478,7 @@ uvm_swap_markbad(int startslot, int nslo >*/ > sdp->swd_npgbad += nslots; > } > + KERNEL_UNLOCK(); > } > > /* > @@ -1501,7 +1505,7 @@ uvm_swap_free(int startslot, int nslots) >* in the extent, and return. must hold pri lock to do >* lookup and access the extent. 
>*/ > - > + KERNEL_LOCK(); > sdp = swapdrum_getsdp(startslot); > KASSERT(uvmexp.nswapdev >= 1); > KASSERT(sdp != NULL); > @@ -1533,6 +1537,7 @@ uvm_swap_free(int startslot, int nslots) > } > } > #endif /* UVM_SWAP_ENCRYPT */ > + KERNEL_UNLOCK(); > } > > /* > @@ -1567,6 +1572,7 @@ uvm_swap_get(struct vm_page *page, int s > return VM_PAGER_ERROR; > } > > + KERNEL_LOCK(); > /* this page is (about to be) no longer only in swap. */ > uvmexp.swpgonly--; > > @@ -1577,7 +1583,7 @@ uvm_swap_get(struct vm_page *page, int s > /* oops, the read failed so it really is still only in swap. */ > uvmexp.swpgonly++; > } > - > + KERNEL_UNLOCK(); > return (result); > } > > @@ -1599,6 +1605,8 @@ uvm_swap_io(struct vm_page **pps, int st > struct swapdev *sdp; > int encrypt = 0; > #endif > + > + KERNEL_ASSERT_LOCKED(); > > write = (flags &a
Re: uvm_fault: entering swap code
On Mon, Dec 07, 2020 at 03:15:50PM -0300, Martin Pieuchot wrote: > Getting a page from the fault handler might require poking at some > swap-related states. > > These are not in the hot-path of the fault handler so for the moment > just assert that the KERNEL_LOCK() is held or grab it if the function > might be called from an future unlocked path. > > ok? Could you add 'K' to the list of locks in the comment above struct uvmexp too? I went looking for other uses of swpgonly and saw that it's used under uvm_map_teardown -> uvm_unmap_kill_entry -> uvm_km_pgremove, and uvm_map_teardown ensures that the kernel lock is not held. Not related to this diff exactly, but is this something we need to fix? > > Index: uvm/uvm_swap.c > === > RCS file: /cvs/src/sys/uvm/uvm_swap.c,v > retrieving revision 1.147 > diff -u -p -r1.147 uvm_swap.c > --- uvm/uvm_swap.c29 Sep 2020 11:47:41 - 1.147 > +++ uvm/uvm_swap.c7 Dec 2020 18:07:03 - > @@ -1403,7 +1403,7 @@ uvm_swap_alloc(int *nslots, boolean_t le > /* >* lock data lock, convert slots into blocks, and enter loop >*/ > - > + KERNEL_ASSERT_LOCKED(); > ReTry: /* XXXMRG */ > LIST_FOREACH(spp, &swap_priority, spi_swappri) { > TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { > @@ -1449,8 +1449,10 @@ uvm_swapisfull(void) > { > int result; > > + KERNEL_LOCK(); > KASSERT(uvmexp.swpgonly <= uvmexp.swpages); > result = (uvmexp.swpgonly == uvmexp.swpages); > + KERNEL_UNLOCK(); > > return result; > } > @@ -1465,6 +1467,7 @@ uvm_swap_markbad(int startslot, int nslo > { > struct swapdev *sdp; > > + KERNEL_LOCK(); > sdp = swapdrum_getsdp(startslot); > if (sdp != NULL) { > /* > @@ -1475,6 +1478,7 @@ uvm_swap_markbad(int startslot, int nslo >*/ > sdp->swd_npgbad += nslots; > } > + KERNEL_UNLOCK(); > } > > /* > @@ -1501,7 +1505,7 @@ uvm_swap_free(int startslot, int nslots) >* in the extent, and return. must hold pri lock to do >* lookup and access the extent. 
>*/ > - > + KERNEL_LOCK(); > sdp = swapdrum_getsdp(startslot); > KASSERT(uvmexp.nswapdev >= 1); > KASSERT(sdp != NULL); > @@ -1533,6 +1537,7 @@ uvm_swap_free(int startslot, int nslots) > } > } > #endif /* UVM_SWAP_ENCRYPT */ > + KERNEL_UNLOCK(); > } > > /* > @@ -1567,6 +1572,7 @@ uvm_swap_get(struct vm_page *page, int s > return VM_PAGER_ERROR; > } > > + KERNEL_LOCK(); > /* this page is (about to be) no longer only in swap. */ > uvmexp.swpgonly--; > > @@ -1577,7 +1583,7 @@ uvm_swap_get(struct vm_page *page, int s > /* oops, the read failed so it really is still only in swap. */ > uvmexp.swpgonly++; > } > - > + KERNEL_UNLOCK(); > return (result); > } > > @@ -1599,6 +1605,8 @@ uvm_swap_io(struct vm_page **pps, int st > struct swapdev *sdp; > int encrypt = 0; > #endif > + > + KERNEL_ASSERT_LOCKED(); > > write = (flags & B_READ) == 0; > async = (flags & B_ASYNC) != 0; > Index: uvm/uvmexp.h > === > RCS file: /cvs/src/sys/uvm/uvmexp.h,v > retrieving revision 1.6 > diff -u -p -r1.6 uvmexp.h > --- uvm/uvmexp.h 1 Dec 2020 13:56:22 - 1.6 > +++ uvm/uvmexp.h 7 Dec 2020 18:09:06 - > @@ -79,9 +79,9 @@ struct uvmexp { > > /* swap */ > int nswapdev; /* number of configured swap devices in system */ > - int swpages;/* number of PAGE_SIZE'ed swap pages */ > + int swpages;/* [K] number of PAGE_SIZE'ed swap pages */ > int swpginuse; /* number of swap pages in use */ > - int swpgonly; /* number of swap pages in use, not also in RAM */ > + int swpgonly; /* [K] number of swap pages in use, not also in RAM */ > int nswget; /* number of swap pages moved from disk to RAM */ > int nanon; /* XXX number total of anon's in system */ > int unused05; /* formerly nanonneeded */ >
Re: uvm_fault: kill goto in uvm_fault()
On Mon, Dec 07, 2020 at 04:08:46PM -0300, Martin Pieuchot wrote: > Diff below rewrites uvm_fault() using a loop. > > I added a KERNEL_LOCK/UNLOCK() dance around the part that won't be > unlocked soon to illustrate where this is going. > > ok? yes, ok jmatthew@ > > Index: uvm/uvm_fault.c > === > RCS file: /cvs/src/sys/uvm/uvm_fault.c,v > retrieving revision 1.108 > diff -u -p -r1.108 uvm_fault.c > --- uvm/uvm_fault.c 19 Nov 2020 17:06:40 - 1.108 > +++ uvm/uvm_fault.c 7 Dec 2020 18:20:16 - > @@ -907,7 +907,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad > boolean_t shadowed; > struct vm_anon *anons_store[UVM_MAXRANGE], **anons; > struct vm_page *pages[UVM_MAXRANGE]; > - int error; > + int error = ERESTART; > > uvmexp.faults++;/* XXX: locking? */ > TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL); > @@ -923,43 +923,32 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad > flt.narrow = FALSE; /* normal fault */ > > > - /* "goto ReFault" means restart the page fault from ground zero. */ > -ReFault: > - anons = anons_store; > - > - error = uvm_fault_check(&ufi, &flt, &anons, access_type); > - switch (error) { > - case 0: > - break; > - case ERESTART: > - goto ReFault; > - default: > - return error; > - } > - > - /* (shadowed == TRUE) if there is an anon at the faulting address */ > - shadowed = uvm_fault_upper_lookup(&ufi, &flt, anons, pages); > - > - /* handle case 1: fault on an anon in our amap */ > - if (shadowed == TRUE) { > - error = uvm_fault_upper(&ufi, &flt, anons, fault_type, > - access_type); > - switch (error) { > - case ERESTART: > - goto ReFault; > - default: > - return error; > + /* > + * ReFault > + */ > + while (error == ERESTART) { > + anons = anons_store; > + > + error = uvm_fault_check(&ufi, &flt, &anons, access_type); > + if (error != 0) > + continue; > + > + /* True if there is an anon at the faulting address */ > + shadowed = uvm_fault_upper_lookup(&ufi, &flt, anons, pages); > + if (shadowed == TRUE) { > + /* case 1: fault on an anon in our amap 
*/ > + error = uvm_fault_upper(&ufi, &flt, anons, fault_type, > + access_type); > + } else { > + /* case 2: fault on backing object or zero fill */ > + KERNEL_LOCK(); > + error = uvm_fault_lower(&ufi, &flt, pages, fault_type, > + access_type); > + KERNEL_UNLOCK(); > } > } > > - /* handle case 2: faulting on backing object or zero fill */ > - error = uvm_fault_lower(&ufi, &flt, pages, fault_type, access_type); > - switch (error) { > - case ERESTART: > - goto ReFault; > - default: > - return error; > - } > + return error; > } > > int >
Re: Use SMR_TAILQ for `ps_threads'
On Fri, Dec 04, 2020 at 10:03:46AM -0300, Martin Pieuchot wrote: > On 04/12/20(Fri) 12:01, Jonathan Matthew wrote: > > On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote: > > > [...] > > > Could you try the diff below that only call smr_barrier() for multi- > > > threaded processes with threads still in the list. I guess this also > > > answers guenther@'s question. The same could be done with smr_flush(). > > > > This removes the overhead, more or less. Are we only looking at unlocking > > access > > to ps_threads from within a process (not the sysctl or ptrace stuff)? > > Otherwise > > this doesn't seem safe. > > I'd argue that if `ps_thread' is being iterated the CPU doing the > iteration must already have a reference to the "struct process" so > the serialization should be done on this reference. Sounds reasonable to me. > > Now I doubt we'll be able to answer all the questions right now. If we > can find a path forward that doesn't decrease performances too much and > allow us to move signal delivery and sleep out of the KERNEL_LOCK() > that's a huge win. I think we're at an acceptable performance hit now, so if this lets you progress with unlocking signal delivery, I'm happy.
Re: srp_finalize(9): tsleep(9) -> tsleep_nsec(9)
On Fri, Dec 04, 2020 at 12:17:31PM -0600, Scott Cheloha wrote: > On Fri, Dec 04, 2020 at 09:56:02AM +0100, Claudio Jeker wrote: > > On Thu, Dec 03, 2020 at 10:05:30PM -0600, Scott Cheloha wrote: > > > Hi, > > > > > > srp_finalize(9) uses tsleep(9) to spin while it waits for the object's > > > refcount to reach zero. It blocks for up to 1 tick and then checks > > > the refecount again and again. > > > > > > We can just as easily do this with tsleep_nsec(9) and block for 1 > > > millisecond per interval. > > > > > > ok? > > > > > > Index: kern_srp.c > > > === > > > RCS file: /cvs/src/sys/kern/kern_srp.c,v > > > retrieving revision 1.12 > > > diff -u -p -r1.12 kern_srp.c > > > --- kern_srp.c8 Sep 2017 05:36:53 - 1.12 > > > +++ kern_srp.c4 Dec 2020 04:04:39 - > > > @@ -274,7 +274,7 @@ void > > > srp_finalize(void *v, const char *wmesg) > > > { > > > while (srp_referenced(v)) > > > - tsleep(v, PWAIT, wmesg, 1); > > > + tsleep_nsec(v, PWAIT, wmesg, MSEC_TO_NSEC(1)); > > > } > > > > > > #else /* MULTIPROCESSOR */ > > > > > > > Why only 1ms instead of the original 10ms (at least on most archs)? > > The underlying implementation can only process timeouts from > hardclock(9) which runs about hz times per second. If we tell the > thread to "sleep for 10ms" it's almost always going to overshoot the > next hardclock(9) and wind up sleeping ~20ms. > > Some people run with HZ=1000 kernels. I don't think many people run > with kernels with a higher HZ than that, though. So I figure a 1ms > sleep is "good enough" for all practical kernels. On HZ=100 kernels > the thread will oversleep because it doesn't process timeouts often > enough to honor the 1ms request. > > Basically I'm trying to pick a reasonable polling interval (not too > fast) that also won't cause the existing default kernel to block for > longer than it already does (~10ms). The default kernel is HZ=100, so > a 1ms sleep will, in this case, almost always sleep ~10ms per > iteration of this loop. 
This sleep should basically be 'as short as possible', since it's waiting out SRP references, which are very short-lived. ok jmatthew@
Re: Use SMR_TAILQ for `ps_threads'
On Wed, Dec 02, 2020 at 07:44:02PM +0100, Anton Lindqvist wrote: > On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote: > > On 02/12/20(Wed) 17:27, Jonathan Matthew wrote: > > > On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote: > > > > On 01/12/20(Tue) 15:30, Claudio Jeker wrote: > > > > > [...] > > > > > Did you run a make build with that smr_barrier() in it and checked > > > > > that it > > > > > does not cause a slow down? I am sceptical, smr_barrier() is a very > > > > > slow > > > > > construct which introduces large delays and should be avoided whenever > > > > > possible. > > > > > > > > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff > > > > below, without noticeable difference. > > > > > > > > I'm happy to hear from sceptical performance checkers :o) > > > > > > On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build > > > time from > > > ~3m06s to ~3m44s, which seems a bit much to me. > > > > Do you know if this is due to an increase of %spin time? > > > > > Replacing smr_barrier() with smr_flush() reduces the overhead to a couple > > > of > > > seconds, and it seems warranted here. > > > > Could you try the diff below that only call smr_barrier() for multi- > > threaded processes with threads still in the list. I guess this also > > answers guenther@'s question. The same could be done with smr_flush(). > > I'm wondering if smr_grace_wait() could be improved on amd64, assuming > SMT is disabled, by skipping offline CPUs. This doesn't make much of a difference when using smr_barrier(), but with smr_flush() it removes much of the overhead on this system with 8 of 16 cpus online. Of course as Visa and Mark point out this is risky without more guarantees about what offline cpus are actually doing. If we start using SMR in ways that make the delay visible to user processes, it'll probably be worth looking at. 
> > Index: kern/kern_smr.c > === > RCS file: /cvs/src/sys/kern/kern_smr.c,v > retrieving revision 1.8 > diff -u -p -r1.8 kern_smr.c > --- kern/kern_smr.c 3 Apr 2020 03:36:56 - 1.8 > +++ kern/kern_smr.c 2 Dec 2020 18:41:29 - > @@ -142,7 +142,7 @@ smr_grace_wait(void) > > ci_start = curcpu(); > CPU_INFO_FOREACH(cii, ci) { > - if (ci == ci_start) > + if (ci == ci_start || !cpu_is_online(ci)) > continue; > sched_peg_curproc(ci); > } >
Re: Use SMR_TAILQ for `ps_threads'
On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote: > On 02/12/20(Wed) 17:27, Jonathan Matthew wrote: > > On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote: > > > On 01/12/20(Tue) 15:30, Claudio Jeker wrote: > > > > [...] > > > > Did you run a make build with that smr_barrier() in it and checked that > > > > it > > > > does not cause a slow down? I am sceptical, smr_barrier() is a very slow > > > > construct which introduces large delays and should be avoided whenever > > > > possible. > > > > > > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff > > > below, without noticeable difference. > > > > > > I'm happy to hear from sceptical performance checkers :o) > > > > On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build > > time from > > ~3m06s to ~3m44s, which seems a bit much to me. > > Do you know if this is due to an increase of %spin time? It actually decreased %spin, and the total system cpu time used during the build was decreased from around 6m30s to around 5m15, so I think it's mostly the effect of the delayed wakeup of the SMR thread in smr_dispatch(). There's also this: $ time sleep 1 0m01.11s real 0m00.00s user 0m00.00s system > > > Replacing smr_barrier() with smr_flush() reduces the overhead to a couple of > > seconds, and it seems warranted here. > > Could you try the diff below that only call smr_barrier() for multi- > threaded processes with threads still in the list. I guess this also > answers guenther@'s question. The same could be done with smr_flush(). This removes the overhead, more or less. Are we only looking at unlocking access to ps_threads from within a process (not the sysctl or ptrace stuff)? Otherwise this doesn't seem safe.
Re: Use SMR_TAILQ for `ps_threads'
On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote: > On 01/12/20(Tue) 15:30, Claudio Jeker wrote: > > [...] > > Did you run a make build with that smr_barrier() in it and checked that it > > does not cause a slow down? I am sceptical, smr_barrier() is a very slow > > construct which introduces large delays and should be avoided whenever > > possible. > > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff > below, without noticeable difference. > > I'm happy to hear from sceptical performance checkers :o) On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build time from ~3m06s to ~3m44s, which seems a bit much to me. Replacing smr_barrier() with smr_flush() reduces the overhead to a couple of seconds, and it seems warranted here. > > diff --git lib/libkvm/kvm_proc2.c lib/libkvm/kvm_proc2.c > index 96f7dc91b92..1f4f9b914bb 100644 > --- lib/libkvm/kvm_proc2.c > +++ lib/libkvm/kvm_proc2.c > @@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process > *pr, > kp.p_pctcpu = 0; > kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? 
SDEAD : > SIDL; > - for (p = TAILQ_FIRST(&process.ps_threads); p != NULL; > - p = TAILQ_NEXT(&proc, p_thr_link)) { > + for (p = SMR_TAILQ_FIRST_LOCKED(&process.ps_threads); > + p != NULL; > + p = SMR_TAILQ_NEXT_LOCKED(&proc, p_thr_link)) { > if (KREAD(kd, (u_long)p, &proc)) { > _kvm_err(kd, kd->program, > "can't read proc at %lx", > @@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process > *pr, > if (!dothreads) > continue; > > - for (p = TAILQ_FIRST(&process.ps_threads); p != NULL; > - p = TAILQ_NEXT(&proc, p_thr_link)) { > + for (p = SMR_TAILQ_FIRST_LOCKED(&process.ps_threads); p != NULL; > + p = SMR_TAILQ_NEXT_LOCKED(&proc, p_thr_link)) { > if (KREAD(kd, (u_long)p, &proc)) { > _kvm_err(kd, kd->program, > "can't read proc at %lx", > diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c > index 5e455208663..575273b306c 100644 > --- sys/kern/exec_elf.c > +++ sys/kern/exec_elf.c > @@ -85,6 +85,7 @@ > #include > #include > #include > +#include > > #include > > @@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void *iocookie, > size_t *sizep) >* threads in the process have been stopped and the list can't >* change. 
>*/ > - TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) { > + SMR_TAILQ_FOREACH_LOCKED(q, &pr->ps_threads, p_thr_link) { > if (q == p) /* we've taken care of this thread */ > continue; > error = coredump_note_elf(q, iocookie, ¬esize); > diff --git sys/kern/init_main.c sys/kern/init_main.c > index fed6be19435..2b657ffe328 100644 > --- sys/kern/init_main.c > +++ sys/kern/init_main.c > @@ -519,7 +519,7 @@ main(void *framep) >*/ > LIST_FOREACH(pr, &allprocess, ps_list) { > nanouptime(&pr->ps_start); > - TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) { > + SMR_TAILQ_FOREACH_LOCKED(p, &pr->ps_threads, p_thr_link) { > nanouptime(&p->p_cpu->ci_schedstate.spc_runtime); > timespecclear(&p->p_rtime); > } > diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c > index a20775419e3..3c526ab83b8 100644 > --- sys/kern/kern_exit.c > +++ sys/kern/kern_exit.c > @@ -63,6 +63,7 @@ > #ifdef SYSVSEM > #include > #endif > +#include > #include > > #include > @@ -161,7 +162,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags) > } > > /* unlink ourselves from the active threads */ > - TAILQ_REMOVE(&pr->ps_threads, p, p_thr_link); > + SMR_TAILQ_REMOVE_LOCKED(&pr->ps_threads, p, p_thr_link); > + smr_barrier(); > if ((p->p_flag & P_THREAD) == 0) { > /* main thread gotta wait because it has the pid, et al */ > while (pr->ps_refcnt > 1) > @@ -724,7 +726,7 @@ process_zap(struct process *pr) > if (pr->ps_ptstat != NULL) > free(pr->ps_ptstat, M_SUBPROC, sizeof(*pr->ps_ptstat)); > pool_put(&rusage_pool, pr->ps_ru); > - KASSERT(TAILQ_EMPTY(&pr->ps_threads)); > + KASSERT(SMR_TAILQ_EMPTY_LOCKED(&pr->ps_threads)); > lim_free(pr->ps_limit); > crfree(pr->ps_ucred); > pool_put(&process_pool, pr); > diff --git sys/kern/kern_fork.c sys/kern/kern_fork.c > index 9fb239bc8b4..e1cb587b2b8 100644 > --- sys/kern/kern_fork.c > +++ sys/kern/kern_fork.c > @@ -52,6 +52,7 @@ > #include > #include > #include > +#include > #include >
Re: Use SMR_TAILQ for `ps_threads'
On Tue, Dec 01, 2020 at 10:31:43AM +0100, Claudio Jeker wrote: > On Mon, Nov 30, 2020 at 07:10:47PM -0300, Martin Pieuchot wrote: > > Every multi-threaded process keeps a list of threads in `ps_threads'. > > This list is iterated in interrupt and process context which makes it > > complicated to protect it with a rwlock. > > > > One of the places where such iteration is done is inside the tsleep(9) > > routines, directly in single_thread_check() or via CURSIG(). In order > > to take this code path out of the KERNEL_LOCK(), claudio@ proposed to > > use SMR_TAILQ. This has the advantage of not introducing lock > > dependencies and allow us to address every iteration one-by-one. > > > > Diff below is a first step into this direction, it replaces the existing > > TAILQ_* macros by the locked version of SMR_TAILQ*. This is mostly lifted > > from claudio@'s diff and should not introduce any side effect. > > > > ok? > > > > diff --git sys/uvm/uvm_glue.c sys/uvm/uvm_glue.c > > index 390307c4c81..40a10e4c1c5 100644 > > --- sys/uvm/uvm_glue.c > > +++ sys/uvm/uvm_glue.c > > @@ -369,7 +369,7 @@ uvm_swapout_threads(void) > > * the smallest p_slptime > > */ > > slpp = NULL; > > - TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) { > > + SMR_TAILQ_FOREACH_LOCKED(p, &pr->ps_threads, p_thr_link) { > > switch (p->p_stat) { > > case SRUN: > > case SONPROC: > > > > Why did you not include the smr_call() to safely free struct proc in this > diff? I was wondering about this too. Freeing the struct proc is already delayed by some amount since it happens in the reaper or in the parent process, does it make sense to combine that with the SMR wait?
Re: ldapd warning
On Sat, Nov 28, 2020 at 11:20:30PM +0100, Theo Buehler wrote: > /usr/src/usr.sbin/ldapd/util.c:46:21: warning: comparison of integers of > different signs: > 'int' and 'size_t' (aka 'unsigned long') [-Wsign-compare] > if (ret < 0 || ret >= size) >~~~ ^ > > This has been around for a while. I forgot that I had this patch in my > tree. 'size' was cast to int before r1.11 of util.c, I'm not sure why the cast was removed. smtpd also has a copy of this function that still has the cast. > > Index: util.c > === > RCS file: /cvs/src/usr.sbin/ldapd/util.c,v > retrieving revision 1.12 > diff -u -p -r1.12 util.c > --- util.c24 Oct 2019 12:39:26 - 1.12 > +++ util.c4 Aug 2020 07:14:33 - > @@ -43,7 +43,7 @@ bsnprintf(char *str, size_t size, const > va_start(ap, format); > ret = vsnprintf(str, size, format, ap); > va_end(ap); > - if (ret < 0 || ret >= size) > + if (ret < 0 || (size_t)ret >= size) > return 0; > > return 1; >
Re: Locking of uvm_pageclean()
On Wed, Nov 18, 2020 at 08:31:23PM -0300, Martin Pieuchot wrote: > I found another race related to some missing locking, this time around > uvm_pageclean(). > > Diff below fixes the two places in /sys/uvm where the page queue lock > should be taken. To prevent further corruption I added some assertions > and documented some global data structures that are currently protected > by this lock. > > Note that uvm_pagefree() is called by many pmaps most of the time > without the lock held. The diff below doesn't fix them and that's why > some assertions are commented out. > > ok? It looks like there are a couple of other paths to uvm_pagefree() that don't take the page queue lock - uvm_km_pgremove_intrface() and (on non pmap direct archs) uvm_km_doputpage(). Since that doesn't really affect the diff, and everything else is right as far as I can tell, ok jmatthew@ > > Index: uvm/uvm.h > === > RCS file: /cvs/src/sys/uvm/uvm.h,v > retrieving revision 1.67 > diff -u -p -r1.67 uvm.h > --- uvm/uvm.h 6 Dec 2019 08:33:25 - 1.67 > +++ uvm/uvm.h 18 Nov 2020 23:22:15 - > @@ -44,18 +44,20 @@ > /* > * uvm structure (vm global state: collected in one structure for ease > * of reference...) > + * > + * Locks used to protect struct members in this file: > + * Q uvm.pageqlock > */ > - > struct uvm { > /* vm_page related parameters */ > > /* vm_page queues */ > - struct pglist page_active; /* allocated pages, in use */ > - struct pglist page_inactive_swp;/* pages inactive (reclaim or free) */ > - struct pglist page_inactive_obj;/* pages inactive (reclaim or free) */ > + struct pglist page_active; /* [Q] allocated pages, in use */ > + struct pglist page_inactive_swp;/* [Q] pages inactive (reclaim/free) */ > + struct pglist page_inactive_obj;/* [Q] pages inactive (reclaim/free) */ > /* Lock order: pageqlock, then fpageqlock. 
*/ > - struct mutex pageqlock; /* lock for active/inactive page q */ > - struct mutex fpageqlock;/* lock for free page q + pdaemon */ > + struct mutex pageqlock; /* [] lock for active/inactive page q */ > + struct mutex fpageqlock;/* [] lock for free page q + pdaemon */ > boolean_t page_init_done; /* TRUE if uvm_page_init() finished */ > struct uvm_pmr_control pmr_control; /* pmemrange data */ > > Index: uvm/uvm_anon.c > === > RCS file: /cvs/src/sys/uvm/uvm_anon.c,v > retrieving revision 1.49 > diff -u -p -r1.49 uvm_anon.c > --- uvm/uvm_anon.c4 Jan 2020 16:17:29 - 1.49 > +++ uvm/uvm_anon.c18 Nov 2020 23:22:15 - > @@ -106,7 +106,9 @@ uvm_anfree_list(struct vm_anon *anon, st >* clean page, and put on on pglist >* for later freeing. >*/ > + uvm_lock_pageq(); > uvm_pageclean(pg); > + uvm_unlock_pageq(); > TAILQ_INSERT_HEAD(pgl, pg, pageq); > } else { > uvm_lock_pageq(); /* lock out pagedaemon */ > Index: uvm/uvm_object.c > === > RCS file: /cvs/src/sys/uvm/uvm_object.c,v > retrieving revision 1.17 > diff -u -p -r1.17 uvm_object.c > --- uvm/uvm_object.c 21 Oct 2020 09:08:14 - 1.17 > +++ uvm/uvm_object.c 18 Nov 2020 23:22:15 - > @@ -172,7 +172,9 @@ uvm_objfree(struct uvm_object *uobj) >* this pg from the uobj we are throwing away >*/ > atomic_clearbits_int(&pg->pg_flags, PG_TABLED); > + uvm_lock_pageq(); > uvm_pageclean(pg); > + uvm_unlock_pageq(); > TAILQ_INSERT_TAIL(&pgl, pg, pageq); > } > uvm_pmr_freepageq(&pgl); > Index: uvm/uvm_page.c > === > RCS file: /cvs/src/sys/uvm/uvm_page.c,v > retrieving revision 1.150 > diff -u -p -r1.150 uvm_page.c > --- uvm/uvm_page.c22 Sep 2020 14:31:08 - 1.150 > +++ uvm/uvm_page.c18 Nov 2020 23:22:15 - > @@ -973,6 +973,10 @@ uvm_pageclean(struct vm_page *pg) > { > u_int flags_to_clear = 0; > > +#if all_pmap_are_fixed > + MUTEX_ASSERT_LOCKED(&uvm.pageqlock); > +#endif > + > #ifdef DEBUG > if (pg->uobject == (void *)0xdeadbeef && > pg->uanon == (void *)0xdeadbeef) { > @@ -1037,6 +1041,10 @@ uvm_pageclean(struct vm_page *pg) > void > 
uvm_pagefree(struct vm_page *pg) > { > +#if all_pmap_are_fixed > + MUTEX_ASSERT_LOCKED(&uvm.pageqlock); > +#endif > + > uvm_pageclean(pg); > uvm_pmr_freepages(pg, 1); > } > @@ -1229,6 +1237,8 @@ uvm_pagelookup(struct uvm_object *obj, v > void > uvm_pagewire(struct vm_page *pg) > { > + MUTEX_ASSERT_LOCKED(&uvm.pageqlock); > + >
Re: uvm_fault: refactoring for case 2 faults
On Tue, Nov 17, 2020 at 09:25:10AM -0300, Martin Pieuchot wrote: > Here's another refactoring that moves the remaining logic of uvm_fault() > handling lower faults, case 2, to its own function. This logic shouldn't > be modified in the first step of unlocking amap & anon and will still be > executed under KERNEL_LOCK(). Having a separate function will however > help to turn the 'ReFault' goto into a more readable loop. This will be > the next step. > > ok? ok jmatthew@ > > Index: uvm/uvm_fault.c > === > RCS file: /cvs/src/sys/uvm/uvm_fault.c,v > retrieving revision 1.107 > diff -u -p -r1.107 uvm_fault.c > --- uvm/uvm_fault.c 16 Nov 2020 12:30:16 - 1.107 > +++ uvm/uvm_fault.c 16 Nov 2020 13:27:32 - > @@ -484,6 +484,9 @@ struct uvm_faultctx { > paddr_t pa_flags; > }; > > +int uvm_fault_lower(struct uvm_faultinfo *, struct uvm_faultctx *, > + struct vm_page **, vm_fault_t, vm_prot_t); > + > /* > * uvm_fault_check: check prot, handle needs-copy, etc. > * > @@ -901,19 +904,11 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad > { > struct uvm_faultinfo ufi; > struct uvm_faultctx flt; > - boolean_t promote, locked, shadowed; > - int result, lcv, gotpages; > - vaddr_t currva; > - voff_t uoff; > - struct vm_amap *amap; > - struct uvm_object *uobj; > - struct vm_anon *anons_store[UVM_MAXRANGE], **anons, *anon; > - struct vm_page *pages[UVM_MAXRANGE], *pg, *uobjpage; > + boolean_t shadowed; > + struct vm_anon *anons_store[UVM_MAXRANGE], **anons; > + struct vm_page *pages[UVM_MAXRANGE]; > int error; > > - anon = NULL; > - pg = NULL; > - > uvmexp.faults++;/* XXX: locking? 
*/ > TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL); > > @@ -957,8 +952,28 @@ ReFault: > } > } > > - amap = ufi.entry->aref.ar_amap; > - uobj = ufi.entry->object.uvm_obj; > + /* handle case 2: faulting on backing object or zero fill */ > + error = uvm_fault_lower(&ufi, &flt, pages, fault_type, access_type); > + switch (error) { > + case ERESTART: > + goto ReFault; > + default: > + return error; > + } > +} > + > +int > +uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, > + struct vm_page **pages, vm_fault_t fault_type, vm_prot_t access_type) > +{ > + struct vm_amap *amap = ufi->entry->aref.ar_amap; > + struct uvm_object *uobj = ufi->entry->object.uvm_obj; > + boolean_t promote, locked; > + int result, lcv, gotpages; > + struct vm_page *uobjpage, *pg = NULL; > + struct vm_anon *anon = NULL; > + vaddr_t currva; > + voff_t uoff; > > /* >* if the desired page is not shadowed by the amap and we have a > @@ -967,15 +982,15 @@ ReFault: >* with the usual pgo_get hook). the backing object signals this by >* providing a pgo_fault routine. >*/ > - if (uobj && shadowed == FALSE && uobj->pgops->pgo_fault != NULL) { > - result = uobj->pgops->pgo_fault(&ufi, flt.startva, pages, > - flt.npages, flt.centeridx, fault_type, access_type, > + if (uobj != NULL && uobj->pgops->pgo_fault != NULL) { > + result = uobj->pgops->pgo_fault(ufi, flt->startva, pages, > + flt->npages, flt->centeridx, fault_type, access_type, > PGO_LOCKED); > > if (result == VM_PAGER_OK) > return (0); /* pgo_fault did pmap enter */ > else if (result == VM_PAGER_REFAULT) > - goto ReFault; /* try again! */ > + return ERESTART;/* try again! 
*/ > else > return (EACCES); > } > @@ -989,20 +1004,20 @@ ReFault: >* >* ("get" has the option of doing a pmap_enter for us) >*/ > - if (uobj && shadowed == FALSE) { > + if (uobj != NULL) { > uvmexp.fltlget++; > - gotpages = flt.npages; > - (void) uobj->pgops->pgo_get(uobj, ufi.entry->offset + > - (flt.startva - ufi.entry->start), > - pages, &gotpages, flt.centeridx, > - access_type & MASK(ufi.entry), > - ufi.entry->advice, PGO_LOCKED); > + gotpages = flt->npages; > + (void) uobj->pgops->pgo_get(uobj, ufi->entry->offset + > + (flt->startva - ufi->entry->start), > + pages, &gotpages, flt->centeridx, > + access_type & MASK(ufi->entry), > + ufi->entry->advice, PGO_LOCKED); > > /* check for pages to map, if we got any */ > uobjpage = NULL; > if (gotpages) { > - currva = flt.star
Re: uvm_fault: Kill goto Case2
On Fri, Nov 13, 2020 at 12:04:23PM -0300, Martin Pieuchot wrote: > Another simple refactoring of uvm_fault() removing a goto, ok? I like it, ok jmatthew@ > > Index: uvm/uvm_fault.c > === > RCS file: /cvs/src/sys/uvm/uvm_fault.c,v > retrieving revision 1.106 > diff -u -p -r1.106 uvm_fault.c > --- uvm/uvm_fault.c 13 Nov 2020 14:18:25 - 1.106 > +++ uvm/uvm_fault.c 13 Nov 2020 15:01:41 - > @@ -942,12 +942,24 @@ ReFault: > return error; > } > > - amap = ufi.entry->aref.ar_amap; > - uobj = ufi.entry->object.uvm_obj; > - > /* (shadowed == TRUE) if there is an anon at the faulting address */ > shadowed = uvm_fault_upper_lookup(&ufi, &flt, anons, pages); > > + /* handle case 1: fault on an anon in our amap */ > + if (shadowed == TRUE) { > + error = uvm_fault_upper(&ufi, &flt, anons, fault_type, > + access_type); > + switch (error) { > + case ERESTART: > + goto ReFault; > + default: > + return error; > + } > + } > + > + amap = ufi.entry->aref.ar_amap; > + uobj = ufi.entry->object.uvm_obj; > + > /* >* if the desired page is not shadowed by the amap and we have a >* backing object, then we check to see if the backing object would > @@ -1055,30 +1067,12 @@ ReFault: > /* >* note that at this point we are done with any front or back pages. >* we are now going to focus on the center page (i.e. the one we've > - * faulted on). if we have faulted on the top (anon) layer > - * [i.e. case 1], then the anon we want is anons[centeridx] (we have > - * not touched it yet). if we have faulted on the bottom (uobj) > + * faulted on). if we have faulted on the bottom (uobj) >* layer [i.e. case 2] and the page was both present and available, >* then we've got a pointer to it as "uobjpage" and we've already >* made it BUSY. >*/ > - /* > - * there are four possible cases we must address: 1A, 1B, 2A, and 2B > - */ > - /* redirect case 2: if we are not shadowed, go to case 2. 
*/ > - if (shadowed == FALSE) > - goto Case2; > - > - /* handle case 1: fault on an anon in our amap */ > - error = uvm_fault_upper(&ufi, &flt, anons, fault_type, access_type); > - switch (error) { > - case ERESTART: > - goto ReFault; > - default: > - return error; > - } > > -Case2: > /* handle case 2: faulting on backing object or zero fill */ > /* >* note that uobjpage can not be PGO_DONTCARE at this point. we now >
Re: uvm_fault: is there an anon?
On Fri, Nov 13, 2020 at 12:17:04PM +0100, Theo Buehler wrote: > On Wed, Nov 04, 2020 at 11:04:12AM -0300, Martin Pieuchot wrote: > > Diff below introduces a helper that looks for existing mapping. The > > value returned by this lookup function determines if there's an anon > > at the faulting address which tells us if we're dealing with a fault > > of type 1 or 2. > > > > This small refactoring is part of the current work to separate the code > > handling faults of type 1 and 2. The end goal being to move the type 1 > > faults handling out of the KERNEL_LOCK(). > > > > The function name is taken from NetBSD to not introduce more difference > > than there's already. > > > > ok? > > ok tb. > > I've been running the three diffs for two days and this went through two > 'make release' > Same here. NetBSD's uvm fault handler is a lot more readable than ours, so heading in that direction seems like a pretty good idea, ok jmatthew@
Re: amap: introduce amap_adjref_anons()
On Fri, Oct 30, 2020 at 08:46:20PM +0100, Martin Pieuchot wrote: > On 23/10/20(Fri) 10:31, Martin Pieuchot wrote: > > More refactoring. This time let's introduce a helper to manipulate > > references. The goal is to reduce the upcoming diff adding locking. > > > > This is extracted from a bigger diff from guenther@ as well as some > > bits from NetBSD. > > Now with the correct diff, ok? This looks good to me (and survived a couple of full builds on amd64), ok jmatthew@ > > Index: uvm/uvm_amap.c > === > RCS file: /cvs/src/sys/uvm/uvm_amap.c,v > retrieving revision 1.85 > diff -u -p -r1.85 uvm_amap.c > --- uvm/uvm_amap.c12 Oct 2020 08:44:45 - 1.85 > +++ uvm/uvm_amap.c23 Oct 2020 08:23:59 - > @@ -68,7 +68,23 @@ static inline void amap_list_remove(stru > > struct vm_amap_chunk *amap_chunk_get(struct vm_amap *, int, int, int); > void amap_chunk_free(struct vm_amap *, struct vm_amap_chunk *); > -void amap_wiperange_chunk(struct vm_amap *, struct vm_amap_chunk *, int, > int); > + > +/* > + * if we enable PPREF, then we have a couple of extra functions that > + * we need to prototype here... > + */ > + > +#ifdef UVM_AMAP_PPREF > + > +#define PPREF_NONE ((int *) -1) /* not using ppref */ > + > +void amap_pp_adjref(struct vm_amap *, int, vsize_t, int); > +void amap_pp_establish(struct vm_amap *); > +void amap_wiperange_chunk(struct vm_amap *, struct vm_amap_chunk *, int, > + int); > +void amap_wiperange(struct vm_amap *, int, int); > + > +#endif /* UVM_AMAP_PPREF */ > > static inline void > amap_list_insert(struct vm_amap *amap) > @@ -1153,6 +1169,32 @@ amap_unadd(struct vm_aref *aref, vaddr_t > } > > /* > + * amap_adjref_anons: adjust the reference count(s) on amap and its anons. 
> + */ > +static void > +amap_adjref_anons(struct vm_amap *amap, vaddr_t offset, vsize_t len, > +int refv, boolean_t all) > +{ > +#ifdef UVM_AMAP_PPREF > + if (amap->am_ppref == NULL && !all && len != amap->am_nslot) { > + amap_pp_establish(amap); > + } > +#endif > + > + amap->am_ref += refv; > + > +#ifdef UVM_AMAP_PPREF > + if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { > + if (all) { > + amap_pp_adjref(amap, 0, amap->am_nslot, refv); > + } else { > + amap_pp_adjref(amap, offset, len, refv); > + } > + } > +#endif > +} > + > +/* > * amap_ref: gain a reference to an amap > * > * => "offset" and "len" are in units of pages > @@ -1162,51 +1204,36 @@ void > amap_ref(struct vm_amap *amap, vaddr_t offset, vsize_t len, int flags) > { > > - amap->am_ref++; > if (flags & AMAP_SHARED) > amap->am_flags |= AMAP_SHARED; > -#ifdef UVM_AMAP_PPREF > - if (amap->am_ppref == NULL && (flags & AMAP_REFALL) == 0 && > - len != amap->am_nslot) > - amap_pp_establish(amap); > - if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { > - if (flags & AMAP_REFALL) > - amap_pp_adjref(amap, 0, amap->am_nslot, 1); > - else > - amap_pp_adjref(amap, offset, len, 1); > - } > -#endif > + amap_adjref_anons(amap, offset, len, 1, (flags & AMAP_REFALL) != 0); > } > > /* > * amap_unref: remove a reference to an amap > * > - * => caller must remove all pmap-level references to this amap before > - * dropping the reference > - * => called from uvm_unmap_detach [only] ... note that entry is no > - * longer part of a map > + * => All pmap-level references to this amap must be already removed. > + * => Called from uvm_unmap_detach(); entry is already removed from the map. > */ > void > amap_unref(struct vm_amap *amap, vaddr_t offset, vsize_t len, boolean_t all) > { > + KASSERT(amap->am_ref > 0); > > - /* if we are the last reference, free the amap and return. 
*/ > - if (amap->am_ref-- == 1) { > - amap_wipeout(amap); /* drops final ref and frees */ > + if (amap->am_ref == 1) { > + /* > + * If the last reference - wipeout and destroy the amap. > + */ > + amap->am_ref--; > + amap_wipeout(amap); > return; > } > > - /* otherwise just drop the reference count(s) */ > - if (amap->am_ref == 1 && (amap->am_flags & AMAP_SHARED) != 0) > - amap->am_flags &= ~AMAP_SHARED; /* clear shared flag */ > -#ifdef UVM_AMAP_PPREF > - if (amap->am_ppref == NULL && all == 0 && len != amap->am_nslot) > - amap_pp_establish(amap); > - if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { > - if (all) > - amap_pp_adjref(amap, 0, amap->am_nslot, -1); > - else > - amap_pp_adjref(amap, offset, len, -1); > + /* > + * Otherwise, drop the reference count(s) on anons. > + */ > + if (amap->am_ref
Re: Document art locking fields
On Wed, Nov 11, 2020 at 05:25:25AM -0300, Martin Pieuchot wrote: > While discussing the new source address mechanism with denis@, I figured > those ought to be documented. > > Note that `ar_rtableid' is unused and can die. The ART code is actually > free from any network knowledge. > > ok? ok jmatthew@ > > Index: net/art.c > === > RCS file: /cvs/src/sys/net/art.c,v > retrieving revision 1.28 > diff -u -p -r1.28 art.c > --- net/art.c 31 Mar 2019 19:29:27 - 1.28 > +++ net/art.c 9 Nov 2020 19:52:48 - > @@ -115,7 +115,6 @@ art_alloc(unsigned int rtableid, unsigne > } > > ar->ar_off = off; > - ar->ar_rtableid = rtableid; > rw_init(&ar->ar_lock, "art"); > > return (ar); > Index: net/art.h > === > RCS file: /cvs/src/sys/net/art.h,v > retrieving revision 1.19 > diff -u -p -r1.19 art.h > --- net/art.h 29 Oct 2020 21:15:27 - 1.19 > +++ net/art.h 9 Nov 2020 19:52:42 - > @@ -27,16 +27,22 @@ > > /* > * Root of the ART tables, equivalent to the radix head. > + * > + * Locks used to protect struct members in this file: > + * I immutable after creation > + * l root's `ar_lock' > + * K kernel lock > + * For SRP related structures that allow lock-free reads, the write lock > + * is indicated below. 
> */ > struct art_root { > - struct srp ar_root; /* First table */ > - struct rwlockar_lock; /* Serialise modifications */ > - uint8_t ar_bits[ART_MAXLVL]; /* Per level stride */ > - uint8_t ar_nlvl; /* Number of levels */ > - uint8_t ar_alen; /* Address length in bits */ > - uint8_t ar_off;/* Offset of the key in bytes */ > - unsigned int ar_rtableid; /* ID of this routing table */ > - struct sockaddr *source;/* optional src addr to use */ > + struct srp ar_root; /* [l] First table */ > + struct rwlockar_lock; /* [] Serialise modifications */ > + uint8_t ar_bits[ART_MAXLVL]; /* [I] Per level stride */ > + uint8_t ar_nlvl; /* [I] Number of levels */ > + uint8_t ar_alen; /* [I] Address length in bits */ > + uint8_t ar_off;/* [I] Offset of key in bytes */ > + struct sockaddr *source;/* [K] optional src addr to use > */ > }; > > #define ISLEAF(e)(((unsigned long)(e) & 1) == 0) >
ospf6d: use ROUTE_FLAGFILTER
Like ospfd, ospf6d can use ROUTE_FLAGFILTER to opt out of receiving messages relating to L2 and broadcast routes on its routing socket. We've been running this for a week or so with no problems. ok? Index: kroute.c === RCS file: /cvs/src/usr.sbin/ospf6d/kroute.c,v retrieving revision 1.64 diff -u -p -u -p -r1.64 kroute.c --- kroute.c17 May 2020 18:29:25 - 1.64 +++ kroute.c18 Aug 2020 11:56:09 - @@ -102,6 +102,7 @@ kr_init(int fs, u_int rdomain, int redis int opt = 0, rcvbuf, default_rcvbuf; socklen_t optlen; int filter_prio = fib_prio; + int filter_flags = RTF_LLINFO | RTF_BROADCAST; kr_state.fib_sync = fs; kr_state.rdomain = rdomain; @@ -127,6 +128,12 @@ kr_init(int fs, u_int rdomain, int redis if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_PRIOFILTER, &filter_prio, sizeof(filter_prio)) == -1) { log_warn("%s: setsockopt AF_ROUTE ROUTE_PRIOFILTER", __func__); + /* not fatal */ + } + + if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_FLAGFILTER, &filter_flags, + sizeof(filter_flags)) == -1) { + log_warn("%s: setsockopt AF_ROUTE ROUTE_FLAGFILTER", __func__); /* not fatal */ }
Re: ldapd(8): fix, simplify UUID timestamp code
On Wed, Aug 19, 2020 at 09:28:41PM -0500, Scott Cheloha wrote: > Hi, > > I was auditing the tree for odd-looking time structure usage and I > came across the UUID code in ldapd(8), uuid.c. > > time_cmp() is backwards. Or the caller is misusing it. One or the > other. It returns -1 if tv1 exceeds tv2 but the comments in the > caller indicate the opposite impression. I don't think this code has > ever worked as intended. > > It would be a lot easier if we just threw the code out and used random > UUIDs. After reading over the RFC it seems to me that time-based > UUIDs are collision-prone. Their implementation is also complicated. > Purely random UUIDs should effectively never collide and are trivial > to implement. RFC 4530, defining the entryUUID attribute, says this: UUID are to be generated in accordance with Section 4 of [RFC4122]. In particular, servers MUST ensure that each generated UUID is unique in space and time. Which doesn't rule out random uuids at all. Is arc4random_buf() a better attempt at ensuring the uuids are unique than doing complicated stuff with clocks and mac addresses? Maybe. Windows has been generating random uuids for about 20 years now. > > However, assuming we can't just use random UUIDs, here's an attempt at > improving this code: > > - Use clock_gettime(2). With nanosecond resolution we don't need > a 'counter'. > > - Reduce the scope of all the static state to uuid_create(). > > - Shrink the loop. Just read the clock until it changes, then decide > what to do re. seq_num. This is effectively what the example code in > RFC 4122 does. > > I'm unsure what the right thing to do is if the system clock predates > the UUID epoch (Oct 15 1582). My code just returns zero. Maybe we > should just kill the daemon in that case? The UUIDv1 scheme breaks > down if time is that seriously screwed up. > > Is there an active ldapd(8) person? Or at least someone with an > ldapd(8) setup who can test this? 
I'm kind of an active ldapd person, though I don't actually use it actively. I can try this out if need be. > > Thoughts? > > Index: uuid.c > === > RCS file: /cvs/src/usr.sbin/ldapd/uuid.c,v > retrieving revision 1.6 > diff -u -p -r1.6 uuid.c > --- uuid.c26 Apr 2018 12:42:51 - 1.6 > +++ uuid.c20 Aug 2020 01:44:00 - > @@ -63,27 +63,8 @@ > > #include "uuid.h" > > -static uint32_t seq_num; > -static struct timeval last_time; > -static int32_t counter; > -static char nodeaddr[6]; > - > enum { UUID_NODE_MULTICAST = 0x80 }; > > -static int > -time_cmp(struct timeval *tv1, struct timeval *tv2) > -{ > -if (tv1->tv_sec > tv2->tv_sec) > - return -1; > -if (tv1->tv_sec < tv2->tv_sec) > - return 1; > -if (tv1->tv_usec > tv2->tv_usec) > - return -1; > -if (tv1->tv_usec < tv2->tv_usec) > - return 1; > -return 0; > -} > - > static void > get_node_addr(char *addr) > { > @@ -138,6 +119,40 @@ get_node_addr(char *addr) > } > > /* > + * A UUID v1 timestamp: > + * > + * - 60 bits. > + * - Unsigned. > + * - Epoch at Oct 15 1582 00:00:00 UTC. > + * - Increments every 100 nanoseconds. > + */ > +#define UUID_EPOCH_OFFSET12219292800LL > +#define UUID_TIME_MAX(1ULL << 60) > +#define UUID_HZ 1000LL > +#define NSEC_PER_UUID_TICK 100LL > + > +static uint64_t > +get_uuid_timestamp(void) > +{ > + static const struct timespec min = { -UUID_EPOCH_OFFSET, 0 }; > + static const struct timespec max = { > + UUID_TIME_MAX / UUID_HZ, > + UUID_TIME_MAX % UUID_HZ * NSEC_PER_UUID_TICK > + }; > + struct timespec utc; > + uint64_t timestamp; > + > + clock_gettime(CLOCK_REALTIME, &utc); > + if (timespeccmp(&utc, &min, <)) > + return 0; > + if (timespeccmp(&max, &utc, <)) > + return UUID_TIME_MAX; > + timestamp = (UUID_EPOCH_OFFSET + utc.tv_sec) * UUID_HZ; > + timestamp += utc.tv_nsec / NSEC_PER_UUID_TICK; > + return timestamp; > +} > + > +/* > *Creates a new UUID. 
> */ > > @@ -145,55 +160,32 @@ void > uuid_create(afsUUID *uuid) > { > static int uuid_inited = 0; > -struct timeval tv; > -int ret, got_time; > +static uint64_t last_time; > +static uint32_t seq_num; > +static char nodeaddr[6]; > uint64_t dce_time; > > if (uuid_inited == 0) { > - gettimeofday(&last_time, NULL); > + last_time = get_uuid_timestamp(); > seq_num = arc4random(); > get_node_addr(nodeaddr); > uuid_inited = 1; > } > > -gettimeofday(&tv, NULL); > - > -got_time = 0; > +while ((dce_time = get_uuid_timestamp()) == last_time) > + continue; > > -do { > - ret = time_cmp(&tv, &last_time); > - if (ret < 0) { > - /* Time went backward, just inc seq_num and be done. > - * seq_num is 6 + 8 bit field it the uuid, so let it wrap > -
ospfd: use ROUTE_FLAGFILTER
ospfd is our first target for using ROUTE_FLAGFILTER to reduce pressure on the route socket, so here's the diff we've been running for a couple of weeks now (minus the fix for RTM_DELETE flags, notably). ok? Index: kroute.c === RCS file: /cvs/src/usr.sbin/ospfd/kroute.c,v retrieving revision 1.113 diff -u -p -r1.113 kroute.c --- kroute.c9 Nov 2019 15:54:19 - 1.113 +++ kroute.c27 Jul 2020 03:45:41 - @@ -133,6 +133,7 @@ kr_init(int fs, u_int rdomain, int redis int opt = 0, rcvbuf, default_rcvbuf; socklen_t optlen; int filter_prio = fib_prio; + int filter_flags = RTF_LLINFO | RTF_BROADCAST; kr_state.fib_sync = fs; kr_state.rdomain = rdomain; @@ -158,6 +159,11 @@ kr_init(int fs, u_int rdomain, int redis if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_PRIOFILTER, &filter_prio, sizeof(filter_prio)) == -1) { log_warn("%s: setsockopt AF_ROUTE ROUTE_PRIOFILTER", __func__); + /* not fatal */ + } + if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_FLAGFILTER, &filter_flags, + sizeof(filter_flags)) == -1) { + log_warn("%s: setsockopt AF_ROUTE ROUTE_FLAGFILTER", __func__); /* not fatal */ }
RTM_DELETE messages for L2 routes have incorrect flags
While looking into filtering out messages for L2 routes in the kernel to reduce load on routing daemons, I noticed that the RTM_DELETE messages do not have the RTF_LLINFO flag set, which is inconvenient because that's what I want to filter on. I tracked this down to r1.361 and r1.362 of net/route.c, where we stopped saving rt->rt_flags before calling rtrequest_delete(). rtrequest_delete() calls ifp->if_rtrequest(), which removes the llinfo from the route and clears RTF_LLINFO. I think the simplest way to fix this would be for rtdeletemsg() to go back to calling rtm_miss() directly rather than using rtm_send(). Adding more parameters to rtm_send() to specify additional flags seems like overcomplicating it. Index: route.c === RCS file: /cvs/src/sys/net/route.c,v retrieving revision 1.394 diff -u -p -r1.394 route.c --- route.c 24 Jun 2020 22:03:43 - 1.394 +++ route.c 11 Aug 2020 04:12:51 - @@ -663,6 +663,7 @@ rtdeletemsg(struct rtentry *rt, struct i { int error; struct rt_addrinfo info; + struct sockaddr_rtlabel sa_rl; struct sockaddr_in6 sa_mask; KASSERT(rt->rt_ifidx == ifp->if_index); @@ -677,8 +678,13 @@ rtdeletemsg(struct rtentry *rt, struct i info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if (!ISSET(rt->rt_flags, RTF_HOST)) info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask); + info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl); + info.rti_flags = rt->rt_flags; + info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl); + info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; error = rtrequest_delete(&info, rt->rt_priority, ifp, &rt, tableid); - rtm_send(rt, RTM_DELETE, error, tableid); + rtm_miss(RTM_DELETE, &info, info.rti_flags, rt->rt_priority, + rt->rt_ifidx, error, tableid); if (error == 0) rtfree(rt); return (error);
filtering routing socket messages by flags
Most (all?) of our routing daemons don't care about layer 2 or broadcast routing entries, so they do something like this after reading a message off the socket: /* Skip ARP/ND cache and broadcast routes. */ if (rtm->rtm_flags & (RTF_LLINFO|RTF_BROADCAST)) continue; ARP can generate a lot of routing messages during an address space scan, and then again when the entries expire, and this can cause routing daemons to desync. To reduce the impact of this, we'd like to filter these out on the kernel side. There's another issue we need to fix to make this work properly, but that can be done separately. This adds a new type of filter on the routing socket, specifying a flag bitmask, which filters out messages for routes with flags matching the mask. ok? Index: route.h === RCS file: /cvs/src/sys/net/route.h,v retrieving revision 1.181 diff -u -p -u -p -r1.181 route.h --- route.h 10 Mar 2020 21:35:41 - 1.181 +++ route.h 6 Aug 2020 01:47:11 - @@ -297,6 +297,8 @@ struct rt_msghdr { #define ROUTE_PRIOFILTER 3 /* only pass updates with a priority higher or equal (actual value lower) to the specified priority. */ +#define ROUTE_FLAGFILTER 4 /* do not pass updates for routes with flags + in this bitmask. 
*/ #define ROUTE_FILTER(m)(1 << (m)) #define RTABLE_ANY 0x Index: rtsock.c === RCS file: /cvs/src/sys/net/rtsock.c,v retrieving revision 1.299 diff -u -p -u -p -r1.299 rtsock.c --- rtsock.c24 Jun 2020 22:03:42 - 1.299 +++ rtsock.c6 Aug 2020 01:47:11 - @@ -145,6 +145,7 @@ struct rtpcb { struct refcnt rop_refcnt; struct timeout rop_timeout; unsigned introp_msgfilter; + unsigned introp_flagfilter; unsigned introp_flags; u_int rop_rtableid; unsigned short rop_proto; @@ -402,6 +403,12 @@ route_ctloutput(int op, struct socket *s else rop->rop_priority = prio; break; + case ROUTE_FLAGFILTER: + if (m == NULL || m->m_len != sizeof(unsigned int)) + error = EINVAL; + else + rop->rop_flagfilter = *mtod(m, unsigned int *); + break; default: error = ENOPROTOOPT; break; @@ -421,6 +428,10 @@ route_ctloutput(int op, struct socket *s m->m_len = sizeof(unsigned int); *mtod(m, unsigned int *) = rop->rop_priority; break; + case ROUTE_FLAGFILTER: + m->m_len = sizeof(unsigned int); + *mtod(m, unsigned int *) = rop->rop_flagfilter; + break; default: error = ENOPROTOOPT; break; @@ -516,9 +527,13 @@ next: /* filter messages that the process does not want */ rtm = mtod(m, struct rt_msghdr *); /* but RTM_DESYNC can't be filtered */ - if (rtm->rtm_type != RTM_DESYNC && rop->rop_msgfilter != 0 && - !(rop->rop_msgfilter & (1 << rtm->rtm_type))) - goto next; + if (rtm->rtm_type != RTM_DESYNC) { + if (rop->rop_msgfilter != 0 && + !(rop->rop_msgfilter & (1 << rtm->rtm_type))) + goto next; + if (ISSET(rop->rop_flagfilter, rtm->rtm_flags)) + goto next; + } switch (rtm->rtm_type) { case RTM_IFANNOUNCE: case RTM_DESYNC:
acpicpu: remove acpicpu_sc array
This came out of the work on supporting ACPI0007 devices in acpicpu(4), but it's independent of that and I'd like to get it in the tree separately. Since it was first added, acpicpu stores instances of itself in an array, which it uses to find the acpicpu device for a cpu. This runs into problems when there are more than MAXCPUS acpicpu devices. Currently it overwrites whatever's after the array, leading to varying crashes and hangs depending on kernel link order. More recently, we've added a pointer to struct cpu_info that does this more directly, and also has the advantage that it actually matches up the cpu ids rather than assuming cpu3 maps to acpicpu3. This diff removes the acpicpu_sc array and uses the pointer from struct cpu_info instead. Most of the accesses are just looking for the first acpicpu, so we can use cpu_info_primary to find that. I've tested this on a few different machines (including one with 128 acpicpu devices) and everything still works. ok? Index: acpicpu.c === RCS file: /cvs/src/sys/dev/acpi/acpicpu.c,v retrieving revision 1.85 diff -u -p -r1.85 acpicpu.c --- acpicpu.c 27 May 2020 05:02:21 - 1.85 +++ acpicpu.c 3 Aug 2020 05:10:45 - @@ -188,8 +188,6 @@ struct cfdriver acpicpu_cd = { extern int setperf_prio; -struct acpicpu_softc *acpicpu_sc[MAXCPUS]; - #if 0 void acpicpu_set_throttle(struct acpicpu_softc *sc, int level) @@ -672,7 +670,6 @@ acpicpu_attach(struct device *parent, st sc->sc_acpi = (struct acpi_softc *)parent; sc->sc_devnode = aa->aaa_node; - acpicpu_sc[sc->sc_dev.dv_unit] = sc; SLIST_INIT(&sc->sc_cstates); @@ -979,7 +976,7 @@ acpicpu_fetch_pss(struct acpicpu_pss **p * the bios ensures this... 
*/ - sc = acpicpu_sc[0]; + sc = (struct acpicpu_softc *)cpu_info_primary.ci_acpicpudev; if (!sc) return 0; *pss = sc->sc_pss; @@ -1024,7 +1021,7 @@ acpicpu_set_notify(void (*func)(struct a { struct acpicpu_softc*sc; - sc = acpicpu_sc[0]; + sc = (struct acpicpu_softc *)cpu_info_primary.ci_acpicpudev; if (sc != NULL) sc->sc_notify = func; } @@ -1034,7 +1031,7 @@ acpicpu_setperf_ppc_change(struct acpicp { struct acpicpu_softc*sc; - sc = acpicpu_sc[0]; + sc = (struct acpicpu_softc *)cpu_info_primary.ci_acpicpudev; if (sc != NULL) cpu_setperf(sc->sc_level); @@ -1048,7 +1045,7 @@ acpicpu_setperf(int level) int idx, len; uint32_tstatus = 0; - sc = acpicpu_sc[cpu_number()]; + sc = (struct acpicpu_softc *)curcpu()->ci_acpicpudev; dnprintf(10, "%s: acpicpu setperf level %d\n", sc->sc_devnode->name, level);
Re: acpicpu(4) and ACPI0007
On Wed, Jul 29, 2020 at 08:29:31PM +1000, Jonathan Matthew wrote: > On Wed, Jul 29, 2020 at 10:06:14AM +0200, Mark Kettenis wrote: > > > Date: Wed, 29 Jul 2020 10:38:55 +1000 > > > From: Jonathan Matthew > > > > > > On Tue, Jul 28, 2020 at 07:30:36PM +0200, Mark Kettenis wrote: > > > > > Date: Tue, 28 Jul 2020 21:42:46 +1000 > > > > > From: Jonathan Matthew > > > > > > > > > > On Tue, Jul 28, 2020 at 11:12:21AM +0200, Mark Kettenis wrote: > > > > > > > Date: Tue, 28 Jul 2020 13:46:34 +1000 > > > > > > > From: Jonathan Matthew > > > > > > > > > > > > > > On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote: > > > > > > > > > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST) > > > > > > > > > From: Mark Kettenis > > > > > > > > > > > > > > > > > > Recent ACPI versions have deprecated "Processor()" nodes in > > > > > > > > > favout of > > > > > > > > > "Device()" nodes with a _HID() method that returns > > > > > > > > > "ACPI0007". This > > > > > > > > > diff tries to support machines with firmware that implements > > > > > > > > > this. If > > > > > > > > > you see something like: > > > > > > > > > > > > > > > > > > "ACPI0007" at acpi0 not configured > > > > > > > > > > > > > > > > > > please try the following diff and report back with an updated > > > > > > > > > dmesg. > > > > > > > > > > > > > > > > > > Cheers, > > > > > > > > > > > > > > > > > > Mark > > > > > > > > > > > > > > > > And now with the right diff... > > > > > > > > > > > > > > On a dell r6415, it looks like this: > > > > > > > > > > > > > > acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!) > > > > > > > all the way up to > > > > > > > acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127 > > > > > > > > > > > > > > which I guess means aml_copyvalue() needs to learn how to copy > > > > > > > AML_OBJTYPE_DEVICE. > > > > > > > > > > > > Yes. It is not immediately obvious how this should work. Do we > > > > > > need > > > > > > to copy the aml_node pointer or not? 
We don't do that for > > > > > > AML_OBJTYPE_PROCESSOR and AML_OBJTYPE_POWERRSRC types which are > > > > > > similar to AML_OBJTYPE_DEVICE. But AML_OBJTYPE_DEVICE object don't > > > > > > carry any additional information. So we end up with just an empty > > > > > > case to avoid the warning. > > > > > > > > > > > > Does this work on the Dell machines? > > > > > > > > > > We've seen crashes in pool_cache_get() in various places after all > > > > > the acpicpus > > > > > attach, which we haven't seen before on these machines, so I think > > > > > it's > > > > > corrupting memory somehow. > > > > > > > > Does that happen with only the acpicpu(4) diff? > > > > > > Yes. Looking at this a bit more, in the case where aml_evalnode() can't > > > copy the result value, it leaves it uninitialised, which means we'll call > > > aml_freevalue(&res) where res is stack junk. memset(&res, 0, sizeof(res)) > > > seems to fix it. > > > > Eh, where exactly? > > I had it just before the call to aml_evalnode(), but that can't be it, > since aml_evalnode() does the same thing. Much better theory: the acpicpu_sc array has MAXCPUS elements, but on this system (and all R6415s, as far as I can tell) we have more acpicpu devices than that. I suppose we should just make acpicpu_match fail if cf->cf_unit is >= MAXCPUS as we do with the actual cpu devices. Index: acpicpu.c === RCS file: /cvs/src/sys/dev/acpi/acpicpu.c,v retrieving revision 1.85 diff -u -p -r1.85 acpicpu.c --- acpicpu.c 27 May 2020 05:02:21 - 1.85 +++ acpicpu.c
Re: acpicpu(4) and ACPI0007
On Wed, Jul 29, 2020 at 10:06:14AM +0200, Mark Kettenis wrote: > > Date: Wed, 29 Jul 2020 10:38:55 +1000 > > From: Jonathan Matthew > > > > On Tue, Jul 28, 2020 at 07:30:36PM +0200, Mark Kettenis wrote: > > > > Date: Tue, 28 Jul 2020 21:42:46 +1000 > > > > From: Jonathan Matthew > > > > > > > > On Tue, Jul 28, 2020 at 11:12:21AM +0200, Mark Kettenis wrote: > > > > > > Date: Tue, 28 Jul 2020 13:46:34 +1000 > > > > > > From: Jonathan Matthew > > > > > > > > > > > > On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote: > > > > > > > > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST) > > > > > > > > From: Mark Kettenis > > > > > > > > > > > > > > > > Recent ACPI versions have deprecated "Processor()" nodes in > > > > > > > > favout of > > > > > > > > "Device()" nodes with a _HID() method that returns "ACPI0007". > > > > > > > > This > > > > > > > > diff tries to support machines with firmware that implements > > > > > > > > this. If > > > > > > > > you see something like: > > > > > > > > > > > > > > > > "ACPI0007" at acpi0 not configured > > > > > > > > > > > > > > > > please try the following diff and report back with an updated > > > > > > > > dmesg. > > > > > > > > > > > > > > > > Cheers, > > > > > > > > > > > > > > > > Mark > > > > > > > > > > > > > > And now with the right diff... > > > > > > > > > > > > On a dell r6415, it looks like this: > > > > > > > > > > > > acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!) > > > > > > all the way up to > > > > > > acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127 > > > > > > > > > > > > which I guess means aml_copyvalue() needs to learn how to copy > > > > > > AML_OBJTYPE_DEVICE. > > > > > > > > > > Yes. It is not immediately obvious how this should work. Do we need > > > > > to copy the aml_node pointer or not? We don't do that for > > > > > AML_OBJTYPE_PROCESSOR and AML_OBJTYPE_POWERRSRC types which are > > > > > similar to AML_OBJTYPE_DEVICE. 
But AML_OBJTYPE_DEVICE object don't > > > > > carry any additional information. So we end up with just an empty > > > > > case to avoid the warning. > > > > > > > > > > Does this work on the Dell machines? > > > > > > > > We've seen crashes in pool_cache_get() in various places after all the > > > > acpicpus > > > > attach, which we haven't seen before on these machines, so I think it's > > > > corrupting memory somehow. > > > > > > Does that happen with only the acpicpu(4) diff? > > > > Yes. Looking at this a bit more, in the case where aml_evalnode() can't > > copy the result value, it leaves it uninitialised, which means we'll call > > aml_freevalue(&res) where res is stack junk. memset(&res, 0, sizeof(res)) > > seems to fix it. > > Eh, where exactly? I had it just before the call to aml_evalnode(), but that can't be it, since aml_evalnode() does the same thing. > > > > > With this addition, we get this for each cpu: > > > > acpicpu0 at acpi0: C1(@1 halt!) > > > > > > The exclamation mark indicates that this is the "fallback" C-state. > > > Is there a _CST method at all? > > > > > > Anyway, given that this is a server system, it isn't really surprising > > > that there isn't any fancy power saving stuff. > > > > Right, there doesn't seem to be any. The processor devices look like this > > in the aml: > > > > Scope (_SB) > > { > > Device (C000) > > { > > Name (_HID, "ACPI0007" /* Processor Device */) // _HID: > > Hardware ID > > Name (_UID, 0x00) // _UID: Unique ID > > } > > > > Device (C001) > > { > > Name (_HID, "ACPI0007" /* Processor Device */) // _HID: > > Hardware ID > > Name (_UID, 0x01) // _UID: Unique ID > > } > > > > .. and so on. > > Usually there is an SSDT that fills in the details. The acpidump > output I have for the r6415 does have one. but it doesn't add > anything. Same here. 
> > > > > > Index: dev/acpi/dsdt.c > > > > > === > > > > > RCS file: /cvs/src/sys/dev/acpi/dsdt.c,v > > > > > retrieving revision 1.252 > > > > > diff -u -p -r1.252 dsdt.c > > > > > --- dev/acpi/dsdt.c 21 Jul 2020 03:48:06 - 1.252 > > > > > +++ dev/acpi/dsdt.c 28 Jul 2020 09:04:15 - > > > > > @@ -996,6 +996,8 @@ aml_copyvalue(struct aml_value *lhs, str > > > > > lhs->v_objref = rhs->v_objref; > > > > > aml_addref(lhs->v_objref.ref, ""); > > > > > break; > > > > > + case AML_OBJTYPE_DEVICE: > > > > > + break; > > > > > default: > > > > > printf("copyvalue: %x", rhs->type); > > > > > break; > > > > > > > > > >
Re: acpicpu(4) and ACPI0007
On Tue, Jul 28, 2020 at 07:30:36PM +0200, Mark Kettenis wrote: > > Date: Tue, 28 Jul 2020 21:42:46 +1000 > > From: Jonathan Matthew > > > > On Tue, Jul 28, 2020 at 11:12:21AM +0200, Mark Kettenis wrote: > > > > Date: Tue, 28 Jul 2020 13:46:34 +1000 > > > > From: Jonathan Matthew > > > > > > > > On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote: > > > > > > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST) > > > > > > From: Mark Kettenis > > > > > > > > > > > > Recent ACPI versions have deprecated "Processor()" nodes in favout > > > > > > of > > > > > > "Device()" nodes with a _HID() method that returns "ACPI0007". This > > > > > > diff tries to support machines with firmware that implements this. > > > > > > If > > > > > > you see something like: > > > > > > > > > > > > "ACPI0007" at acpi0 not configured > > > > > > > > > > > > please try the following diff and report back with an updated dmesg. > > > > > > > > > > > > Cheers, > > > > > > > > > > > > Mark > > > > > > > > > > And now with the right diff... > > > > > > > > On a dell r6415, it looks like this: > > > > > > > > acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!) > > > > all the way up to > > > > acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127 > > > > > > > > which I guess means aml_copyvalue() needs to learn how to copy > > > > AML_OBJTYPE_DEVICE. > > > > > > Yes. It is not immediately obvious how this should work. Do we need > > > to copy the aml_node pointer or not? We don't do that for > > > AML_OBJTYPE_PROCESSOR and AML_OBJTYPE_POWERRSRC types which are > > > similar to AML_OBJTYPE_DEVICE. But AML_OBJTYPE_DEVICE object don't > > > carry any additional information. So we end up with just an empty > > > case to avoid the warning. > > > > > > Does this work on the Dell machines? > > > > We've seen crashes in pool_cache_get() in various places after all the > > acpicpus > > attach, which we haven't seen before on these machines, so I think it's > > corrupting memory somehow. 
> > Does that happen with only the acpicpu(4) diff? Yes. Looking at this a bit more, in the case where aml_evalnode() can't copy the result value, it leaves it uninitialised, which means we'll call aml_freevalue(&res) where res is stack junk. memset(&res, 0, sizeof(res)) seems to fix it. > > > With this addition, we get this for each cpu: > > acpicpu0 at acpi0: C1(@1 halt!) > > The exclamation mark indicates that this is the "fallback" C-state. > Is there a _CST method at all? > > Anyway, given that this is a server system, it isn't really surprising > that there isn't any fancy power saving stuff. Right, there doesn't seem to be any. The processor devices look like this in the aml: Scope (_SB) { Device (C000) { Name (_HID, "ACPI0007" /* Processor Device */) // _HID: Hardware ID Name (_UID, 0x00) // _UID: Unique ID } Device (C001) { Name (_HID, "ACPI0007" /* Processor Device */) // _HID: Hardware ID Name (_UID, 0x01) // _UID: Unique ID } .. and so on. > > > > Index: dev/acpi/dsdt.c > > > === > > > RCS file: /cvs/src/sys/dev/acpi/dsdt.c,v > > > retrieving revision 1.252 > > > diff -u -p -r1.252 dsdt.c > > > --- dev/acpi/dsdt.c 21 Jul 2020 03:48:06 - 1.252 > > > +++ dev/acpi/dsdt.c 28 Jul 2020 09:04:15 - > > > @@ -996,6 +996,8 @@ aml_copyvalue(struct aml_value *lhs, str > > > lhs->v_objref = rhs->v_objref; > > > aml_addref(lhs->v_objref.ref, ""); > > > break; > > > + case AML_OBJTYPE_DEVICE: > > > + break; > > > default: > > > printf("copyvalue: %x", rhs->type); > > > break; > > > >
Re: acpicpu(4) and ACPI0007
On Tue, Jul 28, 2020 at 11:12:21AM +0200, Mark Kettenis wrote: > > Date: Tue, 28 Jul 2020 13:46:34 +1000 > > From: Jonathan Matthew > > > > On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote: > > > > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST) > > > > From: Mark Kettenis > > > > > > > > Recent ACPI versions have deprecated "Processor()" nodes in favour of > > > > "Device()" nodes with a _HID() method that returns "ACPI0007". This > > > > diff tries to support machines with firmware that implements this. If > > > > you see something like: > > > > > > > > "ACPI0007" at acpi0 not configured > > > > > > > > please try the following diff and report back with an updated dmesg. > > > > > > > > Cheers, > > > > > > > > Mark > > > > > > And now with the right diff... > > > > On a dell r6415, it looks like this: > > > > acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!) > > all the way up to > > acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127 > > > > which I guess means aml_copyvalue() needs to learn how to copy > > AML_OBJTYPE_DEVICE. > > Yes. It is not immediately obvious how this should work. Do we need > to copy the aml_node pointer or not? We don't do that for > AML_OBJTYPE_PROCESSOR and AML_OBJTYPE_POWERRSRC types which are > similar to AML_OBJTYPE_DEVICE. But AML_OBJTYPE_DEVICE objects don't > carry any additional information. So we end up with just an empty > case to avoid the warning. > > Does this work on the Dell machines? We've seen crashes in pool_cache_get() in various places after all the acpicpus attach, which we haven't seen before on these machines, so I think it's corrupting memory somehow. With this addition, we get this for each cpu: acpicpu0 at acpi0: C1(@1 halt!) 
> > > Index: dev/acpi/dsdt.c > === > RCS file: /cvs/src/sys/dev/acpi/dsdt.c,v > retrieving revision 1.252 > diff -u -p -r1.252 dsdt.c > --- dev/acpi/dsdt.c 21 Jul 2020 03:48:06 - 1.252 > +++ dev/acpi/dsdt.c 28 Jul 2020 09:04:15 - > @@ -996,6 +996,8 @@ aml_copyvalue(struct aml_value *lhs, str > lhs->v_objref = rhs->v_objref; > aml_addref(lhs->v_objref.ref, ""); > break; > + case AML_OBJTYPE_DEVICE: > + break; > default: > printf("copyvalue: %x", rhs->type); > break;
Re: acpicpu(4) and ACPI0007
On Mon, Jul 27, 2020 at 05:16:47PM +0200, Mark Kettenis wrote: > > Date: Mon, 27 Jul 2020 17:02:41 +0200 (CEST) > > From: Mark Kettenis > > > > Recent ACPI versions have deprecated "Processor()" nodes in favout of > > "Device()" nodes with a _HID() method that returns "ACPI0007". This > > diff tries to support machines with firmware that implements this. If > > you see something like: > > > > "ACPI0007" at acpi0 not configured > > > > please try the following diff and report back with an updated dmesg. > > > > Cheers, > > > > Mark > > And now with the right diff... On a dell r6415, it looks like this: acpicpu0 at acpi0copyvalue: 6: C1(@1 halt!) all the way up to acpicpu127 at acpi0copyvalue: 6: no cpu matching ACPI ID 127 which I guess means aml_copyvalue() needs to learn how to copy AML_OBJTYPE_DEVICE. > > > Index: dev/acpi/acpicpu.c > === > RCS file: /cvs/src/sys/dev/acpi/acpicpu.c,v > retrieving revision 1.85 > diff -u -p -r1.85 acpicpu.c > --- dev/acpi/acpicpu.c27 May 2020 05:02:21 - 1.85 > +++ dev/acpi/acpicpu.c27 Jul 2020 14:58:38 - > @@ -186,6 +186,11 @@ struct cfdriver acpicpu_cd = { > NULL, "acpicpu", DV_DULL > }; > > +const char *acpicpu_hids[] = { > + "ACPI0007", > + NULL > +}; > + > extern int setperf_prio; > > struct acpicpu_softc *acpicpu_sc[MAXCPUS]; > @@ -650,6 +655,9 @@ acpicpu_match(struct device *parent, voi > struct acpi_attach_args *aa = aux; > struct cfdata *cf = match; > > + if (acpi_matchhids(aa, acpicpu_hids, cf->cf_driver->cd_name)) > + return (1); > + > /* sanity */ > if (aa->aaa_name == NULL || > strcmp(aa->aaa_name, cf->cf_driver->cd_name) != 0 || > @@ -665,6 +673,7 @@ acpicpu_attach(struct device *parent, st > struct acpicpu_softc*sc = (struct acpicpu_softc *)self; > struct acpi_attach_args *aa = aux; > struct aml_valueres; > + int64_t uid; > int i; > uint32_tstatus = 0; > CPU_INFO_ITERATOR cii; > @@ -675,6 +684,10 @@ acpicpu_attach(struct device *parent, st > acpicpu_sc[sc->sc_dev.dv_unit] = sc; > > SLIST_INIT(&sc->sc_cstates); > + > 
+ if (aml_evalinteger(sc->sc_acpi, sc->sc_devnode, > + "_UID", 0, NULL, &uid) == 0) > + sc->sc_cpu = uid; > > if (aml_evalnode(sc->sc_acpi, sc->sc_devnode, 0, NULL, &res) == 0) { > if (res.type == AML_OBJTYPE_PROCESSOR) { >
mcx(4) RSS
mcx(4) is almost ready to enable RSS, except arm64 doesn't yet support mapping interrupts to cpus. Until that's in place, here's a diff with the missing pieces from the driver in case anyone wants to test. This will enable up to 8 rx/tx queues, depending on the number of cpus available. Index: if_mcx.c === RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v retrieving revision 1.64 diff -u -p -r1.64 if_mcx.c --- if_mcx.c14 Jul 2020 04:10:18 - 1.64 +++ if_mcx.c14 Jul 2020 04:49:36 - @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -83,7 +84,7 @@ #define MCX_LOG_RQ_SIZE10 #define MCX_LOG_SQ_SIZE11 -#define MCX_MAX_QUEUES 1 +#define MCX_MAX_QUEUES 8 /* completion event moderation - about 10khz, or 90% of the cq */ #define MCX_CQ_MOD_PERIOD 50 @@ -2331,6 +2332,7 @@ struct mcx_softc { unsigned int sc_calibration_gen; struct timeout sc_calibrate; + struct intrmap *sc_intrmap; struct mcx_queuessc_queues[MCX_MAX_QUEUES]; unsigned int sc_nqueues; @@ -2716,7 +2718,11 @@ mcx_attach(struct device *parent, struct ether_sprintf(sc->sc_ac.ac_enaddr)); msix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag); - sc->sc_nqueues = 1; + sc->sc_intrmap = intrmap_create(&sc->sc_dev, msix, MCX_MAX_QUEUES, + INTRMAP_POWEROF2); + sc->sc_nqueues = intrmap_count(sc->sc_intrmap); + KASSERT(sc->sc_nqueues > 0); + KASSERT(powerof2(sc->sc_nqueues)); strlcpy(ifp->if_xname, DEVNAME(sc), IFNAMSIZ); ifp->if_softc = sc; @@ -2786,8 +2792,9 @@ mcx_attach(struct device *parent, struct } snprintf(q->q_name, sizeof(q->q_name), "%s:%d", DEVNAME(sc), i); - q->q_ihc = pci_intr_establish(sc->sc_pc, ih, - IPL_NET | IPL_MPSAFE, mcx_cq_intr, q, q->q_name); + q->q_ihc = pci_intr_establish_cpu(sc->sc_pc, ih, + IPL_NET | IPL_MPSAFE, intrmap_cpu(sc->sc_intrmap, i), + mcx_cq_intr, q, q->q_name); } timeout_set(&sc->sc_calibrate, mcx_calibrate, sc); Index: files.pci === RCS file: /cvs/src/sys/dev/pci/files.pci,v retrieving revision 1.350 diff -u -p -r1.350 files.pci --- files.pci 14 Jul 2020 04:10:18 - 1.350 
+++ files.pci 14 Jul 2020 04:49:36 - @@ -831,7 +831,7 @@ attach bnxt at pci file dev/pci/if_bnxt.c bnxt # Mellanox ConnectX-4 and later -device mcx: ether, ifnet, ifmedia, stoeplitz +device mcx: ether, ifnet, ifmedia, stoeplitz, intrmap attach mcx at pci filedev/pci/if_mcx.cmcx
Re: multiple rings and cpus for ix(4)
On Wed, Jun 17, 2020 at 12:50:46PM +0200, Hrvoje Popovski wrote: > On 17.6.2020. 12:45, Hrvoje Popovski wrote: > > On 17.6.2020. 11:27, Hrvoje Popovski wrote: > >> On 17.6.2020. 10:36, David Gwynne wrote: > >>> this is an updated version of a diff from christiano haesbaert by way of > >>> mpi@ to enable the use of multiple tx and rx rings with msi-x. > >>> > >>> the high level description is that that driver checks to see if msix is > >>> available, and if so how many vectors it has. it then gets an intrmap > >>> based on that information, and bumps the number of queues to the number > >>> of cpus that intrmap says are available. > >>> > >>> once the queues are allocated, it then iterates over them and wires up > >>> interrupts to the cpus provided by the intrmap. > >>> > >>> im happy for people to try this out, but i can't commit it until all the > >>> architectures that ix(4) is enabled on support the APIs that it's using. > >>> this basically means it'll work on amd64 (and a little bit on i386), but > >>> not much else. please hold back your tears and cries of anguish. > >>> > >>> thanks to christiano and mpi for doing most of the work leading up to > >>> this diff :) > >> > >> Hi, > >> > >> first, thank you all for mq work :) > >> > >> with this diff, if i'm sending traffic over ix and at the same time > >> execute ifconfig ix down/up, forwarding stops until i stop generator, > >> wait for few seconds and execute ifconfig ix down/up few times and than > >> forwarding start normally > > > > > in vmstat i should see ix0:0-5 and ix1:0-5 ? vmstat -i only shows interrupts that have actually fired. Use -zi to show all interrupts. This diff doesn't set up RSS, so received packets will only go to the first vector, which is why only one of the ix1 interrupts has fired. Outgoing packets are scattered across the tx queues, so all the ix0 interrupts have fired. 
> > r620-1# vmstat -i > interrupt total rate > irq0/clock3985752 599 > irq0/ipi 3462063 520 > irq144/acpi040 > irq114/ix0:0 8042709 1209 > irq115/ix0:1 2906070 437 > irq116/ix0:2 1975350 297 > irq117/ix0:3849089681 127721 > irq118/ix0:4 4441608 668 > irq119/ix0:5 4330871 651 > irq120/ix0 100 > irq121/ix1:0 43209056 6499 > irq127/ix1 160 > irq97/mfi0 368465 > irq132/ixl2 70 > irq133/ixl2:0 4590 > irq134/ixl3 70 > irq135/ixl3:0 4510 > irq99/ehci0 1390 > irq136/em0 186372 > irq137/em14510 > irq100/ehci1 280 > irq101/ahci010 > irq146/com1 44110 > Total 921504627 138613 >
urtwn(4) hardware crypto
This enables use of hardware crypto for CCMP in urtwn(4). As with other drivers, this reduces cpu usage significantly when moving lots of data. I've tested this on an assortment of hardware (RTL8188CUS, RTL8188EU, RTL8192EU) with no problems, and this is one of the few things that remains constant across a lot of Realtek wifi chips, but some wider testing couldn't hurt. Since this touches the code shared with rtwn(4), I've also tested that that still works. Index: ic/r92creg.h === RCS file: /cvs/src/sys/dev/ic/r92creg.h,v retrieving revision 1.24 diff -u -p -r1.24 r92creg.h --- ic/r92creg.h11 Mar 2019 06:19:33 - 1.24 +++ ic/r92creg.h5 Jun 2020 11:52:21 - @@ -688,6 +688,16 @@ #define R92C_CAMCMD_CLR0x4000 #define R92C_CAMCMD_POLLING0x8000 +/* Bits for R92C_SECCFG. */ +#define R92C_SECCFG_TXUCKEY_DEF 0x0001 +#define R92C_SECCFG_RXUCKEY_DEF0x0002 +#define R92C_SECCFG_TXENC_ENA 0x0004 +#define R92C_SECCFG_RXENC_ENA 0x0008 +#define R92C_SECCFG_CMP_A2 0x0010 +#define R92C_SECCFG_MC_SRCH_DIS0x0020 +#define R92C_SECCFG_TXBCKEY_DEF 0x0040 +#define R92C_SECCFG_RXBCKEY_DEF 0x0080 + /* IMR */ /*Beacon DMA interrupt 6 */ Index: ic/rtwn.c === RCS file: /cvs/src/sys/dev/ic/rtwn.c,v retrieving revision 1.49 diff -u -p -r1.49 rtwn.c --- ic/rtwn.c 9 Jan 2020 14:35:19 - 1.49 +++ ic/rtwn.c 5 Jun 2020 11:52:22 - @@ -3154,6 +3154,14 @@ rtwn_init(struct ifnet *ifp) /* Clear per-station keys table. */ rtwn_cam_init(sc); + /* Enable decryption / encryption. */ + if (sc->chip & RTWN_CHIP_USB) { + rtwn_write_2(sc, R92C_SECCFG, + R92C_SECCFG_TXUCKEY_DEF | R92C_SECCFG_RXUCKEY_DEF | + R92C_SECCFG_TXENC_ENA | R92C_SECCFG_RXENC_ENA | + R92C_SECCFG_TXBCKEY_DEF | R92C_SECCFG_RXBCKEY_DEF); + } + /* Enable hardware sequence numbering. 
*/ rtwn_write_1(sc, R92C_HWSEQ_CTRL, 0xff); @@ -3204,14 +3212,14 @@ rtwn_init(struct ifnet *ifp) ifq_clr_oactive(&ifp->if_snd); ifp->if_flags |= IFF_RUNNING; -#ifdef notyet - if (ic->ic_flags & IEEE80211_F_WEPON) { + if ((ic->ic_flags & IEEE80211_F_WEPON) && + (sc->chip & RTWN_CHIP_USB)) { /* Install WEP keys. */ for (i = 0; i < IEEE80211_WEP_NKID; i++) ic->ic_set_key(ic, NULL, &ic->ic_nw_keys[i]); sc->sc_ops.wait_async(sc->sc_ops.cookie); } -#endif + if (ic->ic_opmode == IEEE80211_M_MONITOR) ieee80211_new_state(ic, IEEE80211_S_RUN, -1); else Index: usb/if_urtwn.c === RCS file: /cvs/src/sys/dev/usb/if_urtwn.c,v retrieving revision 1.89 diff -u -p -r1.89 if_urtwn.c --- usb/if_urtwn.c 26 May 2020 06:04:30 - 1.89 +++ usb/if_urtwn.c 5 Jun 2020 11:52:22 - @@ -490,10 +490,8 @@ urtwn_attach(struct device *parent, stru ic->ic_updateslot = urtwn_updateslot; ic->ic_updateedca = urtwn_updateedca; -#ifdef notyet ic->ic_set_key = urtwn_set_key; ic->ic_delete_key = urtwn_delete_key; -#endif /* Override state transition machine. */ ic->ic_newstate = urtwn_newstate; @@ -1035,6 +1033,10 @@ urtwn_set_key(struct ieee80211com *ic, s struct urtwn_softc *sc = (struct urtwn_softc *)self; struct urtwn_cmd_key cmd; + /* Only handle keys for CCMP */ + if (k->k_cipher != IEEE80211_CIPHER_CCMP) + return ieee80211_set_key(ic, ni, k); + /* Defer setting of WEP keys until interface is brought up. */ if ((ic->ic_if.if_flags & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING)) @@ -1065,6 +1067,12 @@ urtwn_delete_key(struct ieee80211com *ic struct urtwn_softc *sc = (struct urtwn_softc *)self; struct urtwn_cmd_key cmd; + /* Only handle keys for CCMP */ + if (k->k_cipher != IEEE80211_CIPHER_CCMP) { + ieee80211_delete_key(ic, ni, k); + return; + } + if (!(ic->ic_if.if_flags & IFF_RUNNING) || ic->ic_state != IEEE80211_S_RUN) return; /* Nothing to do. 
*/ @@ -1084,6 +1092,52 @@ urtwn_delete_key_cb(struct urtwn_softc * rtwn_delete_key(ic, cmd->ni, &cmd->key); } +int +urtwn_ccmp_decap(struct urtwn_softc *sc, struct mbuf *m, +struct ieee80211_node *ni) +{ + struct ieee80211com *ic = &sc->sc_sc.sc_ic; + struct ieee80211_key *k; + struct ieee80211_frame *wh; + uint64_t pn, *prsc; + uint8_t *ivp; + uint8_t tid; + int hdrlen, hasqos; + + k = ieee80211_get_rxkey(ic, m, ni); + if (k == NULL) + return 1; + + wh = mtod(m, struct ieee80211_frame *); + hdrlen = iee
mcx(4) vlan offload
This implements vlan offload in mcx(4). vlan stripping is fairly straightforward, as the nic just removes the tag and populates a field in the completion queue entry. vlan insertion is a bit funny, as the nic doesn't do any of the work here at all. The driver has to copy at least the L2 headers of the packet into the send queue entry, so it can insert a tag into a previously untagged packet while it's doing that, and somewhat lower cost than shuffling the packet data around in an mbuf. I've tested that this doesn't break tcp or udp checksums on vlan-tagged packets (including udp fragments). ok? Index: if_mcx.c === RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v retrieving revision 1.48 diff -u -p -r1.48 if_mcx.c --- if_mcx.c27 May 2020 04:03:20 - 1.48 +++ if_mcx.c28 May 2020 09:30:31 - @@ -18,6 +18,7 @@ */ #include "bpfilter.h" +#include "vlan.h" #include #include @@ -92,6 +93,7 @@ ((1 << MCX_LOG_FLOW_TABLE_SIZE) - MCX_NUM_STATIC_FLOWS) #define MCX_SQ_INLINE_SIZE 18 +CTASSERT(ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN == MCX_SQ_INLINE_SIZE); /* doorbell offsets */ #define MCX_CQ_DOORBELL_OFFSET 0 @@ -1258,6 +1260,8 @@ struct mcx_cq_entry { #define MCX_CQ_ENTRY_FLAGS_L4_OK (1 << 26) #define MCX_CQ_ENTRY_FLAGS_L3_OK (1 << 25) #define MCX_CQ_ENTRY_FLAGS_L2_OK (1 << 24) +#define MCX_CQ_ENTRY_FLAGS_CV (1 << 16) +#define MCX_CQ_ENTRY_FLAGS_VLAN_MASK (0x) uint32_tcq_lro_srqn; uint32_t__reserved__[2]; @@ -2363,6 +2367,9 @@ mcx_attach(struct device *parent, struct ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 | IFCAP_CSUM_UDPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv4 | IFCAP_CSUM_TCPv6; +#if NVLAN > 0 + ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; +#endif IFQ_SET_MAXLEN(&ifp->if_snd, 1024); ifmedia_init(&sc->sc_media, IFM_IMASK, mcx_media_change, @@ -4013,6 +4020,7 @@ mcx_create_rq(struct mcx_softc *sc, int struct mcx_rq_ctx *mbin; int error; uint64_t *pas; + uint32_t rq_flags; uint8_t *doorbell; int insize, npages, paslen, token; @@ -4044,7 +4052,11 @@ mcx_create_rq(struct 
mcx_softc *sc, int goto free; } mbin = (struct mcx_rq_ctx *)(((char *)mcx_cq_mbox_data(mcx_cq_mbox(&mxm, 0))) + 0x10); - mbin->rq_flags = htobe32(MCX_RQ_CTX_RLKEY | MCX_RQ_CTX_VLAN_STRIP_DIS); + rq_flags = MCX_RQ_CTX_RLKEY; +#if NVLAN == 0 + rq_flags |= MCX_RQ_CTX_VLAN_STRIP_DIS; +#endif + mbin->rq_flags = htobe32(rq_flags); mbin->rq_cqn = htobe32(cqn); mbin->rq_wq.wq_type = MCX_WQ_CTX_TYPE_CYCLIC; mbin->rq_wq.wq_pd = htobe32(sc->sc_pd); @@ -5697,6 +5709,13 @@ mcx_process_rx(struct mcx_softc *sc, str if (flags & MCX_CQ_ENTRY_FLAGS_L4_OK) m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK | M_UDP_CSUM_IN_OK; +#if NVLAN > 0 + if (flags & MCX_CQ_ENTRY_FLAGS_CV) { + m->m_pkthdr.ether_vtag = (flags & + MCX_CQ_ENTRY_FLAGS_VLAN_MASK); + m->m_flags |= M_VLANTAG; + } +#endif if (c->c_tdiff) { uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp; @@ -6369,9 +6388,26 @@ mcx_start(struct ifqueue *ifq) csum |= MCX_SQE_L4_CSUM; sqe->sqe_mss_csum = htobe32(csum); sqe->sqe_inline_header_size = htobe16(MCX_SQ_INLINE_SIZE); - m_copydata(m, 0, MCX_SQ_INLINE_SIZE, - (caddr_t)sqe->sqe_inline_headers); - m_adj(m, MCX_SQ_INLINE_SIZE); +#if NVLAN > 0 + if (m->m_flags & M_VLANTAG) { + struct ether_vlan_header *evh; + evh = (struct ether_vlan_header *) + &sqe->sqe_inline_headers; + + /* slightly cheaper vlan_inject() */ + m_copydata(m, 0, ETHER_HDR_LEN, (caddr_t)evh); + evh->evl_proto = evh->evl_encap_proto; + evh->evl_encap_proto = htons(ETHERTYPE_VLAN); + evh->evl_tag = htons(m->m_pkthdr.ether_vtag); + + m_adj(m, ETHER_HDR_LEN); + } else +#endif + { + m_copydata(m, 0, MCX_SQ_INLINE_SIZE, + (caddr_t)sqe->sqe_inline_headers); + m_adj(m, MCX_SQ_INLINE_SIZE); + } if (mcx_load_mbuf(sc, ms, m) != 0) { m_freem(m);
vmx(4) msi-x
This prepares vmx(4) for multi-queue operation, first by making use of msi-x where available, and second by rearranging the queue structures to fit the direction we're heading in. As with other drivers, here I'm reserving msi-x vector 0 for events, then mapping tx/rx queues to the subsequent vectors. Aside from the interrupt setup itself, the only change in behaviour here is that queue setup is done after interrupt setup, as we'll need to know what type of interrupt we're using to decide how many queues to use. This is how other vmx drivers work so this must be safe. I've tested this with esxi 6.7 and qemu. Can somone try this out on vmware workstation or player please? I wouldn't expect those to be any different to esxi in this respect, but it's always possible. ok? Index: if_vmx.c === RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v retrieving revision 1.55 diff -u -p -r1.55 if_vmx.c --- if_vmx.c27 Oct 2019 22:24:40 - 1.55 +++ if_vmx.c25 May 2020 09:35:33 - @@ -42,8 +42,7 @@ #include #include -#define NRXQUEUE 1 -#define NTXQUEUE 1 +#define VMX_MAX_QUEUES 1 #define NTXDESC 512 /* tx ring size */ #define NTXSEGS 8 /* tx descriptors per packet */ @@ -95,6 +94,7 @@ struct vmxnet3_txqueue { struct vmxnet3_txring cmd_ring; struct vmxnet3_comp_ring comp_ring; struct vmxnet3_txq_shared *ts; + struct ifqueue *ifq; }; struct vmxnet3_rxqueue { @@ -103,6 +103,14 @@ struct vmxnet3_rxqueue { struct vmxnet3_rxq_shared *rs; }; +struct vmxnet3_queue { + struct vmxnet3_txqueue tx; + struct vmxnet3_rxqueue rx; + struct vmxnet3_softc *sc; + char intrname[8]; + int intr; +}; + struct vmxnet3_softc { struct device sc_dev; struct arpcom sc_arpcom; @@ -114,9 +122,11 @@ struct vmxnet3_softc { bus_space_handle_t sc_ioh1; bus_dma_tag_t sc_dmat; void *sc_ih; + void *sc_qih[VMX_MAX_QUEUES]; + int sc_nintr; + int sc_nqueues; - struct vmxnet3_txqueue sc_txq[NTXQUEUE]; - struct vmxnet3_rxqueue sc_rxq[NRXQUEUE]; + struct vmxnet3_queue sc_q[VMX_MAX_QUEUES]; struct vmxnet3_driver_shared *sc_ds; u_int8_t 
*sc_mcast; }; @@ -153,8 +163,8 @@ struct { int vmxnet3_match(struct device *, void *, void *); void vmxnet3_attach(struct device *, struct device *, void *); int vmxnet3_dma_init(struct vmxnet3_softc *); -int vmxnet3_alloc_txring(struct vmxnet3_softc *, int); -int vmxnet3_alloc_rxring(struct vmxnet3_softc *, int); +int vmxnet3_alloc_txring(struct vmxnet3_softc *, int, int); +int vmxnet3_alloc_rxring(struct vmxnet3_softc *, int, int); void vmxnet3_txinit(struct vmxnet3_softc *, struct vmxnet3_txqueue *); void vmxnet3_rxinit(struct vmxnet3_softc *, struct vmxnet3_rxqueue *); void vmxnet3_txstop(struct vmxnet3_softc *, struct vmxnet3_txqueue *); @@ -164,6 +174,8 @@ void vmxnet3_enable_all_intrs(struct vmx void vmxnet3_disable_all_intrs(struct vmxnet3_softc *); int vmxnet3_intr(void *); int vmxnet3_intr_intx(void *); +int vmxnet3_intr_event(void *); +int vmxnet3_intr_queue(void *); void vmxnet3_evintr(struct vmxnet3_softc *); void vmxnet3_txintr(struct vmxnet3_softc *, struct vmxnet3_txqueue *); void vmxnet3_rxintr(struct vmxnet3_softc *, struct vmxnet3_rxqueue *); @@ -212,6 +224,7 @@ vmxnet3_attach(struct device *parent, st u_int memtype, ver, macl, mach, intrcfg; u_char enaddr[ETHER_ADDR_LEN]; int (*isr)(void *); + int i; memtype = pci_mapreg_type(pa->pa_pc, pa->pa_tag, 0x10); if (pci_mapreg_map(pa, 0x10, memtype, 0, &sc->sc_iot0, &sc->sc_ioh0, @@ -241,18 +254,22 @@ vmxnet3_attach(struct device *parent, st WRITE_BAR1(sc, VMXNET3_BAR1_UVRS, 1); sc->sc_dmat = pa->pa_dmat; - if (vmxnet3_dma_init(sc)) { - printf(": failed to setup DMA\n"); - return; - } WRITE_CMD(sc, VMXNET3_CMD_GET_INTRCFG); intrcfg = READ_BAR1(sc, VMXNET3_BAR1_CMD); isr = vmxnet3_intr; + sc->sc_nintr = 0; + sc->sc_nqueues = 1; switch (intrcfg & VMXNET3_INTRCFG_TYPE_MASK) { case VMXNET3_INTRCFG_TYPE_AUTO: case VMXNET3_INTRCFG_TYPE_MSIX: + if (pci_intr_map_msix(pa, 0, &ih) == 0) { + isr = vmxnet3_intr_event; + sc->sc_nintr = sc->sc_nqueues + 1; + break; + } + /* FALLTHROUGH */ case 
VMXNET3_INTRCFG_TYPE_MSI: if (pci_intr_map_msi(pa, &ih) == 0) @@ -273,6 +290,35 @@ vmxnet3_attach(struct device *parent, st if (intrstr) printf(": %s", intrstr); + if (sc->sc_nintr > 1) { + for (i = 0; i < sc->sc_nqueues; i++) { + struct vmxnet3_queue *q; + int vec; + + q = &sc->sc_q[i]; + vec = i
mcx(4) checksum offload
So far I've completely ignored offloads in the ethernet drivers I've written, but on having a quick look at the documentation I found that mcx(4) checksum offload is extremely easy to use, and some simple testing suggests that it helps quite a bit. I've seen tcpbench receive throughput increase by around 15%. The nic supports all the checksum offloads we know about, reports checksum status for every packet without being asked to, and can figure out packet header lengths etc. for itself, so on the tx side, the driver just sets some flags to say "checksum this for me please", and on the rx side, it looks at two bits in the completion queue entry. I'm mostly sending this out to see if anyone can gather any interesting performance numbers. Index: if_mcx.c === RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v retrieving revision 1.44 diff -u -p -u -p -r1.44 if_mcx.c --- if_mcx.c24 Apr 2020 07:28:37 - 1.44 +++ if_mcx.c18 May 2020 10:22:32 - @@ -1255,6 +1292,10 @@ struct mcx_cq_entry { uint32_tcq_checksum; uint32_t__reserved__; uint32_tcq_flags; +#define MCX_CQ_ENTRY_FLAGS_L4_OK (1 << 26) +#define MCX_CQ_ENTRY_FLAGS_L3_OK (1 << 25) +#define MCX_CQ_ENTRY_FLAGS_L2_OK (1 << 24) + uint32_tcq_lro_srqn; uint32_t__reserved__[2]; uint32_tcq_byte_cnt; @@ -2355,7 +2396,9 @@ mcx_attach(struct device *parent, struct ifp->if_qstart = mcx_start; ifp->if_watchdog = mcx_watchdog; ifp->if_hardmtu = sc->sc_hardmtu; - ifp->if_capabilities = IFCAP_VLAN_MTU; + ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 | + IFCAP_CSUM_UDPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv4 | + IFCAP_CSUM_TCPv6; IFQ_SET_MAXLEN(&ifp->if_snd, 1024); ifmedia_init(&sc->sc_media, IFM_IMASK, mcx_media_change, @@ -5662,6 +5966,7 @@ mcx_process_rx(struct mcx_softc *sc, str struct mcx_slot *ms; struct mbuf *m; int slot; + uint32_t flags; slot = betoh16(cqe->cq_wqe_count) % (1 << MCX_LOG_RQ_SIZE); @@ -5680,6 +5985,13 @@ mcx_process_rx(struct mcx_softc *sc, str betoh32(cqe->cq_rx_hash); } + flags = bemtoh32(&cqe->cq_flags); + if 
(flags & MCX_CQ_ENTRY_FLAGS_L3_OK) + m->m_pkthdr.csum_flags = M_IPV4_CSUM_IN_OK; + if (flags & MCX_CQ_ENTRY_FLAGS_L4_OK) + m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK | + M_UDP_CSUM_IN_OK; + if (c->c_tdiff) { uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp; t *= c->c_udiff; @@ -6343,6 +6657,7 @@ mcx_start(struct ifqueue *ifq) sqe->sqe_signature = htobe32(MCX_SQE_CE_CQE_ALWAYS); /* eth segment */ + sqe->sqe_mss_csum = htobe32(MCX_SQE_L3_CSUM | MCX_SQE_L4_CSUM); sqe->sqe_inline_header_size = htobe16(MCX_SQ_INLINE_SIZE); m_copydata(m, 0, MCX_SQ_INLINE_SIZE, (caddr_t)sqe->sqe_inline_headers);
msi-x for ixl(4)
This makes ixl(4) use MSI-X where available. The hardware is set up for the same kind of approach as we're heading towards in em(4) and ix(4) - interrupts for admin commands and events (link state etc.) can only be delivered to vector 0, and the natural approach is to map rx and tx queues to other vectors, so that's what I've done here. The driver was already set up for multiple rx/tx queues (though it only uses one still), so the diff sets up one vector per queue. The vector setup here involves creating linked lists of interrupt causes, which are identified by a queue type (tx or rx) and a queue index. The queues also need to be told which msix vector they interrupt on. This is done through per-vector and per-queue registers. I've tested this with and without msix on amd64 with this nic: ixl0 at pci14 dev 0 function 0 "Intel X710 SFP+" rev 0x02: port 3, FW 6.0.48754 API 1.7, msix, 1 queue ok? Index: if_ixl.c === RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.47 diff -u -p -r1.47 if_ixl.c --- if_ixl.c22 Apr 2020 07:09:40 - 1.47 +++ if_ixl.c28 Apr 2020 00:24:02 - @@ -1092,6 +1092,13 @@ struct ixl_atq { }; SIMPLEQ_HEAD(ixl_atq_list, ixl_atq); +struct ixl_queue_intr { + struct ixl_softc*sc; + int queue; + void*ihc; + char name[8]; +}; + struct ixl_softc { struct devicesc_dev; struct arpcomsc_ac; @@ -1103,6 +1110,7 @@ struct ixl_softc { pci_intr_handle_tsc_ih; void*sc_ihc; pcitag_t sc_tag; + struct ixl_queue_intr *sc_qintr; bus_dma_tag_tsc_dmat; bus_space_tag_t sc_memt; @@ -1160,6 +1168,8 @@ struct ixl_softc { static voidixl_clear_hw(struct ixl_softc *); static int ixl_pf_reset(struct ixl_softc *); +static int ixl_setup_msix(struct ixl_softc *, struct pci_attach_args *); + static int ixl_dmamem_alloc(struct ixl_softc *, struct ixl_dmamem *, bus_size_t, u_int); static voidixl_dmamem_free(struct ixl_softc *, struct ixl_dmamem *); @@ -1214,7 +1224,8 @@ static void ixl_media_status(struct ifne static voidixl_watchdog(struct ifnet *); static int 
ixl_ioctl(struct ifnet *, u_long, caddr_t); static voidixl_start(struct ifqueue *); -static int ixl_intr(void *); +static int ixl_intr0(void *); +static int ixl_intr_queue(void *); static int ixl_up(struct ixl_softc *); static int ixl_down(struct ixl_softc *); static int ixl_iff(struct ixl_softc *); @@ -1524,13 +1535,24 @@ ixl_attach(struct device *parent, struct goto shutdown; } - if (pci_intr_map_msi(pa, &sc->sc_ih) != 0 && - pci_intr_map(pa, &sc->sc_ih) != 0) { - printf(", unable to map interrupt\n"); - goto shutdown; + if (pci_intr_map_msix(pa, 0, &sc->sc_ih) == 0) { + sc->sc_qintr = mallocarray(sizeof(struct ixl_queue_intr), + ixl_nqueues(sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); + if (sc->sc_qintr == NULL) { + printf(", unable to allocate queue interrupts\n"); + goto shutdown; + } + } else { + if (pci_intr_map_msi(pa, &sc->sc_ih) != 0 && + pci_intr_map(pa, &sc->sc_ih) != 0) { + printf(", unable to map interrupt\n"); + goto shutdown; + } } - printf(", %s, address %s\n", pci_intr_string(sc->sc_pc, sc->sc_ih), + printf(", %s, %d queue%s, address %s\n", + pci_intr_string(sc->sc_pc, sc->sc_ih), ixl_nqueues(sc), + (ixl_nqueues(sc) > 1 ? 
"s" : ""), ether_sprintf(sc->sc_ac.ac_enaddr)); if (ixl_hmc(sc) != 0) { @@ -1585,13 +1607,18 @@ ixl_attach(struct device *parent, struct } sc->sc_ihc = pci_intr_establish(sc->sc_pc, sc->sc_ih, - IPL_NET | IPL_MPSAFE, ixl_intr, sc, DEVNAME(sc)); + IPL_NET | IPL_MPSAFE, ixl_intr0, sc, DEVNAME(sc)); if (sc->sc_ihc == NULL) { printf("%s: unable to establish interrupt handler\n", DEVNAME(sc)); goto free_scratch; } + if (ixl_setup_msix(sc, pa) != 0) { + /* error printed by ixl_setup_msix */ + goto free_scratch; + } + ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_xflags = IFXF_MPSAFE; @@ -1667,6 +1694,9 @@ shutdown: BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); ixl_arq_unfill(sc); + + free(sc->sc_qintr, M_DEVBUF, ixl_nqueues(sc) * + sizeof(struct ixl_queue_intr)); free_arq: ixl_dmamem_free(sc, &sc->sc_arq); f