On Fri, Apr 14, 2023 at 10:26:14AM +0800, Kevin Lo wrote:
> On Thu, Apr 13, 2023 at 01:30:36PM -0500, Brian Conway wrote:
> > Reviving this thread, apologies for discontinuity in mail readers:
> > https://marc.info/?t=165642193500008
> >
> > After rebasing on 7.3, my results have mirrored Hrvoje's testing at
> > the end of that thread. No issues with throughput, unusual latency,
> > or reliability. `vmstat -i` shows some level of balancing between
> > the queues. I've been testing on as many em(4) systems as I have
> > access to, some manually, some in packet forwarder/firewall
> > scenarios:
>
> Last time I tested (about a year ago) on I211, rx locked up if I tried
> something like iperf3 or tcpbench. Don't know if you have a similar
> problem.
I rebased the rest to current and tested it with tcpbench between the
following interfaces:
em0 at pci7 dev 0 function 0 "Intel 82580" rev 0x01, msix, 4 queues, address 90:e2:ba:df:d5:2c
em0 at pci5 dev 0 function 0 "Intel I350" rev 0x01, msix, 8 queues, address 00:25:90:eb:b3:c2
After a second the connection got stuck. As far as I can see, the
sending side has a problem.
ot45# tcpbench 192.168.99.3
elapsed_ms bytes mbps bwidth
1012 14574120 115.210 100.00%
Conn: 1 Mbps: 115.210 Peak Mbps: 115.210 Avg Mbps: 115.210
2022 0 0.000 -nan%
...
ot46# tcpbench -s
elapsed_ms bytes mbps bwidth
1017 14313480 112.594 100.00%
Conn: 1 Mbps: 112.594 Peak Mbps: 112.594 Avg Mbps: 112.594
2027 0 0.000 -nan%
...
ot45# netstat -nf inet -p tcp
Active Internet connections
Proto Recv-Q Send-Q Local Address Foreign Address TCP-State
tcp 0 260640 192.168.99.1.18530 192.168.99.3.12345 CLOSING
When I retried it, it sometimes worked, but most of the time it did not.
kstat tells me that transmit queues 1 to 3 are oactive and only queue 0
works:
em0:0:txq:0
packets: 4042648 packets
bytes: 5310138322 bytes
qdrops: 9 packets
errors: 0 packets
qlen: 0 packets
maxqlen: 511 packets
oactive: false
em0:0:txq:1
packets: 9812 packets
bytes: 14846716 bytes
qdrops: 0 packets
errors: 0 packets
qlen: 184 packets
maxqlen: 511 packets
oactive: true
em0:0:txq:2
packets: 690362 packets
bytes: 60011484 bytes
qdrops: 0 packets
errors: 0 packets
qlen: 185 packets
maxqlen: 511 packets
oactive: true
em0:0:txq:3
packets: 443181 packets
bytes: 43829886 bytes
qdrops: 0 packets
errors: 0 packets
qlen: 198 packets
maxqlen: 511 packets
oactive: true
This is the diff, rebased on current, that I tested:
Index: dev/pci/files.pci
===================================================================
RCS file: /cvs/src/sys/dev/pci/files.pci,v
retrieving revision 1.361
diff -u -p -r1.361 files.pci
--- dev/pci/files.pci 23 Apr 2023 00:20:26 -0000 1.361
+++ dev/pci/files.pci 25 Apr 2023 11:25:47 -0000
@@ -334,7 +334,7 @@ attach fxp at pci with fxp_pci
file dev/pci/if_fxp_pci.c fxp_pci
# Intel Pro/1000
-device em: ether, ifnet, ifmedia
+device em: ether, ifnet, ifmedia, intrmap, stoeplitz
attach em at pci
file dev/pci/if_em.c em
file dev/pci/if_em_hw.c em
Index: dev/pci/if_em.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.365
diff -u -p -r1.365 if_em.c
--- dev/pci/if_em.c 9 Feb 2023 21:21:27 -0000 1.365
+++ dev/pci/if_em.c 25 Apr 2023 11:25:47 -0000
@@ -247,6 +247,7 @@ int em_intr(void *);
int em_allocate_legacy(struct em_softc *);
void em_start(struct ifqueue *);
int em_ioctl(struct ifnet *, u_long, caddr_t);
+int em_rxrinfo(struct em_softc *, struct if_rxrinfo *);
void em_watchdog(struct ifnet *);
void em_init(void *);
void em_stop(void *, int);
@@ -309,8 +310,10 @@ int em_setup_queues_msix(struct em_soft
int em_queue_intr_msix(void *);
int em_link_intr_msix(void *);
void em_enable_queue_intr_msix(struct em_queue *);
+void em_setup_rss(struct em_softc *);
#else
#define em_allocate_msix(_sc) (-1)
+#define em_setup_rss(_sc) 0
#endif
#if NKSTAT > 0
@@ -333,7 +336,6 @@ struct cfdriver em_cd = {
};
static int em_smart_pwr_down = FALSE;
-int em_enable_msix = 0;
/*********************************************************************
* Device identification routine
@@ -629,12 +631,12 @@ err_pci:
void
em_start(struct ifqueue *ifq)
{
+ struct em_queue *que = ifq->ifq_softc;
struct ifnet *ifp = ifq->ifq_if;
struct em_softc *sc = ifp->if_softc;
u_int head, free, used;
struct mbuf *m;
int post = 0;
- struct em_queue *que = sc->queues; /* Use only first queue. */
if (!sc->link_active) {
ifq_purge(ifq);
@@ -769,8 +771,7 @@ em_ioctl(struct ifnet *ifp, u_long comma
break;
case SIOCGIFRXR:
- error = if_rxr_ioctl((struct if_rxrinfo *)ifr->ifr_data,
- NULL, EM_MCLBYTES, &sc->queues->rx.sc_rx_ring);
+ error = em_rxrinfo(sc, (struct if_rxrinfo *)ifr->ifr_data);
break;
case SIOCGIFSFFPAGE:
@@ -801,6 +802,32 @@ em_ioctl(struct ifnet *ifp, u_long comma
return (error);
}
+int
+em_rxrinfo(struct em_softc *sc, struct if_rxrinfo *ifri)
+{
+ struct if_rxring_info *ifr;
+ struct em_queue *que;
+ int i;
+ int error;
+
+ ifr = mallocarray(sc->num_queues, sizeof(*ifr), M_TEMP,
+ M_WAITOK | M_ZERO | M_CANFAIL);
+ if (ifr == NULL)
+ return (ENOMEM);
+
+ i = 0;
+ FOREACH_QUEUE(sc, que) {
+ ifr[i].ifr_size = EM_MCLBYTES;
+ ifr[i].ifr_info = que->rx.sc_rx_ring;
+ i++;
+ }
+
+ error = if_rxr_info_ioctl(ifri, sc->num_queues, ifr);
+ free(ifr, M_TEMP, sc->num_queues * sizeof(*ifr));
+
+ return (error);
+}
+
/*********************************************************************
* Watchdog entry point
*
@@ -812,21 +839,22 @@ void
em_watchdog(struct ifnet *ifp)
{
struct em_softc *sc = ifp->if_softc;
- struct em_queue *que = sc->queues; /* Use only first queue. */
-
+ struct em_queue *que;
- /* If we are in this routine because of pause frames, then
- * don't reset the hardware.
- */
- if (E1000_READ_REG(&sc->hw, STATUS) & E1000_STATUS_TXOFF) {
- ifp->if_timer = EM_TX_TIMEOUT;
- return;
+ FOREACH_QUEUE(sc, que) {
+ /* If we are in this routine because of pause frames, then
+ * don't reset the hardware.
+ */
+ if (E1000_READ_REG(&sc->hw, STATUS) & E1000_STATUS_TXOFF) {
+ ifp->if_timer = EM_TX_TIMEOUT;
+ return;
+ }
+ printf("%s: watchdog queue %d: head %u tail %u TDH %u TDT %u\n",
+ DEVNAME(sc), que->me,
+ que->tx.sc_tx_desc_head, que->tx.sc_tx_desc_tail,
+ E1000_READ_REG(&sc->hw, TDH(que->me)),
+ E1000_READ_REG(&sc->hw, TDT(que->me)));
}
- printf("%s: watchdog: head %u tail %u TDH %u TDT %u\n",
- DEVNAME(sc),
- que->tx.sc_tx_desc_head, que->tx.sc_tx_desc_tail,
- E1000_READ_REG(&sc->hw, TDH(que->me)),
- E1000_READ_REG(&sc->hw, TDT(que->me)));
em_init(sc);
@@ -1669,7 +1697,6 @@ em_allocate_pci_resources(struct em_soft
{
int val, rid;
struct pci_attach_args *pa = &sc->osdep.em_pa;
- struct em_queue *que = NULL;
val = pci_conf_read(pa->pa_pc, pa->pa_tag, EM_MMBA);
if (PCI_MAPREG_TYPE(val) != PCI_MAPREG_TYPE_MEM) {
@@ -1742,18 +1769,6 @@ em_allocate_pci_resources(struct em_soft
sc->osdep.dev = (struct device *)sc;
sc->hw.back = &sc->osdep;
- /* Only one queue for the moment. */
- que = malloc(sizeof(struct em_queue), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (que == NULL) {
- printf(": unable to allocate queue memory\n");
- return (ENOMEM);
- }
- que->me = 0;
- que->sc = sc;
- timeout_set(&que->rx_refill, em_rxrefill, que);
-
- sc->queues = que;
- sc->num_queues = 1;
sc->msix = 0;
sc->legacy_irq = 0;
if (em_allocate_msix(sc) && em_allocate_legacy(sc))
@@ -1826,11 +1841,7 @@ em_free_pci_resources(struct em_softc *s
sc->legacy_irq = 0;
sc->msix_linkvec = 0;
sc->msix_queuesmask = 0;
- if (sc->queues)
- free(sc->queues, M_DEVBUF,
- sc->num_queues * sizeof(struct em_queue));
sc->num_queues = 0;
- sc->queues = NULL;
}
/*********************************************************************
@@ -1949,8 +1960,10 @@ void
em_setup_interface(struct em_softc *sc)
{
struct ifnet *ifp;
+ struct em_queue *que;
uint64_t fiber_type = IFM_1000_SX;
-
+ int i;
+
INIT_DEBUGOUT("em_setup_interface: begin");
ifp = &sc->sc_ac.ac_if;
@@ -2012,6 +2025,22 @@ em_setup_interface(struct em_softc *sc)
if_attach(ifp);
ether_ifattach(ifp);
+
+ if_attach_iqueues(ifp, sc->num_queues);
+ if_attach_queues(ifp, sc->num_queues);
+
+ i = 0;
+ FOREACH_QUEUE(sc, que) {
+ que->me = i;
+ que->sc = sc;
+
+ ifp->if_iqs[i]->ifiq_softc = que;
+ ifp->if_ifqs[i]->ifq_softc = que;
+
+ timeout_set(&que->rx_refill, em_rxrefill, que);
+ i++;
+ }
+
em_enable_intr(sc);
}
@@ -2820,6 +2849,9 @@ em_initialize_receive_unit(struct em_sof
if (sc->hw.mac_type == em_82573)
E1000_WRITE_REG(&sc->hw, RDTR, 0x20);
+ if (sc->num_queues > 1)
+ em_setup_rss(sc);
+
FOREACH_QUEUE(sc, que) {
if (sc->num_queues > 1) {
/*
@@ -3487,6 +3519,12 @@ em_allocate_legacy(struct em_softc *sc)
}
sc->legacy_irq = 1;
}
+ sc->num_queues = 1;
+ sc->queues = malloc(sizeof(struct em_queue), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (sc->queues == NULL) {
+ printf(": couldn't allocate queues\n");
+ return (ENOMEM);
+ }
intrstr = pci_intr_string(pc, ih);
sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
@@ -3869,44 +3907,66 @@ em_allocate_msix(struct em_softc *sc)
const char *intrstr = NULL;
struct pci_attach_args *pa = &sc->osdep.em_pa;
pci_chipset_tag_t pc = pa->pa_pc;
- struct em_queue *que = sc->queues; /* Use only first queue. */
+ struct em_queue *que;
+ int nmsix;
int vec;
-
- if (!em_enable_msix)
- return (ENODEV);
+ int max_queues;
switch (sc->hw.mac_type) {
case em_82576:
case em_82580:
case em_i350:
+ max_queues = 8;
+ break;
case em_i210:
+ if (sc->hw.device_id == PCI_PRODUCT_INTEL_I211_COPPER)
+ max_queues = 2;
+ else
+ max_queues = 4;
break;
default:
return (ENODEV);
}
+ /* if we only have one vector, just use msi */
+ nmsix = pci_intr_msix_count(pa);
+ if (nmsix < 2)
+ return (ENODEV);
+
vec = 0;
if (pci_intr_map_msix(pa, vec, &ih))
return (ENODEV);
sc->msix = 1;
- que->me = vec;
- que->eims = 1 << vec;
- snprintf(que->name, sizeof(que->name), "%s:%d", DEVNAME(sc), vec);
+ nmsix--;
+ sc->intrmap = intrmap_create(&sc->sc_dev, nmsix, max_queues, INTRMAP_POWEROF2);
+ sc->num_queues = intrmap_count(sc->intrmap);
+ KASSERT(sc->num_queues > 0);
+ KASSERT(powerof2(sc->num_queues));
- intrstr = pci_intr_string(pc, ih);
- que->tag = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
- em_queue_intr_msix, que, que->name);
- if (que->tag == NULL) {
- printf(": couldn't establish interrupt");
- if (intrstr != NULL)
- printf(" at %s", intrstr);
- printf("\n");
- return (ENXIO);
+ sc->queues = mallocarray(sizeof(*que), sc->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (sc->queues == NULL)
+ return (ENOMEM);
+
+ FOREACH_QUEUE(sc, que) {
+ que->eims = 1 << vec;
+ snprintf(que->name, sizeof(que->name), "%s:%d", DEVNAME(sc), vec);
+
+ if (pci_intr_map_msix(pa, vec, &ih)) {
+ printf(": unable to map msi-x vector %d", vec);
+ return (ENXIO);
+ }
+
+ que->tag = pci_intr_establish_cpu(pc, ih, IPL_NET | IPL_MPSAFE,
+ intrmap_cpu(sc->intrmap, vec), em_queue_intr_msix, que, que->name);
+ if (que->tag == NULL) {
+ printf(": couldn't establish queue interrupt %d\n", vec);
+ return (ENXIO);
+ }
+ vec++;
}
/* Setup linkvector, use last queue vector + 1 */
- vec++;
sc->msix_linkvec = vec;
if (pci_intr_map_msix(pa, sc->msix_linkvec, &ih)) {
printf(": couldn't map link vector\n");
@@ -4096,6 +4156,40 @@ void
em_enable_queue_intr_msix(struct em_queue *que)
{
E1000_WRITE_REG(&que->sc->hw, EIMS, que->eims);
+}
+
+void
+em_setup_rss(struct em_softc *sc)
+{
+ uint32_t rss_key[10];
+ uint32_t mrqc;
+ uint32_t reta;
+ int i;
+ int queue_id;
+
+ /* set redirection table to round robin across queues */
+ reta = 0;
+ for (i = 0; i < 128; i++) {
+ queue_id = i % sc->num_queues;
+ reta = reta >> 8;
+ reta = reta | (((uint32_t) queue_id) << 24);
+ if ((i & 3) == 3) {
+ E1000_WRITE_REG(&sc->hw, RETA(i >> 2), reta);
+ reta = 0;
+ }
+ }
+
+ stoeplitz_to_key(rss_key, sizeof(rss_key));
+ for (i = 0; i < nitems(rss_key); i++)
+ E1000_WRITE_REG_ARRAY(&sc->hw, RSSRK(0), i, rss_key[i]);
+
+ mrqc = E1000_MRQC_ENABLE_RSS_8Q;
+ mrqc |= E1000_MRQC_RSS_FIELD_IPV4 | E1000_MRQC_RSS_FIELD_IPV4_TCP
+ | E1000_MRQC_RSS_FIELD_IPV6 | E1000_MRQC_RSS_FIELD_IPV6_TCP
+ | E1000_MRQC_RSS_FIELD_IPV4_UDP | E1000_MRQC_RSS_FIELD_IPV6_UDP
+ | E1000_MRQC_RSS_FIELD_IPV6_UDP_EX | E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
+ mrqc |= 0 << 3; /* default queue */
+ E1000_WRITE_REG(&sc->hw, MRQC, mrqc);
}
#endif /* !SMALL_KERNEL */
Index: dev/pci/if_em.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_em.h,v
retrieving revision 1.80
diff -u -p -r1.80 if_em.h
--- dev/pci/if_em.h 9 Jan 2022 05:42:50 -0000 1.80
+++ dev/pci/if_em.h 25 Apr 2023 11:25:47 -0000
@@ -52,9 +52,11 @@ POSSIBILITY OF SUCH DAMAGE.
#include <sys/timeout.h>
#include <sys/atomic.h>
#include <sys/kstat.h>
+#include <sys/intrmap.h>
#include <net/if.h>
#include <net/if_media.h>
+#include <net/toeplitz.h>
#include <netinet/in.h>
#include <netinet/ip.h>
@@ -449,6 +451,7 @@ struct em_softc {
uint32_t msix_queuesmask;
int num_queues;
struct em_queue *queues;
+ struct intrmap *intrmap;
struct kstat *kstat;
struct mutex kstat_mtx;