> Date: Tue, 2 Sep 2025 10:43:21 +0200 > From: Stefan Sperling <[email protected]> > > On Mon, Sep 01, 2025 at 07:23:58PM +0200, BESSOT Jean-Michel wrote: > > Hello > > > > I have the lines : > > > > qwx0: failed to setup rxd tid queue for tid 10: 12 > > qwx0: failed to setup dp for peer 18:df:26:7f:50:75 on vdev 0 (12) > > > > on my dmesg but the nework works. > > > > I attach the dmesg . > > > > I hope it can help, bye > > This means your system was too low on un-fragmented DMA memory in order > to allocate a ring with descriptors and buffers for Rx aggregation. > If network worked anyway then it was probably just a bit slower than it > could have been. > > If this happens often or causes issues the driver could be modified > to allocate this memory upfront when the system is booting. > > Or we could allow the driver to use DMA memory above the 4GB boundary > for memory related to Tx/Rx rings. The patch below does that and so far > works without issues for me on amd64.
The Linux ath11k driver definitely limits itself to 32-bit DMA for the non-PCI variants. I don't think we support those. For the PCI variant it uses a mix of 32-bit "coherent" and 36-bit "streaming" DMA. I'm not exactly sure what the difference is and why the distinction is made; this split was introduced in Linux commit dbd73acb22d8 and it hints at some hardware limitations. So at least some of the allocations that the Linux driver makes using dma_alloc_coherent() need to be below 4GB. Unfortunately you can put qwx(4) in systems that have memory above the 36-bit address boundary. Any x86 system with 64GB will have that, and even with less memory we can't be sure. So using BUS_DMA_64BIT for the DMA memory allocation isn't safe. However, you should be able to use bus_dmamem_alloc_range() with the appropriate upper limit for those allocations where the hardware is happy with a 36-bit limit. > I recall patrick@ clamping this driver for 4GB early on to fix some > problem related to loading the firmware. This was done before qwx even > provided a working network interface. So perhaps using 64-bit DMA for > packets is fine even on arm64? That doesn't really help us since mbufs are always allocated below 4GB. > Could you try this diff on your machine? 
> > commit 64c534826179113d40f77dfa20af572146d9926b (qwx64) > from: Stefan Sperling <[email protected]> > date: Tue Sep 2 08:31:17 2025 UTC > > make qwx(4) use 64-bit DMA allocations for Tx/Rx rings and related memory > > M sys/dev/ic/qwx.c | 23+ 13- > M sys/dev/ic/qwxvar.h | 1+ 1- > M sys/dev/pci/if_qwx_pci.c | 11+ 11- > > 3 files changed, 35 insertions(+), 25 deletions(-) > > commit - d5c7514e92137dc0e160a8f4f8ce9935d1c6968a > commit + 64c534826179113d40f77dfa20af572146d9926b > blob - 83d90d8a26cf1678ca1847b39a6269275b8b5333 > blob + 5abea3afe50987a9b5c3ee065114267698ce0c92 > --- sys/dev/ic/qwx.c > +++ sys/dev/ic/qwx.c > @@ -8431,7 +8431,7 @@ qwx_qmi_mem_seg_send(struct qwx_softc *sc) > } else if (sc->fwmem == NULL || QWX_DMA_LEN(sc->fwmem) < total_size) { > if (sc->fwmem != NULL) > qwx_dmamem_free(sc->sc_dmat, sc->fwmem); > - sc->fwmem = qwx_dmamem_alloc(sc->sc_dmat, total_size, 65536); > + sc->fwmem = qwx_dmamem_alloc(sc->sc_dmat, total_size, 65536, 0); > if (sc->fwmem == NULL) { > printf("%s: failed to allocate %zu bytes of DMA " > "memory for firmware\n", sc->sc_dev.dv_xname, > @@ -9287,7 +9287,7 @@ qwx_qmi_m3_load(struct qwx_softc *sc) > if (sc->m3_mem == NULL || QWX_DMA_LEN(sc->m3_mem) < len) { > if (sc->m3_mem) > qwx_dmamem_free(sc->sc_dmat, sc->m3_mem); > - sc->m3_mem = qwx_dmamem_alloc(sc->sc_dmat, len, 65536); > + sc->m3_mem = qwx_dmamem_alloc(sc->sc_dmat, len, 65536, 0); > if (sc->m3_mem == NULL) { > printf("%s: failed to allocate %zu bytes of DMA " > "memory for M3 firmware\n", sc->sc_dev.dv_xname, > @@ -9604,7 +9604,7 @@ qwx_dp_srng_setup(struct qwx_softc *sc, struct dp_srng > #endif > if (!cached) { > ring->mem = qwx_dmamem_alloc(sc->sc_dmat, ring->size, > - PAGE_SIZE); > + PAGE_SIZE, BUS_DMA_64BIT); > if (ring->mem == NULL) { > printf("%s: could not allocate DP SRNG DMA memory\n", > sc->sc_dev.dv_xname); > @@ -9804,7 +9804,7 @@ qwx_dp_link_desc_bank_alloc(struct qwx_softc *sc, > desc_sz = last_bank_sz; > > desc_bank[i].mem = 
qwx_dmamem_alloc(sc->sc_dmat, desc_sz, > - PAGE_SIZE); > + PAGE_SIZE, BUS_DMA_64BIT); > if (!desc_bank[i].mem) { > ret = ENOMEM; > goto err; > @@ -9955,7 +9955,8 @@ qwx_dp_scatter_idle_link_desc_setup(struct qwx_softc * > > for (i = 0; i < num_scatter_buf; i++) { > slist[i].mem = qwx_dmamem_alloc(sc->sc_dmat, > - HAL_WBM_IDLE_SCATTER_BUF_SIZE_MAX, PAGE_SIZE); > + HAL_WBM_IDLE_SCATTER_BUF_SIZE_MAX, PAGE_SIZE, > + BUS_DMA_64BIT); > if (slist[i].mem == NULL) { > ret = ENOMEM; > goto err; > @@ -21022,7 +21023,8 @@ qwx_hal_alloc_cont_rdp(struct qwx_softc *sc) > size_t size = sizeof(uint32_t) * HAL_SRNG_RING_ID_MAX; > > if (hal->rdpmem == NULL) { > - hal->rdpmem = qwx_dmamem_alloc(sc->sc_dmat, size, PAGE_SIZE); > + hal->rdpmem = qwx_dmamem_alloc(sc->sc_dmat, size, PAGE_SIZE, > + BUS_DMA_64BIT); > if (hal->rdpmem == NULL) { > printf("%s: could not allocate RDP DMA memory\n", > sc->sc_dev.dv_xname); > @@ -21057,7 +21059,8 @@ qwx_hal_alloc_cont_wrp(struct qwx_softc *sc) > size_t size = sizeof(uint32_t) * HAL_SRNG_NUM_LMAC_RINGS; > > if (hal->wrpmem == NULL) { > - hal->wrpmem = qwx_dmamem_alloc(sc->sc_dmat, size, PAGE_SIZE); > + hal->wrpmem = qwx_dmamem_alloc(sc->sc_dmat, size, PAGE_SIZE, > + BUS_DMA_64BIT); > if (hal->wrpmem == NULL) { > printf("%s: could not allocate WDP DMA memory\n", > sc->sc_dev.dv_xname); > @@ -24677,7 +24680,7 @@ qwx_peer_rx_tid_setup(struct qwx_softc *sc, struct iee > hw_desc_sz = qwx_hal_reo_qdesc_size(DP_BA_WIN_SZ_MAX, tid); > > rx_tid->mem = qwx_dmamem_alloc(sc->sc_dmat, hw_desc_sz, > - HAL_LINK_DESC_ALIGN); > + HAL_LINK_DESC_ALIGN, BUS_DMA_64BIT); > if (rx_tid->mem == NULL) { > #ifdef notyet > spin_unlock_bh(&ab->base_lock); > @@ -26523,7 +26526,8 @@ qwx_detach(struct qwx_softc *sc) > } > > struct qwx_dmamem * > -qwx_dmamem_alloc(bus_dma_tag_t dmat, bus_size_t size, bus_size_t align) > +qwx_dmamem_alloc(bus_dma_tag_t dmat, bus_size_t size, bus_size_t align, > + int flags) > { > struct qwx_dmamem *adm; > int nsegs; > @@ -26534,12 +26538,18 @@ 
qwx_dmamem_alloc(bus_dma_tag_t dmat, bus_size_t > size, > adm->size = size; > > if (bus_dmamap_create(dmat, size, 1, size, 0, > - BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW, &adm->map) != 0) > + BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW | flags, &adm->map) != 0) > goto admfree; > > - if (bus_dmamem_alloc_range(dmat, size, align, 0, &adm->seg, 1, > - &nsegs, BUS_DMA_NOWAIT | BUS_DMA_ZERO, 0, 0xffffffff) != 0) > - goto destroy; > + if (flags & BUS_DMA_64BIT) { > + if (bus_dmamem_alloc(dmat, size, align, 0, &adm->seg, 1, > + &nsegs, BUS_DMA_NOWAIT | BUS_DMA_ZERO | flags) != 0) > + goto destroy; > + } else { > + if (bus_dmamem_alloc_range(dmat, size, align, 0, &adm->seg, 1, > + &nsegs, BUS_DMA_NOWAIT | BUS_DMA_ZERO, 0, 0xffffffff) != 0) > + goto destroy; > + } > > if (bus_dmamem_map(dmat, &adm->seg, nsegs, size, > &adm->kva, BUS_DMA_NOWAIT | BUS_DMA_COHERENT) != 0) > blob - a45e474c188469d9c0375dd883db99993a0ce36f > blob + 3bfc0e0d70417239a76a794fcf5d38451c1b78fb > --- sys/dev/ic/qwxvar.h > +++ sys/dev/ic/qwxvar.h > @@ -473,7 +473,7 @@ struct qwx_dmamem { > caddr_t kva; > }; > > -struct qwx_dmamem *qwx_dmamem_alloc(bus_dma_tag_t, bus_size_t, bus_size_t); > +struct qwx_dmamem *qwx_dmamem_alloc(bus_dma_tag_t, bus_size_t, bus_size_t, > int); > void qwx_dmamem_free(bus_dma_tag_t, struct qwx_dmamem *); > > #define QWX_DMA_MAP(_adm) ((_adm)->map) > blob - 0dadf173db4401d6d7ad4ed761b5eeab92e95932 > blob + 1f21c39bd2b601f91342c2ad4219fc8774d28d11 > --- sys/dev/pci/if_qwx_pci.c > +++ sys/dev/pci/if_qwx_pci.c > @@ -978,7 +978,7 @@ unsupported_wcn6855_soc: > goto err_pci_disable_msi; > > psc->chan_ctxt = qwx_dmamem_alloc(sc->sc_dmat, > - sizeof(struct qwx_mhi_chan_ctxt) * psc->max_chan, 0); > + sizeof(struct qwx_mhi_chan_ctxt) * psc->max_chan, 0, 0); > if (psc->chan_ctxt == NULL) { > printf("%s: could not allocate channel context array\n", > sc->sc_dev.dv_xname); > @@ -992,7 +992,7 @@ unsupported_wcn6855_soc: > } > > psc->event_ctxt = qwx_dmamem_alloc(sc->sc_dmat, > - sizeof(struct 
qwx_mhi_event_ctxt) * QWX_NUM_EVENT_CTX, 0); > + sizeof(struct qwx_mhi_event_ctxt) * QWX_NUM_EVENT_CTX, 0, 0); > if (psc->event_ctxt == NULL) { > printf("%s: could not allocate event context array\n", > sc->sc_dev.dv_xname); > @@ -1006,7 +1006,7 @@ unsupported_wcn6855_soc: > } > > psc->cmd_ctxt = qwx_dmamem_alloc(sc->sc_dmat, > - sizeof(struct qwx_mhi_cmd_ctxt), 0); > + sizeof(struct qwx_mhi_cmd_ctxt), 0, 0); > if (psc->cmd_ctxt == NULL) { > printf("%s: could not allocate command context array\n", > sc->sc_dev.dv_xname); > @@ -1245,7 +1245,7 @@ qwx_pci_alloc_xfer_ring(struct qwx_softc *sc, struct q > > size = sizeof(struct qwx_mhi_ring_element) * num_elements; > /* Hardware requires that rings are aligned to ring size. */ > - ring->dmamem = qwx_dmamem_alloc(sc->sc_dmat, size, size); > + ring->dmamem = qwx_dmamem_alloc(sc->sc_dmat, size, size, 0); > if (ring->dmamem == NULL) > return ENOMEM; > > @@ -1396,7 +1396,7 @@ qwx_pci_alloc_event_ring(struct qwx_softc *sc, struct > > size = sizeof(struct qwx_mhi_ring_element) * num_elements; > /* Hardware requires that rings are aligned to ring size. */ > - ring->dmamem = qwx_dmamem_alloc(sc->sc_dmat, size, size); > + ring->dmamem = qwx_dmamem_alloc(sc->sc_dmat, size, size, 0); > if (ring->dmamem == NULL) > return ENOMEM; > > @@ -1451,7 +1451,7 @@ qwx_pci_init_cmd_ring(struct qwx_softc *sc, struct qwx > ring->size = sizeof(struct qwx_mhi_ring_element) * ring->num_elements; > > /* Hardware requires that rings are aligned to ring size. 
*/ > - ring->dmamem = qwx_dmamem_alloc(sc->sc_dmat, ring->size, ring->size); > + ring->dmamem = qwx_dmamem_alloc(sc->sc_dmat, ring->size, ring->size, 0); > if (ring->dmamem == NULL) > return ENOMEM; > > @@ -3263,7 +3263,7 @@ qwx_mhi_fw_load_bhi(struct qwx_pci_softc *psc, uint8_t > uint64_t paddr; > int ret; > > - data_adm = qwx_dmamem_alloc(sc->sc_dmat, len, 0); > + data_adm = qwx_dmamem_alloc(sc->sc_dmat, len, 0, 0); > if (data_adm == NULL) { > printf("%s: could not allocate BHI DMA data buffer\n", > sc->sc_dev.dv_xname); > @@ -3331,7 +3331,7 @@ qwx_mhi_fw_load_bhie(struct qwx_pci_softc *psc, uint8_ > if (psc->amss_data == NULL || QWX_DMA_LEN(psc->amss_data) < len) { > if (psc->amss_data) > qwx_dmamem_free(sc->sc_dmat, psc->amss_data); > - psc->amss_data = qwx_dmamem_alloc(sc->sc_dmat, len, 0); > + psc->amss_data = qwx_dmamem_alloc(sc->sc_dmat, len, 0, 0); > if (psc->amss_data == NULL) { > printf("%s: could not allocate BHIE DMA data buffer\n", > sc->sc_dev.dv_xname); > @@ -3343,7 +3343,7 @@ qwx_mhi_fw_load_bhie(struct qwx_pci_softc *psc, uint8_ > if (psc->amss_vec == NULL || QWX_DMA_LEN(psc->amss_vec) < vec_size) { > if (psc->amss_vec) > qwx_dmamem_free(sc->sc_dmat, psc->amss_vec); > - psc->amss_vec = qwx_dmamem_alloc(sc->sc_dmat, vec_size, 0); > + psc->amss_vec = qwx_dmamem_alloc(sc->sc_dmat, vec_size, 0, 0); > if (psc->amss_vec == NULL) { > printf("%s: could not allocate BHIE DMA vec buffer\n", > sc->sc_dev.dv_xname); > @@ -3429,7 +3429,7 @@ qwx_rddm_prepare(struct qwx_pci_softc *psc) > return; > } > > - data_adm = qwx_dmamem_alloc(sc->sc_dmat, len, 0); > + data_adm = qwx_dmamem_alloc(sc->sc_dmat, len, 0, 0); > if (data_adm == NULL) { > printf("%s: could not allocate BHIE DMA data buffer\n", > sc->sc_dev.dv_xname); > @@ -3437,7 +3437,7 @@ qwx_rddm_prepare(struct qwx_pci_softc *psc) > } > > vec_size = nseg * sizeof(*vec); > - vec_adm = qwx_dmamem_alloc(sc->sc_dmat, vec_size, 0); > + vec_adm = qwx_dmamem_alloc(sc->sc_dmat, vec_size, 0, 0); > if (vec_adm == NULL) 
{ > printf("%s: could not allocate BHIE DMA vector buffer\n", > sc->sc_dev.dv_xname); > >
