On Mon, Jan 12, 2026 at 02:03:49PM +0100, Vivek Behera wrote:
> The current implementation of igb_xsk_wakeup() expects the Rx and Tx queues
> to share the same irq. This leads to the wrong irq being triggered
> in a split irq configuration.
> This patch addresses this issue, which can affect environments
> with 2 active cpu cores
> or when the number of queues is reduced to 2 or fewer.
>
> cat /proc/interrupts | grep eno2
> 167: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0
> 0-edge eno2
> 168: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0
> 1-edge eno2-rx-0
> 169: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0
> 2-edge eno2-rx-1
> 170: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0
> 3-edge eno2-tx-0
> 171: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0
> 4-edge eno2-tx-1
>
> Furthermore it uses the flags input argument to trigger either rx, tx or
> both rx and tx irqs as specified in the ndo_xsk_wakeup api documentation
>
> Fixes: 80f6ccf9f116 ("igb: Introduce XSK data structures and helpers")
> Signed-off-by: Vivek Behera <[email protected]>
> Reviewed-by: Aleksandr Loktionov <[email protected]>
> ---
> v1:
> https://lore.kernel.org/intel-wired-lan/[email protected]/
> v2:
> https://lore.kernel.org/intel-wired-lan/[email protected]/
> v3:
> https://lore.kernel.org/intel-wired-lan/[email protected]/
> v4:
> https://lore.kernel.org/intel-wired-lan/[email protected]/
>
> changelog:
> v1
> - Initial description of the bug and the fixes made in the patch
>
> v1 -> v2
> - Handling of RX and TX Wakeup in igb_xsk_wakeup for a split IRQ configuration
> - Review suggestions by Aleksandr: Modified sequence to complete all
> error checks for rx and tx before updating napi states and triggering irqs
> - Corrected trigger of TX and RX interrupts over E1000_ICS (non msix use case)
> - Added define for Tx interrupt trigger bit mask for E1000_ICS
>
> v2 -> v3
> - Included applicable feedback and suggestions from igc patch
> - Fixed logic in updating eics value when both TX and RX need wakeup
>
> v3 -> v4
> - Added comments to explain triggering of both TX and RX with active queue
> pairs
> - Fixed check of xsk pools in if statement
>
> v4 -> v5
> - Introduced a simplified logic for sequential check for RX and TX
> ---
> .../net/ethernet/intel/igb/e1000_defines.h | 1 +
> drivers/net/ethernet/intel/igb/igb_xsk.c | 75 +++++++++++++++----
> 2 files changed, 61 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h
> b/drivers/net/ethernet/intel/igb/e1000_defines.h
> index fa028928482f..9357564a2d58 100644
> --- a/drivers/net/ethernet/intel/igb/e1000_defines.h
> +++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
> @@ -443,6 +443,7 @@
> #define E1000_ICS_LSC E1000_ICR_LSC /* Link Status Change */
> #define E1000_ICS_RXDMT0 E1000_ICR_RXDMT0 /* rx desc min. threshold */
> #define E1000_ICS_DRSTA E1000_ICR_DRSTA /* Device Reset Aserted */
> +#define E1000_ICS_TXDW E1000_ICR_TXDW /* Transmit desc written back */
>
> /* Extended Interrupt Cause Set */
> /* E1000_EITR_CNT_IGNR is only for 82576 and newer */
> diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c
> b/drivers/net/ethernet/intel/igb/igb_xsk.c
> index 30ce5fbb5b77..6e51b5b6f131 100644
> --- a/drivers/net/ethernet/intel/igb/igb_xsk.c
> +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
> @@ -529,6 +529,13 @@ int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32
> flags)
> struct igb_adapter *adapter = netdev_priv(dev);
> struct e1000_hw *hw = &adapter->hw;
> struct igb_ring *ring;
> + struct igb_q_vector *q_vector;
> + struct napi_struct *rx_napi;
> + struct napi_struct *tx_napi;
> + bool trigger_irq_tx = false;
> + bool trigger_irq_rx = false;
> + u32 eics_tx = 0;
> + u32 eics_rx = 0;
> u32 eics = 0;
>
> if (test_bit(__IGB_DOWN, &adapter->state))
> @@ -536,27 +543,65 @@ int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32
> flags)
>
> if (!igb_xdp_is_enabled(adapter))
> return -EINVAL;
> -
> - if (qid >= adapter->num_tx_queues)
> + /* Check if queue_id is valid. Tx and Rx queue numbers are always same
> */
> + if (qid >= adapter->num_rx_queues)
> return -EINVAL;
> -
> - ring = adapter->tx_ring[qid];
> -
> - if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags))
> - return -ENETDOWN;
> -
> - if (!READ_ONCE(ring->xsk_pool))
> + /* Check if flags are valid */
> + if (!(flags & XDP_WAKEUP_RX) && !(flags & XDP_WAKEUP_TX))
> return -EINVAL;
> -
> - if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
> - /* Cause software interrupt */
> + if (flags & XDP_WAKEUP_RX) {
> + /* IRQ trigger preparation for Rx */
> + ring = adapter->rx_ring[qid];
> + if (!READ_ONCE(ring->xsk_pool))
> + return -ENXIO;
> + q_vector = ring->q_vector;
> + rx_napi = &q_vector->napi;
> + /* Extend the BIT mask for eics */
> + eics_rx = ring->q_vector->eims_value;
> + trigger_irq_rx = true;
> + }
> + if (flags & XDP_WAKEUP_TX) {
> + if (adapter->flags & IGB_FLAG_QUEUE_PAIRS) {
> + /* In queue-pair mode, rx_ring and tx_ring share the same
> q_vector,
> + * so a single IRQ trigger will wake both RX and TX processing
> + */
> + } else {
> + /* IRQ trigger preparation for Tx */
> + ring = adapter->tx_ring[qid];
> + if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags))
> + return -ENETDOWN;
> +
> + if (!READ_ONCE(ring->xsk_pool))
> + return -ENXIO;
> + q_vector = ring->q_vector;
> + tx_napi = &q_vector->napi;
> + /* Extend the BIT mask for eics */
> + eics_tx = ring->q_vector->eims_value;
> + trigger_irq_tx = true;
> + }
> + }
> + /* All error checks are finished. Check and update napi states for rx
> and tx */
> + if (trigger_irq_rx) {
> + if (!napi_if_scheduled_mark_missed(rx_napi))
> + eics |= eics_rx;
> + }
> + if (trigger_irq_tx) {
> + if (!napi_if_scheduled_mark_missed(tx_napi))
> + eics |= eics_tx;
> + }
> + /* Now we trigger the required irqs for Rx and Tx */
> + if ((trigger_irq_rx) || (trigger_irq_tx)) {
> if (adapter->flags & IGB_FLAG_HAS_MSIX) {
> - eics |= ring->q_vector->eims_value;
> wr32(E1000_EICS, eics);
> } else {
> - wr32(E1000_ICS, E1000_ICS_RXDMT0);
> + if ((trigger_irq_rx) && (trigger_irq_tx))
> + wr32(E1000_ICS,
> + E1000_ICS_RXDMT0 | E1000_ICS_TXDW);
> + else if (trigger_irq_rx)
> + wr32(E1000_ICS, E1000_ICS_RXDMT0);
> + else
> + wr32(E1000_ICS, E1000_ICS_TXDW);
My understanding is that something like the below would be sufficient. Bits set in
E1000_ICS are not handled in any way, so we don't have to distinguish
between rx/tx; it's just a matter of triggering the irq and scheduling napi.
-----------------8<-----------------
diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c
b/drivers/net/ethernet/intel/igb/igb_xsk.c
index 30ce5fbb5b77..0aba7afd6a03 100644
--- a/drivers/net/ethernet/intel/igb/igb_xsk.c
+++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
@@ -524,12 +524,26 @@ bool igb_xmit_zc(struct igb_ring *tx_ring, struct
xsk_buff_pool *xsk_pool)
return nb_pkts < budget;
}
+/* Mark NAPI missed if it is already scheduled, otherwise raise a SW irq */
+static void igb_sw_irq(struct igb_q_vector *q_vector)
+{
+	struct igb_adapter *adapter = q_vector->adapter;
+	struct e1000_hw *hw = &adapter->hw; /* wr32() expects a local 'hw' */
+
+	if (napi_if_scheduled_mark_missed(&q_vector->napi))
+		return;
+	/* Cause software interrupt */
+	if (adapter->flags & IGB_FLAG_HAS_MSIX)
+		wr32(E1000_EICS, q_vector->eims_value);
+	else
+		wr32(E1000_ICS, E1000_ICS_RXDMT0);
+}
+
int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
struct igb_adapter *adapter = netdev_priv(dev);
struct e1000_hw *hw = &adapter->hw;
struct igb_ring *ring;
- u32 eics = 0;
if (test_bit(__IGB_DOWN, &adapter->state))
return -ENETDOWN;
@@ -548,14 +562,15 @@ int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32
flags)
if (!READ_ONCE(ring->xsk_pool))
return -EINVAL;
- if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
- /* Cause software interrupt */
- if (adapter->flags & IGB_FLAG_HAS_MSIX) {
- eics |= ring->q_vector->eims_value;
- wr32(E1000_EICS, eics);
- } else {
- wr32(E1000_ICS, E1000_ICS_RXDMT0);
- }
+ if (flags & XDP_WAKEUP_TX)
+ igb_sw_irq(ring->q_vector);
+
+ if (flags & XDP_WAKEUP_RX) {
+ ring = adapter->rx_ring[qid];
+ /* for !IGB_FLAG_QUEUE_PAIRS, this will be NOP as NAPI has
+ * been already marked with NAPIF_STATE_MISSED
+ */
+ igb_sw_irq(ring->q_vector);
}
return 0;
----------------->8-----------------
> }
> }
> -
> return 0;
> }
> --
> 2.34.1
>