On Mon, Jan 12, 2026 at 02:03:49PM +0100, Vivek Behera wrote:
> The current implementation in the igb_xsk_wakeup expects the Rx and Tx queues
> to share the same irq. This would lead to triggering of incorrect irq
> in split irq configuration.
> This patch addresses this issue which could impact environments
> with 2 active cpu cores
> or when the number of queues is reduced to 2 or less
> 
> cat /proc/interrupts | grep eno2
>  167:          0          0          0          0 IR-PCI-MSIX-0000:08:00.0
>  0-edge      eno2
>  168:          0          0          0          0 IR-PCI-MSIX-0000:08:00.0
>  1-edge      eno2-rx-0
>  169:          0          0          0          0 IR-PCI-MSIX-0000:08:00.0
>  2-edge      eno2-rx-1
>  170:          0          0          0          0 IR-PCI-MSIX-0000:08:00.0
>  3-edge      eno2-tx-0
>  171:          0          0          0          0 IR-PCI-MSIX-0000:08:00.0
>  4-edge      eno2-tx-1
> 
> Furthermore it uses the flags input argument to trigger either rx, tx or
> both rx and tx irqs as specified in the ndo_xsk_wakeup api documentation
> 
> Fixes: 80f6ccf9f116 ("igb: Introduce XSK data structures and helpers")
> Signed-off-by: Vivek Behera <[email protected]>
> Reviewed-by: Aleksandr Loktionov <[email protected]>
> ---
> v1: 
> https://lore.kernel.org/intel-wired-lan/[email protected]/
> v2: 
> https://lore.kernel.org/intel-wired-lan/[email protected]/
> v3: 
> https://lore.kernel.org/intel-wired-lan/[email protected]/
> v4: 
> https://lore.kernel.org/intel-wired-lan/[email protected]/
> 
> changelog:
> v1
> - Inital description of the Bug and fixes made in the patch
> 
> v1 -> v2
> - Handling of RX and TX Wakeup in igc_xsk_wakeup for a split IRQ configuration
> - Review suggestions by Aleksander: Modified sequence to complete all
>   error checks for rx and tx before updating napi states and triggering irqs
> - Corrected trigger of TX and RX interrupts over E1000_ICS (non msix use case)
> - Added define for Tx interrupt trigger bit mask for E1000_ICS
> 
> v2 -> v3
> - Included applicable feedback and suggestions from igc patch
> - Fixed logic in updating eics value when  both TX and RX need wakeup
> 
> v3 -> v4
> - Added comments to explain trigerring of both TX and RX with active queue 
> pairs
> - Fixed check of xsk pools in if statement
> 
> v4 -> v5
> - Introduced a simplified logic for sequential check for RX and TX
> ---
>  .../net/ethernet/intel/igb/e1000_defines.h    |  1 +
>  drivers/net/ethernet/intel/igb/igb_xsk.c      | 75 +++++++++++++++----
>  2 files changed, 61 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h 
> b/drivers/net/ethernet/intel/igb/e1000_defines.h
> index fa028928482f..9357564a2d58 100644
> --- a/drivers/net/ethernet/intel/igb/e1000_defines.h
> +++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
> @@ -443,6 +443,7 @@
>  #define E1000_ICS_LSC       E1000_ICR_LSC       /* Link Status Change */
>  #define E1000_ICS_RXDMT0    E1000_ICR_RXDMT0    /* rx desc min. threshold */
>  #define E1000_ICS_DRSTA     E1000_ICR_DRSTA     /* Device Reset Aserted */
> +#define E1000_ICS_TXDW      E1000_ICR_TXDW   /* Transmit desc written back */
>  
>  /* Extended Interrupt Cause Set */
>  /* E1000_EITR_CNT_IGNR is only for 82576 and newer */
> diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c 
> b/drivers/net/ethernet/intel/igb/igb_xsk.c
> index 30ce5fbb5b77..6e51b5b6f131 100644
> --- a/drivers/net/ethernet/intel/igb/igb_xsk.c
> +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
> @@ -529,6 +529,13 @@ int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 
> flags)
>       struct igb_adapter *adapter = netdev_priv(dev);
>       struct e1000_hw *hw = &adapter->hw;
>       struct igb_ring *ring;
> +     struct igb_q_vector *q_vector;
> +     struct napi_struct *rx_napi;
> +     struct napi_struct *tx_napi;
> +     bool trigger_irq_tx = false;
> +     bool trigger_irq_rx = false;
> +     u32 eics_tx = 0;
> +     u32 eics_rx = 0;
>       u32 eics = 0;
>  
>       if (test_bit(__IGB_DOWN, &adapter->state))
> @@ -536,27 +543,65 @@ int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 
> flags)
>  
>       if (!igb_xdp_is_enabled(adapter))
>               return -EINVAL;
> -
> -     if (qid >= adapter->num_tx_queues)
> +     /* Check if queue_id is valid. Tx and Rx queue numbers are always same 
> */
> +     if (qid >= adapter->num_rx_queues)
>               return -EINVAL;
> -
> -     ring = adapter->tx_ring[qid];
> -
> -     if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags))
> -             return -ENETDOWN;
> -
> -     if (!READ_ONCE(ring->xsk_pool))
> +     /* Check if flags are valid */
> +     if (!(flags & XDP_WAKEUP_RX) && !(flags & XDP_WAKEUP_TX))
>               return -EINVAL;
> -
> -     if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
> -             /* Cause software interrupt */
> +     if (flags & XDP_WAKEUP_RX) {
> +             /* IRQ trigger preparation for Rx */
> +             ring = adapter->rx_ring[qid];
> +             if (!READ_ONCE(ring->xsk_pool))
> +                     return -ENXIO;
> +             q_vector = ring->q_vector;
> +             rx_napi = &q_vector->napi;
> +             /* Extend the BIT mask for eics */
> +             eics_rx = ring->q_vector->eims_value;
> +             trigger_irq_rx = true;
> +     }
> +     if (flags & XDP_WAKEUP_TX) {
> +             if (adapter->flags & IGB_FLAG_QUEUE_PAIRS) {
> +             /* In queue-pair mode, rx_ring and tx_ring share the same 
> q_vector,
> +              * so a single IRQ trigger will wake both RX and TX processing
> +              */
> +             } else {
> +                     /* IRQ trigger preparation for Tx */
> +                     ring = adapter->tx_ring[qid];
> +                     if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags))
> +                             return -ENETDOWN;
> +
> +                     if (!READ_ONCE(ring->xsk_pool))
> +                             return -ENXIO;
> +                     q_vector = ring->q_vector;
> +                     tx_napi = &q_vector->napi;
> +                     /* Extend the BIT mask for eics */
> +                     eics_tx = ring->q_vector->eims_value;
> +                     trigger_irq_tx = true;
> +             }
> +     }
> +     /* All error checks are finished. Check and update napi states for rx 
> and tx */
> +     if (trigger_irq_rx) {
> +             if (!napi_if_scheduled_mark_missed(rx_napi))
> +                     eics |= eics_rx;
> +     }
> +     if (trigger_irq_tx) {
> +             if (!napi_if_scheduled_mark_missed(tx_napi))
> +                     eics |= eics_tx;
> +     }
> +     /* Now we trigger the required irqs for Rx and Tx */
> +     if ((trigger_irq_rx) || (trigger_irq_tx)) {
>               if (adapter->flags & IGB_FLAG_HAS_MSIX) {
> -                     eics |= ring->q_vector->eims_value;
>                       wr32(E1000_EICS, eics);
>               } else {
> -                     wr32(E1000_ICS, E1000_ICS_RXDMT0);
> +                     if ((trigger_irq_rx) && (trigger_irq_tx))
> +                             wr32(E1000_ICS,
> +                                  E1000_ICS_RXDMT0 | E1000_ICS_TXDW);
> +                     else if (trigger_irq_rx)
> +                             wr32(E1000_ICS, E1000_ICS_RXDMT0);
> +                     else
> +                             wr32(E1000_ICS, E1000_ICS_TXDW);

My understanding is something below would be sufficient. Bits set on
E1000_ICS are not handled in any way so we don't have to distinguish
between rx/tx, it's just the matter of irq trigger and napi schedule.

-----------------8<-----------------

diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c 
b/drivers/net/ethernet/intel/igb/igb_xsk.c
index 30ce5fbb5b77..0aba7afd6a03 100644
--- a/drivers/net/ethernet/intel/igb/igb_xsk.c
+++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
@@ -524,12 +524,26 @@ bool igb_xmit_zc(struct igb_ring *tx_ring, struct 
xsk_buff_pool *xsk_pool)
        return nb_pkts < budget;
 }
 
+static void igb_sw_irq(struct igb_q_vector *q_vector)
+{
+       u32 eics = 0;
+
+       if (!napi_if_scheduled_mark_missed(&q_vector->napi)) {
+               /* Cause software interrupt */
+               if (adapter->flags & IGB_FLAG_HAS_MSIX) {
+                       eics |= ring->q_vector->eims_value;
+                       wr32(E1000_EICS, eics);
+               } else {
+                       wr32(E1000_ICS, E1000_ICS_RXDMT0);
+               }
+       }
+}
+
 int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 {
        struct igb_adapter *adapter = netdev_priv(dev);
        struct e1000_hw *hw = &adapter->hw;
        struct igb_ring *ring;
-       u32 eics = 0;
 
        if (test_bit(__IGB_DOWN, &adapter->state))
                return -ENETDOWN;
@@ -548,14 +562,15 @@ int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 
flags)
        if (!READ_ONCE(ring->xsk_pool))
                return -EINVAL;
 
-       if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
-               /* Cause software interrupt */
-               if (adapter->flags & IGB_FLAG_HAS_MSIX) {
-                       eics |= ring->q_vector->eims_value;
-                       wr32(E1000_EICS, eics);
-               } else {
-                       wr32(E1000_ICS, E1000_ICS_RXDMT0);
-               }
+       if (flags & XDP_WAKEUP_TX)
+               igb_sw_irq(ring->q_vector);
+
+       if (flags & XDP_WAKEUP_RX) {
+               ring = adapter->rx_ring[qid];
+               /* for !IGB_FLAG_QUEUE_PAIRS, this will be NOP as NAPI has
+                * been already marked with NAPIF_STATE_MISSED
+                */
+               igb_sw_irq(ring->q_vector);
        }
 
        return 0;

----------------->8-----------------

>               }
>       }
> -
>       return 0;
>  }
> -- 
> 2.34.1
> 

Reply via email to