For the i40e driver, enable direct re-arm mode. This patch supports the case
where the mapped Rx/Tx queues are handled by the same single lcore.

Suggested-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
Signed-off-by: Feifei Wang <feifei.wa...@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
---
  drivers/net/i40e/i40e_rxtx.h            |   4 +
  drivers/net/i40e/i40e_rxtx_common_avx.h | 269 ++++++++++++++++++++++++
  drivers/net/i40e/i40e_rxtx_vec_avx2.c   |  14 +-
  drivers/net/i40e/i40e_rxtx_vec_avx512.c | 249 +++++++++++++++++++++-
  drivers/net/i40e/i40e_rxtx_vec_neon.c   | 141 ++++++++++++-
  drivers/net/i40e/i40e_rxtx_vec_sse.c    | 170 ++++++++++++++-
  6 files changed, 839 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 5e6eecc501..1fdf4305f4 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -102,6 +102,8 @@ struct i40e_rx_queue {
uint16_t rxrearm_nb; /**< number of remaining to be re-armed */
        uint16_t rxrearm_start; /**< the idx we start the re-arming from */
+       uint16_t direct_rxrearm_port; /** device TX port ID for direct re-arm mode */
+       uint16_t direct_rxrearm_queue; /** TX queue index for direct re-arm mode */
        uint64_t mbuf_initializer; /**< value to init mbufs */
uint16_t port_id; /**< device port ID */
@@ -121,6 +123,8 @@ struct i40e_rx_queue {
        uint16_t rx_using_sse; /**<flag indicate the usage of vPMD for rx */
        uint8_t dcb_tc;         /**< Traffic class of rx queue */
        uint64_t offloads; /**< Rx offload flags of RTE_ETH_RX_OFFLOAD_* */
+       /**<  0 if direct re-arm mode disabled, 1 when enabled */
+       bool direct_rxrearm_enable;
        const struct rte_memzone *mz;
  };
diff --git a/drivers/net/i40e/i40e_rxtx_common_avx.h b/drivers/net/i40e/i40e_rxtx_common_avx.h
index cfc1e63173..a742723e07 100644
--- a/drivers/net/i40e/i40e_rxtx_common_avx.h
+++ b/drivers/net/i40e/i40e_rxtx_common_avx.h
@@ -209,6 +209,275 @@ i40e_rxq_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
        /* Update the tail pointer on the NIC */
        I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
  }
+
+static __rte_always_inline void
+i40e_rxq_direct_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
+{
+       struct rte_eth_dev *dev;
+       struct i40e_tx_queue *txq;
+       volatile union i40e_rx_desc *rxdp;
+       struct i40e_tx_entry *txep;
+       struct i40e_rx_entry *rxep;
+       struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
+       uint16_t tx_port_id, tx_queue_id;
+       uint16_t rx_id;
+       uint16_t i, n;
+       uint16_t nb_rearm = 0;
+
+       rxdp = rxq->rx_ring + rxq->rxrearm_start;
+       rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+       tx_port_id = rxq->direct_rxrearm_port;
+       tx_queue_id = rxq->direct_rxrearm_queue;
+       dev = &rte_eth_devices[tx_port_id];
+       txq = dev->data->tx_queues[tx_queue_id];
+
+       /* check Rx queue is able to take in the whole
+        * batch of free mbufs from Tx queue
+        */
+       if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+               /* check DD bits on threshold descriptor */
+               if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+                               rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+                               rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+                       goto mempool_bulk;
+               }
+
+               if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
+                       goto mempool_bulk;

I think all these checks (whether this mode can be enabled) should be done at the config phase, not in the data path.
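Something along these lines could be done once when the Rx/Tx queue pair is mapped, instead of on every burst. Just a rough sketch; i40e_rxq_direct_rearm_check() is an invented name, not an existing function:

static int
i40e_rxq_direct_rearm_check(struct i40e_rx_queue *rxq)
{
	struct rte_eth_dev *dev = &rte_eth_devices[rxq->direct_rxrearm_port];
	struct i40e_tx_queue *txq = dev->data->tx_queues[rxq->direct_rxrearm_queue];

	/* a full Tx free batch must exactly fill one Rx re-arm batch */
	if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
		return -EINVAL;

	return 0;
}

Then the data path would only need the DD-bit check on the threshold descriptor.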

+
+               n = txq->tx_rs_thresh;
+
+               /* first buffer to free from S/W ring is at index
+                * tx_next_dd - (tx_rs_thresh-1)
+                */
+               txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];


It really looks bad that the RX function accesses and modifies TXQ data directly. It would be much better to hide the TXD checking/manipulation in a separate TXQ function (txq_mbuf() or so) that the RX path can invoke.
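For example, something like the sketch below would keep all TXD checking and sw_ring bookkeeping on the Tx side; i40e_txq_take_done_bufs() is just an invented name and the fast-free path is left out for brevity:

static uint16_t
i40e_txq_take_done_bufs(struct i40e_tx_queue *txq, struct rte_mbuf **bufs)
{
	struct i40e_tx_entry *txep;
	uint16_t i, n = 0;

	/* nothing to hand out if the threshold descriptor is not done yet */
	if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
		return 0;

	/* bufs[] must hold at least tx_rs_thresh pointers */
	txep = &txq->sw_ring[txq->tx_next_dd - (txq->tx_rs_thresh - 1)];
	for (i = 0; i < txq->tx_rs_thresh; i++) {
		bufs[n] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
		if (bufs[n] != NULL)
			n++;
	}

	/* TXQ state is updated here, not in the Rx re-arm code */
	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
	if (txq->tx_next_dd >= txq->nb_tx_desc)
		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);

	return n;
}

The Rx re-arm path would then only consume the returned array of mbufs.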

+
+               if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+                       /* directly put mbufs from Tx to Rx,
+                        * and initialize the mbufs in vector
+                        */
+                       for (i = 0; i < n; i++)
+                               rxep[i].mbuf = txep[i].mbuf;
+               } else {
+                       for (i = 0; i < n; i++) {
+                               m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+                               /* ensure each Tx freed buffer is valid */
+                               if (m[i] != NULL)
+                                       nb_rearm++;
+                       }
+
+                       if (nb_rearm != n) {
+                               txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+                               txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+                               if (txq->tx_next_dd >= txq->nb_tx_desc)
+                                       txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);


So if nb_rearm != n (but non-zero), what would happen with the mbufs already collected in m[]?
Are you just dropping/forgetting them?
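If the fallback to the mempool path is kept, the already-collected mbufs could at least be given back to their pool before the goto, e.g. (sketch only, keeping the existing counter updates and goto as they are):

	if (nb_rearm != n) {
		/* return the buffers we already took ownership of
		 * instead of leaking them
		 */
		for (i = 0; i < n; i++)
			if (m[i] != NULL)
				rte_mempool_put(m[i]->pool, m[i]);
		/* then update the Tx counters and goto mempool_bulk as before */
	}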


+
+                               goto mempool_bulk;

+                       } else {
+                               for (i = 0; i < n; i++)
+                                       rxep[i].mbuf = m[i];
+                       }
+               }
+
+               /* update counters for Tx */
+               txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+               txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+               if (txq->tx_next_dd >= txq->nb_tx_desc)
+                       txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+       } else {


I suppose the chunk of code below is just a copy&paste of the existing i40e_rxq_rearm_common()?
If so, there is no point in duplicating it; better to just invoke it here (I presume a bit of
refactoring would be needed for that) -- see the sketch below.

Pretty much the same thoughts for the other rearm functions below.
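I.e. the else branch could reduce to something like this (sketch only; it relies on the existing helper doing the descriptor/tail update itself, so the duplicated update at the end of this function would have to be skipped in that case):

	} else {
		/* Tx had nothing to give us, fall back to the existing path */
		i40e_rxq_rearm_common(rxq, avx512);
		return;
	}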

+mempool_bulk:
+               /* if TX did not free bufs into Rx sw-ring,
+                * get new bufs from mempool
+                */
+               n = RTE_I40E_RXQ_REARM_THRESH;
+
+               /* Pull 'n' more MBUFs into the software ring */
+               if (rte_mempool_get_bulk(rxq->mp,
+                                       (void *)rxep,
+                                       RTE_I40E_RXQ_REARM_THRESH) < 0) {
+                       if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+                               rxq->nb_rx_desc) {
+                               __m128i dma_addr0;
+                               dma_addr0 = _mm_setzero_si128();
+                               for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+                                       rxep[i].mbuf = &rxq->fake_mbuf;
+                                       _mm_store_si128((__m128i *)&rxdp[i].read,
+                                                       dma_addr0);
+                               }
+                       }
+                       rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+                               RTE_I40E_RXQ_REARM_THRESH;
+                       return;
+               }
+       }
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+       struct rte_mbuf *mb0, *mb1;
+       __m128i dma_addr0, dma_addr1;
+       __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+                       RTE_PKTMBUF_HEADROOM);
+       /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+       for (i = 0; i < n; i += 2, rxep += 2) {
+               __m128i vaddr0, vaddr1;
+
+               mb0 = rxep[0].mbuf;
+               mb1 = rxep[1].mbuf;
+
+               /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+               RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+                               offsetof(struct rte_mbuf, buf_addr) + 8);
+               vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+               vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+               /* convert pa to dma_addr hdr/data */
+               dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+               dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+               /* add headroom to pa values */
+               dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+               dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+               /* flush desc with pa dma_addr */
+               _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+               _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+       }
+#else
+#ifdef __AVX512VL__
+       if (avx512) {
+               struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+               struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
+               __m512i dma_addr0_3, dma_addr4_7;
+               __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+               /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+               for (i = 0; i < n; i += 8, rxep += 8, rxdp += 8) {
+                       __m128i vaddr0, vaddr1, vaddr2, vaddr3;
+                       __m128i vaddr4, vaddr5, vaddr6, vaddr7;
+                       __m256i vaddr0_1, vaddr2_3;
+                       __m256i vaddr4_5, vaddr6_7;
+                       __m512i vaddr0_3, vaddr4_7;
+
+                       mb0 = rxep[0].mbuf;
+                       mb1 = rxep[1].mbuf;
+                       mb2 = rxep[2].mbuf;
+                       mb3 = rxep[3].mbuf;
+                       mb4 = rxep[4].mbuf;
+                       mb5 = rxep[5].mbuf;
+                       mb6 = rxep[6].mbuf;
+                       mb7 = rxep[7].mbuf;
+
+                       /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+                       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+                                       offsetof(struct rte_mbuf, buf_addr) + 8);
+                       vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+                       vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+                       vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+                       vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+                       vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
+                       vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
+                       vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
+                       vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
+
+                       /**
+                        * merge 0 & 1, by casting 0 to 256-bit and inserting 1
+                        * into the high lanes. Similarly for 2 & 3, and so on.
+                        */
+                       vaddr0_1 =
+                               _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
+                                                       vaddr1, 1);
+                       vaddr2_3 =
+                               _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
+                                                       vaddr3, 1);
+                       vaddr4_5 =
+                               _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
+                                                       vaddr5, 1);
+                       vaddr6_7 =
+                               _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
+                                                       vaddr7, 1);
+                       vaddr0_3 =
+                               _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
+                                                  vaddr2_3, 1);
+                       vaddr4_7 =
+                               _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
+                                                  vaddr6_7, 1);
+
+                       /* convert pa to dma_addr hdr/data */
+                       dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
+                       dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
+
+                       /* add headroom to pa values */
+                       dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
+                       dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
+
+                       /* flush desc with pa dma_addr */
+                       _mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3);
+                       _mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7);
+               }
+       } else {
+#endif /* __AVX512VL__*/
+               struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+               __m256i dma_addr0_1, dma_addr2_3;
+               __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
+               /* Initialize the mbufs in vector, process 4 mbufs in one loop */
+               for (i = 0; i < n; i += 4, rxep += 4, rxdp += 4) {
+                       __m128i vaddr0, vaddr1, vaddr2, vaddr3;
+                       __m256i vaddr0_1, vaddr2_3;
+
+                       mb0 = rxep[0].mbuf;
+                       mb1 = rxep[1].mbuf;
+                       mb2 = rxep[2].mbuf;
+                       mb3 = rxep[3].mbuf;
+
+                       /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+                       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+                                       offsetof(struct rte_mbuf, buf_addr) + 8);
+                       vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+                       vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+                       vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+                       vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+
+                       /**
+                        * merge 0 & 1, by casting 0 to 256-bit and inserting 1
+                        * into the high lanes. Similarly for 2 & 3
+                        */
+                       vaddr0_1 = _mm256_inserti128_si256
+                               (_mm256_castsi128_si256(vaddr0), vaddr1, 1);
+                       vaddr2_3 = _mm256_inserti128_si256
+                               (_mm256_castsi128_si256(vaddr2), vaddr3, 1);
+
+                       /* convert pa to dma_addr hdr/data */
+                       dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1);
+                       dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3);
+
+                       /* add headroom to pa values */
+                       dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room);
+                       dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room);
+
+                       /* flush desc with pa dma_addr */
+                       _mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1);
+                       _mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3);
+               }
+       }
+
+#endif
+
+       /* Update the descriptor initializer index */
+       rxq->rxrearm_start += n;
+       rx_id = rxq->rxrearm_start - 1;
+
+       if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+               rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+               if (!rxq->rxrearm_start)
+                       rx_id = rxq->nb_rx_desc - 1;
+               else
+                       rx_id = rxq->rxrearm_start - 1;
+       }
+
+       rxq->rxrearm_nb -= n;
+
+       /* Update the tail pointer on the NIC */
+       I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
  #endif /* __AVX2__*/
#endif /*_I40E_RXTX_COMMON_AVX_H_*/
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index c73b2a321b..fcb7ba0273 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -25,6 +25,12 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
        return i40e_rxq_rearm_common(rxq, false);
  }
+static __rte_always_inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+       return i40e_rxq_direct_rearm_common(rxq, false);
+}
+
  #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
  /* Handles 32B descriptor FDIR ID processing:
   * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
@@ -128,8 +134,12 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
        /* See if we need to rearm the RX queue - gives the prefetch a bit
         * of time to act
         */
-       if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
-               i40e_rxq_rearm(rxq);
+       if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+               if (rxq->direct_rxrearm_enable)
+                       i40e_rxq_direct_rearm(rxq);
+               else
+                       i40e_rxq_rearm(rxq);
+       }
/* Before we start moving massive data around, check to see if
         * there is actually a packet available
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 2e8a3f0df6..d967095edc 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -21,6 +21,12 @@
 #define RTE_I40E_DESCS_PER_LOOP_AVX 8

+enum i40e_direct_rearm_type_value {
+       I40E_DIRECT_REARM_TYPE_NORMAL           = 0x0,
+       I40E_DIRECT_REARM_TYPE_FAST_FREE        = 0x1,
+       I40E_DIRECT_REARM_TYPE_PRE_FREE         = 0x2,
+};
+
  static __rte_always_inline void
  i40e_rxq_rearm(struct i40e_rx_queue *rxq)
  {
@@ -150,6 +156,241 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
        I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
  }
+static __rte_always_inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+       struct rte_eth_dev *dev;
+       struct i40e_tx_queue *txq;
+       volatile union i40e_rx_desc *rxdp;
+       struct i40e_vec_tx_entry *txep;
+       struct i40e_rx_entry *rxep;
+       struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
+       uint16_t tx_port_id, tx_queue_id;
+       uint16_t rx_id;
+       uint16_t i, n;
+       uint16_t j = 0;
+       uint16_t nb_rearm = 0;
+       enum i40e_direct_rearm_type_value type;
+       struct rte_mempool_cache *cache = NULL;
+
+       rxdp = rxq->rx_ring + rxq->rxrearm_start;
+       rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+       tx_port_id = rxq->direct_rxrearm_port;
+       tx_queue_id = rxq->direct_rxrearm_queue;
+       dev = &rte_eth_devices[tx_port_id];
+       txq = dev->data->tx_queues[tx_queue_id];
+
+       /* check Rx queue is able to take in the whole
+        * batch of free mbufs from Tx queue
+        */
+       if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+               /* check DD bits on threshold descriptor */
+               if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+                       rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+                       rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+                       goto mempool_bulk;
+               }
+
+               if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
+                       goto mempool_bulk;
+
+               n = txq->tx_rs_thresh;
+
+               /* first buffer to free from S/W ring is at index
+                * tx_next_dd - (tx_rs_thresh-1)
+                */
+               txep = (void *)txq->sw_ring;
+               txep += txq->tx_next_dd - (n - 1);
+
+               if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+                       /* directly put mbufs from Tx to Rx */
+                       uint32_t copied = 0;
+                       /* n is multiple of 32 */
+                       while (copied < n) {
+                               const __m512i a = _mm512_load_si512(&txep[copied]);
+                               const __m512i b = _mm512_load_si512(&txep[copied + 8]);
+                               const __m512i c = _mm512_load_si512(&txep[copied + 16]);
+                               const __m512i d = _mm512_load_si512(&txep[copied + 24]);
+
+                               _mm512_storeu_si512(&rxep[copied], a);
+                               _mm512_storeu_si512(&rxep[copied + 8], b);
+                               _mm512_storeu_si512(&rxep[copied + 16], c);
+                               _mm512_storeu_si512(&rxep[copied + 24], d);
+                               copied += 32;
+                       }
+                       type = I40E_DIRECT_REARM_TYPE_FAST_FREE;
+               } else {
+                       for (i = 0; i < n; i++) {
+                               m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+                               /* ensure each Tx freed buffer is valid */
+                               if (m[i] != NULL)
+                                       nb_rearm++;
+                       }
+
+                       if (nb_rearm != n) {
+                               txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+                               txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+                               if (txq->tx_next_dd >= txq->nb_tx_desc)
+                                       txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+                               goto mempool_bulk;
+                       } else {
+                               type = I40E_DIRECT_REARM_TYPE_PRE_FREE;
+                       }
+               }
+
+               /* update counters for Tx */
+               txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+               txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+               if (txq->tx_next_dd >= txq->nb_tx_desc)
+                       txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+       } else {
+mempool_bulk:
+               cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id());
+
+               if (unlikely(!cache))
+                       return i40e_rxq_rearm_common(rxq, true);
+
+               n = RTE_I40E_RXQ_REARM_THRESH;
+
+               /* We need to pull 'n' more MBUFs into the software ring from mempool
+                * We inline the mempool function here, so we can vectorize the copy
+                * from the cache into the shadow ring.
+                */
+
+               if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
+                       /* No. Backfill the cache first, and then fill from it */
+                       uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
+                                       cache->len);
+
+                       /* How many do we require
+                        * i.e. number to fill the cache + the request
+                        */
+                       int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
+                                       &cache->objs[cache->len], req);
+                       if (ret == 0) {
+                               cache->len += req;
+                       } else {
+                               if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+                                               rxq->nb_rx_desc) {
+                                       __m128i dma_addr0;
+
+                                       dma_addr0 = _mm_setzero_si128();
+                                       for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+                                               rxep[i].mbuf = &rxq->fake_mbuf;
+                                               _mm_store_si128
+                                                       ((__m128i *)&rxdp[i].read,
+                                                               dma_addr0);
+                                       }
+                               }
+                               rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+                                               RTE_I40E_RXQ_REARM_THRESH;
+                               return;
+                       }
+               }
+
+               type = I40E_DIRECT_REARM_TYPE_NORMAL;
+       }
+
+       const __m512i iova_offsets =  _mm512_set1_epi64
+               (offsetof(struct rte_mbuf, buf_iova));
+       const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+       /* to shuffle the addresses to correct slots. Values 4-7 will contain
+        * zeros, so use 7 for a zero-value.
+        */
+       const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
+#else
+       const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
+#endif
+
+       __m512i mbuf_ptrs;
+
+       /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
+        * from mempool cache and populating both shadow and HW rings
+        */
+       for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
+               switch (type) {
+               case I40E_DIRECT_REARM_TYPE_FAST_FREE:
+                       mbuf_ptrs = _mm512_loadu_si512(rxep);
+                       break;
+               case I40E_DIRECT_REARM_TYPE_PRE_FREE:
+                       mbuf_ptrs = _mm512_loadu_si512(&m[j]);
+                       _mm512_store_si512(rxep, mbuf_ptrs);
+                       j += 8;
+                       break;
+               case I40E_DIRECT_REARM_TYPE_NORMAL:
+                       mbuf_ptrs = _mm512_loadu_si512
+                               (&cache->objs[cache->len - 8]);
+                       _mm512_store_si512(rxep, mbuf_ptrs);
+                       cache->len -= 8;
+                       break;
+               }
+
+               /* gather iova of mbuf0-7 into one zmm reg */
+               const __m512i iova_base_addrs = _mm512_i64gather_epi64
+                       (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
+                               0, /* base */
+                               1 /* scale */);
+               const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
+                               headroom);
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+               const __m512i iovas0 = _mm512_castsi256_si512
+                       (_mm512_extracti64x4_epi64(iova_addrs, 0));
+               const __m512i iovas1 = _mm512_castsi256_si512
+                       (_mm512_extracti64x4_epi64(iova_addrs, 1));
+
+               /* permute leaves desc 2-3 addresses in header address slots 0-1
+                * but these are ignored by driver since header split not
+                * enabled. Similarly for desc 4 & 5.
+                */
+               const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64
+                       (permute_idx, iovas0);
+               const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8);
+
+               const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64
+                       (permute_idx, iovas1);
+               const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8);
+
+               _mm512_store_si512((void *)rxdp, desc_rd_0_1);
+               _mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3);
+               _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5);
+               _mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7);
+#else
+               /* permute leaves desc 4-7 addresses in header address slots 0-3
+                * but these are ignored by driver since header split not
+                * enabled.
+                */
+               const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64
+                       (permute_idx, iova_addrs);
+               const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8);
+
+               _mm512_store_si512((void *)rxdp, desc_rd_0_3);
+               _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7);
+#endif
+               rxdp += 8, rxep += 8;
+       }
+
+       /* Update the descriptor initializer index */
+       rxq->rxrearm_start += n;
+       rx_id = rxq->rxrearm_start - 1;
+
+       if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+               rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+               if (!rxq->rxrearm_start)
+                       rx_id = rxq->nb_rx_desc - 1;
+               else
+                       rx_id = rxq->rxrearm_start - 1;
+       }
+
+       rxq->rxrearm_nb -= n;
+
+       /* Update the tail pointer on the NIC */
+       I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
+
  #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
  /* Handles 32B descriptor FDIR ID processing:
   * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
@@ -252,8 +493,12 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
        /* See if we need to rearm the RX queue - gives the prefetch a bit
         * of time to act
         */
-       if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
-               i40e_rxq_rearm(rxq);
+       if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+               if (rxq->direct_rxrearm_enable)
+                       i40e_rxq_direct_rearm(rxq);
+               else
+                       i40e_rxq_rearm(rxq);
+       }
/* Before we start moving massive data around, check to see if
         * there is actually a packet available
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index fa9e6582c5..dc78e3c90b 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -77,6 +77,139 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
        I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
  }
+static inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+       struct rte_eth_dev *dev;
+       struct i40e_tx_queue *txq;
+       volatile union i40e_rx_desc *rxdp;
+       struct i40e_tx_entry *txep;
+       struct i40e_rx_entry *rxep;
+       uint16_t tx_port_id, tx_queue_id;
+       uint16_t rx_id;
+       struct rte_mbuf *mb0, *mb1, *m;
+       uint64x2_t dma_addr0, dma_addr1;
+       uint64x2_t zero = vdupq_n_u64(0);
+       uint64_t paddr;
+       uint16_t i, n;
+       uint16_t nb_rearm = 0;
+
+       rxdp = rxq->rx_ring + rxq->rxrearm_start;
+       rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+       tx_port_id = rxq->direct_rxrearm_port;
+       tx_queue_id = rxq->direct_rxrearm_queue;
+       dev = &rte_eth_devices[tx_port_id];
+       txq = dev->data->tx_queues[tx_queue_id];
+
+       /* check Rx queue is able to take in the whole
+        * batch of free mbufs from Tx queue
+        */
+       if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+               /* check DD bits on threshold descriptor */
+               if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+                               rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+                               rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+                       goto mempool_bulk;
+               }
+
+               n = txq->tx_rs_thresh;
+
+               /* first buffer to free from S/W ring is at index
+                * tx_next_dd - (tx_rs_thresh-1)
+                */
+               txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+
+               if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+                       /* directly put mbufs from Tx to Rx,
+                        * and initialize the mbufs in vector
+                        */
+                       for (i = 0; i < n; i++, rxep++, txep++) {
+                               rxep[0].mbuf = txep[0].mbuf;
+
+                               /* Initialize rxdp descs */
+                               mb0 = txep[0].mbuf;
+
+                               paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
+                               dma_addr0 = vdupq_n_u64(paddr);
+                               /* flush desc with pa dma_addr */
+                               vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+                       }
+               } else {
+                       for (i = 0; i < n; i++) {
+                               m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+                               if (m != NULL) {
+                                       rxep[i].mbuf = m;
+
+                                       /* Initialize rxdp descs */
+                                       paddr = m->buf_iova + RTE_PKTMBUF_HEADROOM;
+                                       dma_addr0 = vdupq_n_u64(paddr);
+                                       /* flush desc with pa dma_addr */
+                                       vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+                                       nb_rearm++;
+                               }
+                       }
+                       n = nb_rearm;
+               }
+
+               /* update counters for Tx */
+               txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+               txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+               if (txq->tx_next_dd >= txq->nb_tx_desc)
+                       txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+       } else {
+mempool_bulk:
+               /* if TX did not free bufs into Rx sw-ring,
+                * get new bufs from mempool
+                */
+               n = RTE_I40E_RXQ_REARM_THRESH;
+               if (unlikely(rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0)) {
+                       if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
+                               for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+                                       rxep[i].mbuf = &rxq->fake_mbuf;
+                                       vst1q_u64((uint64_t *)&rxdp[i].read, zero);
+                               }
+                       }
+                       rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += n;
+                       return;
+               }
+
+               /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+               for (i = 0; i < n; i += 2, rxep += 2) {
+                       mb0 = rxep[0].mbuf;
+                       mb1 = rxep[1].mbuf;
+
+                       paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
+                       dma_addr0 = vdupq_n_u64(paddr);
+                       /* flush desc with pa dma_addr */
+                       vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+
+                       paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
+                       dma_addr1 = vdupq_n_u64(paddr);
+                       /* flush desc with pa dma_addr */
+                       vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
+               }
+       }
+
+       /* Update the descriptor initializer index */
+       rxq->rxrearm_start += n;
+       rx_id = rxq->rxrearm_start - 1;
+
+       if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+               rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+               if (!rxq->rxrearm_start)
+                       rx_id = rxq->nb_rx_desc - 1;
+               else
+                       rx_id = rxq->rxrearm_start - 1;
+       }
+
+       rxq->rxrearm_nb -= n;
+
+       rte_io_wmb();
+       /* Update the tail pointer on the NIC */
+       I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
+}
+
  #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
  /* NEON version of FDIR mark extraction for 4 32B descriptors at a time */
  static inline uint32x4_t
@@ -381,8 +514,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
        /* See if we need to rearm the RX queue - gives the prefetch a bit
         * of time to act
         */
-       if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
-               i40e_rxq_rearm(rxq);
+       if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+               if (rxq->direct_rxrearm_enable)
+                       i40e_rxq_direct_rearm(rxq);
+               else
+                       i40e_rxq_rearm(rxq);
+       }
/* Before we start moving massive data around, check to see if
         * there is actually a packet available
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 3782e8052f..b2f1ab2c8d 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -89,6 +89,168 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
        I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
  }
+static inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+       struct rte_eth_dev *dev;
+       struct i40e_tx_queue *txq;
+       volatile union i40e_rx_desc *rxdp;
+       struct i40e_tx_entry *txep;
+       struct i40e_rx_entry *rxep;
+       uint16_t tx_port_id, tx_queue_id;
+       uint16_t rx_id;
+       struct rte_mbuf *mb0, *mb1, *m;
+       __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+                       RTE_PKTMBUF_HEADROOM);
+       __m128i dma_addr0, dma_addr1;
+       __m128i vaddr0, vaddr1;
+       uint16_t i, n;
+       uint16_t nb_rearm = 0;
+
+       rxdp = rxq->rx_ring + rxq->rxrearm_start;
+       rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+       tx_port_id = rxq->direct_rxrearm_port;
+       tx_queue_id = rxq->direct_rxrearm_queue;
+       dev = &rte_eth_devices[tx_port_id];
+       txq = dev->data->tx_queues[tx_queue_id];
+
+       /* check Rx queue is able to take in the whole
+        * batch of free mbufs from Tx queue
+        */
+       if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+               /* check DD bits on threshold descriptor */
+               if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+                               rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+                               rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+                       goto mempool_bulk;
+               }
+
+               n = txq->tx_rs_thresh;
+
+               /* first buffer to free from S/W ring is at index
+                * tx_next_dd - (tx_rs_thresh-1)
+                */
+               txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+
+               if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+                       /* directly put mbufs from Tx to Rx,
+                        * and initialize the mbufs in vector
+                        */
+                       for (i = 0; i < n; i++, rxep++, txep++) {
+                               rxep[0].mbuf = txep[0].mbuf;
+
+                               /* Initialize rxdp descs */
+                               mb0 = txep[0].mbuf;
+
+                               /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+                               RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+                                               offsetof(struct rte_mbuf, buf_addr) + 8);
+                               vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+
+                               /* convert pa to dma_addr hdr/data */
+                               dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+
+                               /* add headroom to pa values */
+                               dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+
+                               /* flush desc with pa dma_addr */
+                               _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+                       }
+               } else {
+                       for (i = 0; i < n; i++) {
+                               m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+                               if (m != NULL) {
+                                       rxep[i].mbuf = m;
+
+                                       /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+                                       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+                                                       offsetof(struct rte_mbuf, buf_addr) + 8);
+                                       vaddr0 = _mm_loadu_si128((__m128i *)&m->buf_addr);
+
+                                       /* convert pa to dma_addr hdr/data */
+                                       dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+
+                                       /* add headroom to pa values */
+                                       dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+
+                                       /* flush desc with pa dma_addr */
+                                       _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+                                       nb_rearm++;
+                               }
+                       }
+                       n = nb_rearm;
+               }
+
+               /* update counters for Tx */
+               txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+               txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+               if (txq->tx_next_dd >= txq->nb_tx_desc)
+                       txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+       } else {
+mempool_bulk:
+               /* if TX did not free bufs into Rx sw-ring,
+                * get new bufs from mempool
+                */
+               n = RTE_I40E_RXQ_REARM_THRESH;
+               /* Pull 'n' more MBUFs into the software ring */
+               if (rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0) {
+                       if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
+                               dma_addr0 = _mm_setzero_si128();
+                               for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+                                       rxep[i].mbuf = &rxq->fake_mbuf;
+                                       _mm_store_si128((__m128i *)&rxdp[i].read,
+                                                       dma_addr0);
+                               }
+                       }
+                       rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+                               RTE_I40E_RXQ_REARM_THRESH;
+                       return;
+               }
+
+               /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+               for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+                       mb0 = rxep[0].mbuf;
+                       mb1 = rxep[1].mbuf;
+
+                       /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+                       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+                                       offsetof(struct rte_mbuf, buf_addr) + 8);
+                       vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+                       vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+                       /* convert pa to dma_addr hdr/data */
+                       dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+                       dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+                       /* add headroom to pa values */
+                       dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+                       dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+                       /* flush desc with pa dma_addr */
+                       _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+                       _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+               }
+       }
+
+       /* Update the descriptor initializer index */
+       rxq->rxrearm_start += n;
+       rx_id = rxq->rxrearm_start - 1;
+
+       if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+               rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+               if (!rxq->rxrearm_start)
+                       rx_id = rxq->nb_rx_desc - 1;
+               else
+                       rx_id = rxq->rxrearm_start - 1;
+       }
+
+       rxq->rxrearm_nb -= n;
+
+       /* Update the tail pointer on the NIC */
+       I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
+}
+
  #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
  /* SSE version of FDIR mark extraction for 4 32B descriptors at a time */
  static inline __m128i
@@ -394,8 +556,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
        /* See if we need to rearm the RX queue - gives the prefetch a bit
         * of time to act
         */
-       if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
-               i40e_rxq_rearm(rxq);
+       if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+               if (rxq->direct_rxrearm_enable)
+                       i40e_rxq_direct_rearm(rxq);
+               else
+                       i40e_rxq_rearm(rxq);
+       }
/* Before we start moving massive data around, check to see if
         * there is actually a packet available
