From: Long Wu <long...@corigine.com>

Use AVX2 instructions to accelerate Tx performance. The
acceleration only works on x86 machines.

Signed-off-by: Long Wu <long...@corigine.com>
Reviewed-by: Chaoyong He <chaoyong...@corigine.com>
---
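
The AVX2 Tx burst is selected once at ethdev setup time rather than per
burst. Below is a minimal, self-contained sketch of that selection
pattern, using only the rte_vect/rte_cpuflags APIs this patch already
relies on. The names tx_burst_t, avx2_tx_usable() and select_tx_burst()
are illustrative placeholders, not driver symbols; the driver itself does
this in nfp_net_nfdk_xmit_pkts_set(), and non-x86 builds fall back through
the __rte_weak stubs instead.

#include <stdbool.h>
#include <stdint.h>

#include <rte_cpuflags.h>
#include <rte_mbuf_core.h>
#include <rte_vect.h>

/* Same shape as the ethdev tx_pkt_burst callback. */
typedef uint16_t (*tx_burst_t)(void *txq, struct rte_mbuf **pkts, uint16_t nb);

static bool
avx2_tx_usable(void)
{
        /*
         * Require both a CPU with AVX2 and an EAL max-SIMD setting that
         * allows 256-bit vectors, mirroring nfp_net_get_avx2_supported().
         */
        return rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
                        rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1;
}

static tx_burst_t
select_tx_burst(tx_burst_t scalar_burst, tx_burst_t avx2_burst)
{
        /* Fall back to the scalar burst whenever 256-bit SIMD is unavailable. */
        return avx2_tx_usable() ? avx2_burst : scalar_burst;
}

Because rte_vect_get_max_simd_bitwidth() reflects the EAL
--force-max-simd-bitwidth option, the scalar path can also be forced at
runtime without rebuilding.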
 drivers/net/nfp/meson.build                 |  20 +
 drivers/net/nfp/nfdk/nfp_nfdk.h             |   1 +
 drivers/net/nfp/nfdk/nfp_nfdk_dp.c          |  12 +
 drivers/net/nfp/nfdk/nfp_nfdk_vec.h         |  36 ++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c | 432 ++++++++++++++++++++
 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c    |  14 +
 drivers/net/nfp/nfp_ethdev.c                |   3 +-
 drivers/net/nfp/nfp_ethdev_vf.c             |   3 +-
 drivers/net/nfp/nfp_rxtx.h                  |   5 +-
 drivers/net/nfp/nfp_rxtx_vec.h              |  13 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c         |  21 +
 drivers/net/nfp/nfp_rxtx_vec_stub.c         |  16 +
 12 files changed, 573 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec.h
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
 create mode 100644 drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec.h
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_avx2.c
 create mode 100644 drivers/net/nfp/nfp_rxtx_vec_stub.c

diff --git a/drivers/net/nfp/meson.build b/drivers/net/nfp/meson.build
index d805644ec5..463a482a32 100644
--- a/drivers/net/nfp/meson.build
+++ b/drivers/net/nfp/meson.build
@@ -16,6 +16,7 @@ sources = files(
         'flower/nfp_flower_service.c',
         'nfd3/nfp_nfd3_dp.c',
         'nfdk/nfp_nfdk_dp.c',
+        'nfdk/nfp_nfdk_vec_stub.c',
         'nfpcore/nfp_cppcore.c',
         'nfpcore/nfp_crc.c',
         'nfpcore/nfp_elf.c',
@@ -43,7 +44,26 @@ sources = files(
         'nfp_net_flow.c',
         'nfp_net_meta.c',
         'nfp_rxtx.c',
+        'nfp_rxtx_vec_stub.c',
         'nfp_service.c',
 )
 
+if arch_subdir == 'x86'
+        includes += include_directories('../../common/nfp')
+
+        avx2_sources = files(
+                'nfdk/nfp_nfdk_vec_avx2_dp.c',
+                'nfp_rxtx_vec_avx2.c',
+        )
+
+        nfp_avx2_lib = static_library('nfp_avx2_lib',
+                avx2_sources,
+                dependencies: [static_rte_ethdev, static_rte_bus_pci],
+                include_directories: includes,
+                c_args: [cflags, '-mavx2']
+        )
+
+        objs += nfp_avx2_lib.extract_all_objects(recursive: true)
+endif
+
 deps += ['hash', 'security', 'common_nfp']
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk.h b/drivers/net/nfp/nfdk/nfp_nfdk.h
index 89a98d13f3..29d862f6f0 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk.h
+++ b/drivers/net/nfp/nfdk/nfp_nfdk.h
@@ -222,5 +222,6 @@ int nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq,
 int nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
                struct nfp_net_txq *txq,
                uint64_t *metadata);
+void nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_NFDK_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
index 173aabf0b9..2cea5688b3 100644
--- a/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_dp.c
@@ -11,6 +11,8 @@
 #include "../flower/nfp_flower.h"
 #include "../nfp_logs.h"
 #include "../nfp_net_meta.h"
+#include "../nfp_rxtx_vec.h"
+#include "nfp_nfdk_vec.h"
 
 #define NFDK_TX_DESC_GATHER_MAX         17
 
@@ -511,6 +513,7 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
        dev->data->tx_queues[queue_idx] = txq;
        txq->hw = hw;
        txq->hw_priv = dev->process_private;
+       txq->simple_always = true;
 
        /*
         * Telling the HW about the physical address of the TX ring and number
@@ -521,3 +524,12 @@ nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
 
        return 0;
 }
+
+void
+nfp_net_nfdk_xmit_pkts_set(struct rte_eth_dev *eth_dev)
+{
+       if (nfp_net_get_avx2_supported())
+               eth_dev->tx_pkt_burst = nfp_net_nfdk_vec_avx2_xmit_pkts;
+       else
+               eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec.h b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
new file mode 100644
index 0000000000..14319d6cf6
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFDK_VEC_H__
+#define __NFP_NFDK_VEC_H__
+
+#include <stdbool.h>
+
+#include <rte_mbuf_core.h>
+
+#include "../nfp_net_common.h"
+#include "nfp_nfdk.h"
+
+static inline bool
+nfp_net_nfdk_is_simple_packet(struct rte_mbuf *pkt,
+               struct nfp_net_hw *hw)
+{
+       if (pkt->data_len > NFDK_TX_MAX_DATA_PER_HEAD)
+               return false;
+
+       if ((hw->super.cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0)
+               return true;
+
+       if ((pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG) == 0)
+               return true;
+
+       return false;
+}
+
+uint16_t nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+               struct rte_mbuf **tx_pkts,
+               uint16_t nb_pkts);
+
+#endif /* __NFP_NFDK_VEC_H__ */
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
new file mode 100644
index 0000000000..6d1359fdb1
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_avx2_dp.c
@@ -0,0 +1,432 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "../nfp_logs.h"
+#include "nfp_nfdk.h"
+#include "nfp_nfdk_vec.h"
+
+/*
+ * One simple packet needs 2 descriptors, so sending a burst of 4 packets
+ * uses 8 descriptors at once.
+ */
+#define NFDK_SIMPLE_BURST_DES_NUM 8
+
+#define NFDK_SIMPLE_DES_TYPE (NFDK_DESC_TX_EOP | \
+               (NFDK_DESC_TX_TYPE_HEAD & (NFDK_DESC_TX_TYPE_SIMPLE << 12)))
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(struct rte_mbuf *pkt,
+               struct nfp_net_txq *txq,
+               uint64_t *des_addr,
+               uint64_t *des_meta,
+               bool repr_flag)
+{
+       int ret;
+       __m128i dma_addr;
+       __m128i dma_hi;
+       __m128i data_off;
+       __m128i dlen_type;
+       uint64_t metadata;
+
+       if (repr_flag) {
+               metadata = NFDK_DESC_TX_CHAIN_META;
+       } else {
+               ret = nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
+               if (unlikely(ret != 0))
+                       return ret;
+       }
+
+       data_off = _mm_set_epi64x(0, pkt->data_off);
+       dma_addr = _mm_add_epi64(_mm_loadu_si128((__m128i *)&pkt->buf_addr), data_off);
+       dma_hi = _mm_srli_epi64(dma_addr, 32);
+
+       dlen_type = _mm_set_epi64x(0, (pkt->data_len - 1) | NFDK_SIMPLE_DES_TYPE);
+
+       *des_addr = _mm_extract_epi64(_mm_add_epi64(_mm_unpacklo_epi32(dma_hi, dma_addr),
+                       _mm_slli_epi64(dlen_type, 16)), 0);
+
+       *des_meta = nfp_net_nfdk_tx_cksum(txq, pkt, metadata);
+
+       return 0;
+}
+
+static inline int
+nfp_net_nfdk_vec_avx2_xmit_simple_send1(struct nfp_net_txq *txq,
+               struct nfp_net_nfdk_tx_desc *txds,
+               struct rte_mbuf *pkt,
+               bool repr_flag)
+{
+       int ret;
+       __m128i des_data;
+       uint64_t des_addr;
+       uint64_t des_meta;
+
+       ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt, txq, &des_addr,
+                       &des_meta, repr_flag);
+       if (unlikely(ret != 0))
+               return ret;
+
+       txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+       if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+               txq->data_pending += pkt->data_len;
+       else
+               txq->data_pending = 0;
+
+       des_data = _mm_set_epi64x(des_meta, des_addr);
+
+       _mm_store_si128((void *)txds, des_data);
+
+       return 0;
+}
+
+static inline int
+nfp_vec_avx2_nfdk_xmit_simple_send4(struct nfp_net_txq *txq,
+               struct nfp_net_nfdk_tx_desc *txds,
+               struct rte_mbuf **pkt,
+               bool repr_flag)
+{
+       int ret;
+       uint16_t i;
+       __m256i des_data0_1;
+       __m256i des_data2_3;
+       uint64_t des_addr[4];
+       uint64_t des_meta[4];
+
+       for (i = 0; i < 4; i++) {
+               ret = nfp_net_nfdk_vec_avx2_xmit_simple_set_des2(pkt[i], txq,
+                               &des_addr[i], &des_meta[i], repr_flag);
+               if (unlikely(ret != 0))
+                       return ret;
+       }
+
+       for (i = 0; i < 4; i++) {
+               txq->wr_p = D_IDX(txq, txq->wr_p + NFDK_TX_DESC_PER_SIMPLE_PKT);
+               if ((txq->wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) != 0)
+                       txq->data_pending += pkt[i]->data_len;
+               else
+                       txq->data_pending = 0;
+       }
+
+       des_data0_1 = _mm256_set_epi64x(des_meta[1], des_addr[1], des_meta[0], des_addr[0]);
+       des_data2_3 = _mm256_set_epi64x(des_meta[3], des_addr[3], des_meta[2], des_addr[2]);
+
+       _mm256_store_si256((void *)txds, des_data0_1);
+       _mm256_store_si256((void *)(txds + 4), des_data2_3);
+
+       return 0;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(struct rte_mbuf **mbuf,
+               struct rte_mbuf **tx_pkts)
+{
+       __m256i mbuf_room0_1;
+       __m256i mbuf_room2_3;
+
+       mbuf_room0_1 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[1], 0,
+                       (uintptr_t)tx_pkts[0]);
+       mbuf_room2_3 = _mm256_set_epi64x(0, (uintptr_t)tx_pkts[3], 0,
+                       (uintptr_t)tx_pkts[2]);
+
+       _mm256_store_si256((void *)mbuf, mbuf_room0_1);
+       _mm256_store_si256((void *)(mbuf + 4), mbuf_room2_3);
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_pkts(struct nfp_net_txq *txq,
+               struct rte_mbuf **tx_pkts,
+               uint16_t nb_pkts,
+               uint16_t simple_close,
+               bool repr_flag)
+{
+       int ret;
+       uint16_t npkts = 0;
+       uint16_t need_txds;
+       uint16_t free_descs;
+       struct rte_mbuf **lmbuf;
+       struct nfp_net_nfdk_tx_desc *ktxds;
+
+       PMD_TX_LOG(DEBUG, "Working for queue %hu at pos %u and %hu packets",
+                       txq->qidx, txq->wr_p, nb_pkts);
+
+       need_txds = nb_pkts << 1;
+       if (nfp_net_nfdk_free_tx_desc(txq) < need_txds || nfp_net_nfdk_txq_full(txq))
+               nfp_net_tx_free_bufs(txq);
+
+       free_descs = nfp_net_nfdk_free_tx_desc(txq);
+       if (unlikely(free_descs < NFDK_TX_DESC_PER_SIMPLE_PKT)) {
+               if (unlikely(simple_close > 0))
+                       goto xmit_end;
+
+               return 0;
+       }
+
+       PMD_TX_LOG(DEBUG, "Queue: %hu. Sending %hu packets", txq->qidx, 
nb_pkts);
+
+       /* Sending packets */
+       while (npkts < nb_pkts && free_descs >= NFDK_TX_DESC_PER_SIMPLE_PKT) {
+               ktxds = &txq->ktxds[txq->wr_p];
+               lmbuf = &txq->txbufs[txq->wr_p].mbuf;
+
+               /*
+                * Fall back to sending a single packet when a burst of 4
+                * cannot be sent:
+                * 1. The Tx ring is about to wrap around.
+                * 2. Fewer than 4 packets remain to be sent.
+                * 3. The descriptor address is not aligned on a 32-byte boundary.
+                * 4. There are not enough free descriptors.
+                */
+               if ((txq->tx_count - txq->wr_p) < NFDK_SIMPLE_BURST_DES_NUM ||
+                               (nb_pkts - npkts) < 4 ||
+                               ((uintptr_t)ktxds & 0x1F) != 0 ||
+                               free_descs < NFDK_SIMPLE_BURST_DES_NUM) {
+                       ret = nfp_net_nfdk_vec_avx2_xmit_simple_send1(txq,
+                                       ktxds, tx_pkts[npkts], repr_flag);
+                       if (unlikely(ret != 0))
+                               goto xmit_end;
+
+                       rte_pktmbuf_free(*lmbuf);
+
+                       _mm_storel_epi64((void *)lmbuf,
+                                       _mm_loadu_si128((void *)&tx_pkts[npkts]));
+                       npkts++;
+                       free_descs -= NFDK_TX_DESC_PER_SIMPLE_PKT;
+                       continue;
+               }
+
+               ret = nfp_vec_avx2_nfdk_xmit_simple_send4(txq, ktxds,
+                               &tx_pkts[npkts], repr_flag);
+               if (unlikely(ret != 0))
+                       goto xmit_end;
+
+               rte_pktmbuf_free_bulk(lmbuf, NFDK_SIMPLE_BURST_DES_NUM);
+
+               nfp_net_nfdk_vec_avx2_xmit_mbuf_store4(lmbuf, &tx_pkts[npkts]);
+
+               npkts += 4;
+               free_descs -= NFDK_SIMPLE_BURST_DES_NUM;
+       }
+
+xmit_end:
+       /* Increment write pointers. Force memory write before we let HW know */
+       rte_wmb();
+       nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, ((npkts << 1) + simple_close));
+
+       return npkts;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_close_block(struct nfp_net_txq *txq,
+               uint16_t *simple_close)
+{
+       uint16_t i;
+       uint16_t wr_p;
+       uint16_t nop_slots;
+       __m128i zero_128 = _mm_setzero_si128();
+       __m256i zero_256 = _mm256_setzero_si256();
+
+       wr_p = txq->wr_p;
+       nop_slots = D_BLOCK_CPL(wr_p);
+
+       for (i = nop_slots; i >= 4; i -= 4, wr_p += 4) {
+               _mm256_store_si256((void *)&txq->ktxds[wr_p], zero_256);
+               rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 4);
+               _mm256_store_si256((void *)&txq->txbufs[wr_p], zero_256);
+       }
+
+       for (; i >= 2; i -= 2, wr_p += 2) {
+               _mm_store_si128((void *)&txq->ktxds[wr_p], zero_128);
+               rte_pktmbuf_free_bulk(&txq->txbufs[wr_p].mbuf, 2);
+               _mm_store_si128((void *)&txq->txbufs[wr_p], zero_128);
+       }
+
+       for (; i >= 1; i--, wr_p++) {
+               _mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+               rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+               _mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+       }
+
+       txq->data_pending = 0;
+       txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
+
+       (*simple_close) += nop_slots;
+}
+
+static inline uint32_t
+nfp_net_nfdk_vec_avx2_xmit_simple_prepare(struct nfp_net_txq *txq,
+               uint16_t *simple_close)
+{
+       uint16_t wr_p;
+       __m128i zero_128 = _mm_setzero_si128();
+
+       wr_p = txq->wr_p;
+
+       _mm_storel_epi64((void *)&txq->ktxds[wr_p], zero_128);
+       rte_pktmbuf_free(txq->txbufs[wr_p].mbuf);
+       _mm_storel_epi64((void *)&txq->txbufs[wr_p], zero_128);
+
+       txq->wr_p = D_IDX(txq, wr_p + 1);
+       (*simple_close)++;
+
+       return txq->wr_p;
+}
+
+static inline void
+nfp_net_nfdk_vec_avx2_xmit_simple_check(struct nfp_net_txq *txq,
+               struct rte_mbuf *pkt,
+               bool *simple_flag,
+               bool *pending_flag,
+               uint16_t *data_pending,
+               uint32_t *wr_p,
+               uint16_t *simple_close)
+{
+       uint32_t data_pending_temp;
+
+       /* Make the first descriptor index even before sending simple packets */
+       if (!(*simple_flag)) {
+               if ((*wr_p & 0x1) == 0x1)
+                       *wr_p = nfp_net_nfdk_vec_avx2_xmit_simple_prepare(txq, simple_close);
+
+               *simple_flag = true;
+       }
+
+       /* Simple packets only need one close block operation */
+       if (!(*pending_flag)) {
+               if ((*wr_p & (NFDK_TX_DESC_BLOCK_CNT - 1)) == 0) {
+                       *pending_flag = true;
+                       return;
+               }
+
+               data_pending_temp = *data_pending + pkt->data_len;
+               if (data_pending_temp > NFDK_TX_MAX_DATA_PER_BLOCK) {
+                       nfp_net_nfdk_vec_avx2_xmit_simple_close_block(txq, simple_close);
+                       *pending_flag = true;
+                       return;
+               }
+
+               *data_pending = data_pending_temp;
+
+               *wr_p += 2;
+       }
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_simple_count(struct nfp_net_txq *txq,
+               struct rte_mbuf **tx_pkts,
+               uint16_t head,
+               uint16_t nb_pkts,
+               uint16_t *simple_close)
+{
+       uint32_t wr_p;
+       uint16_t simple_idx;
+       struct rte_mbuf *pkt;
+       uint16_t data_pending;
+       bool simple_flag = false;
+       bool pending_flag = false;
+       uint16_t simple_count = 0;
+
+       *simple_close = 0;
+       wr_p = txq->wr_p;
+       data_pending = txq->data_pending;
+
+       for (simple_idx = head; simple_idx < nb_pkts; simple_idx++) {
+               pkt = tx_pkts[simple_idx];
+               if (!nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+                       break;
+
+               simple_count++;
+               if (!txq->simple_always)
+                       nfp_net_nfdk_vec_avx2_xmit_simple_check(txq, pkt, &simple_flag,
+                                       &pending_flag, &data_pending, &wr_p, simple_close);
+       }
+
+       return simple_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_others_count(struct nfp_net_txq *txq,
+               struct rte_mbuf **tx_pkts,
+               uint16_t head,
+               uint16_t nb_pkts)
+{
+       uint16_t others_idx;
+       struct rte_mbuf *pkt;
+       uint16_t others_count = 0;
+
+       for (others_idx = head; others_idx < nb_pkts; others_idx++) {
+               pkt = tx_pkts[others_idx];
+               if (nfp_net_nfdk_is_simple_packet(pkt, txq->hw))
+                       break;
+
+               others_count++;
+       }
+
+       return others_count;
+}
+
+static inline uint16_t
+nfp_net_nfdk_vec_avx2_xmit_common(void *tx_queue,
+               struct rte_mbuf **tx_pkts,
+               uint16_t nb_pkts)
+{
+       uint16_t i;
+       uint16_t avail = 0;
+       uint16_t simple_close;
+       uint16_t simple_count;
+       uint16_t simple_avail;
+       uint16_t others_count;
+       uint16_t others_avail;
+       struct nfp_net_txq *txq = tx_queue;
+
+       for (i = 0; i < nb_pkts; i++) {
+               simple_count = nfp_net_nfdk_vec_avx2_xmit_simple_count(txq, tx_pkts, i,
+                               nb_pkts, &simple_close);
+               if (simple_count > 0) {
+                       if (!txq->simple_always)
+                               txq->simple_always = true;
+
+                       simple_avail = nfp_net_nfdk_vec_avx2_xmit_simple_pkts(txq,
+                                       tx_pkts + i, simple_count, simple_close,
+                                       false);
+
+                       avail += simple_avail;
+                       if (simple_avail != simple_count)
+                               break;
+
+                       i += simple_count;
+               }
+
+               if (i == nb_pkts)
+                       break;
+
+               others_count = nfp_net_nfdk_vec_avx2_xmit_others_count(txq, tx_pkts,
+                               i, nb_pkts);
+
+               if (txq->simple_always)
+                       txq->simple_always = false;
+
+               others_avail = nfp_net_nfdk_xmit_pkts_common(tx_queue,
+                               tx_pkts + i, others_count, false);
+
+               avail += others_avail;
+               if (others_avail != others_count)
+                       break;
+
+               i += others_count;
+       }
+
+       return avail;
+}
+
+uint16_t
+nfp_net_nfdk_vec_avx2_xmit_pkts(void *tx_queue,
+               struct rte_mbuf **tx_pkts,
+               uint16_t nb_pkts)
+{
+       return nfp_net_nfdk_vec_avx2_xmit_common(tx_queue, tx_pkts, nb_pkts);
+}
diff --git a/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
new file mode 100644
index 0000000000..146ec21d51
--- /dev/null
+++ b/drivers/net/nfp/nfdk/nfp_nfdk_vec_stub.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_nfdk_vec.h"
+
+uint16_t __rte_weak
+nfp_net_nfdk_vec_avx2_xmit_pkts(__rte_unused void *tx_queue,
+               __rte_unused struct rte_mbuf **tx_pkts,
+               __rte_unused uint16_t nb_pkts)
+{
+       return 0;
+}
diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index 8c0cacd3fc..a7b40af712 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -28,6 +28,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_flow.h"
+#include "nfp_rxtx_vec.h"
 
 /* 64-bit per app capabilities */
 #define NFP_NET_APP_CAP_SP_INDIFF       RTE_BIT64(0) /* Indifferent to port speed */
@@ -964,7 +965,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
        if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
                eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
        else
-               eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+               nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
        eth_dev->dev_ops = &nfp_net_eth_dev_ops;
        eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index e7c18fe90a..b955624ed6 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -14,6 +14,7 @@
 
 #include "nfp_logs.h"
 #include "nfp_net_common.h"
+#include "nfp_rxtx_vec.h"
 
 #define NFP_VF_DRIVER_NAME net_nfp_vf
 
@@ -240,7 +241,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
        if (hw->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
                eth_dev->tx_pkt_burst = nfp_net_nfd3_xmit_pkts;
        else
-               eth_dev->tx_pkt_burst = nfp_net_nfdk_xmit_pkts;
+               nfp_net_nfdk_xmit_pkts_set(eth_dev);
 
        eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
        eth_dev->rx_queue_count = nfp_net_rx_queue_count;
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 9806384a63..3ddf717da0 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -69,9 +69,12 @@ struct __rte_aligned(64) nfp_net_txq {
        /** Used by NFDk only */
        uint16_t data_pending;
 
+       /** Used by NFDk vector xmit only */
+       bool simple_always;
+
        /**
         * At this point 58 bytes have been used for all the fields in the
-        * TX critical path. We have room for 6 bytes and still all placed
+        * TX critical path. We have room for 5 bytes and still all placed
         * in a cache line.
         */
        uint64_t dma;
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
new file mode 100644
index 0000000000..c92660f963
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RXTX_VEC_AVX2_H__
+#define __NFP_RXTX_VEC_AVX2_H__
+
+#include <stdbool.h>
+
+bool nfp_net_get_avx2_supported(void);
+
+#endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
new file mode 100644
index 0000000000..50638e74ab
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_cpuflags.h>
+#include <rte_vect.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool
+nfp_net_get_avx2_supported(void)
+{
+       if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+                       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+               return true;
+
+       return false;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c b/drivers/net/nfp/nfp_rxtx_vec_stub.c
new file mode 100644
index 0000000000..1bc55b67e0
--- /dev/null
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Corigine, Inc.
+ * All rights reserved.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+
+#include "nfp_rxtx_vec.h"
+
+bool __rte_weak
+nfp_net_get_avx2_supported(void)
+{
+       return false;
+}
-- 
2.39.1
