From: Jie Liu <[email protected]> - Implement sxe2_rx_pkts_scattered_vec_neon and sxe2_rx_pkts_scattered_vec_neon_offload for bulk packet receiving. - Implement sxe2_tx_pkts_vec_neon and sxe2_tx_pkts_vec_neon_simple for bulk packet transmission. - Add logic to select the vectorized path on RTE_ARCH_ARM64 builds based on runtime config and the RTE_CPUFLAG_NEON CPU flag.
Vectorized path improves throughput for small packets by processing multiple descriptors simultaneously using SIMD instructions. Signed-off-by: Jie Liu <[email protected]> --- drivers/net/sxe2/meson.build | 2 + drivers/net/sxe2/sxe2_txrx.c | 36 ++ drivers/net/sxe2/sxe2_txrx_vec.h | 16 +- drivers/net/sxe2/sxe2_txrx_vec_common.h | 1 - drivers/net/sxe2/sxe2_txrx_vec_neon.c | 707 ++++++++++++++++++++++++ 5 files changed, 759 insertions(+), 3 deletions(-) create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_neon.c diff --git a/drivers/net/sxe2/meson.build b/drivers/net/sxe2/meson.build index c73e13bbad..0658b2ee3a 100644 --- a/drivers/net/sxe2/meson.build +++ b/drivers/net/sxe2/meson.build @@ -48,6 +48,8 @@ if arch_subdir == 'x86' include_directories: includes, c_args: [cflags, '-mavx2']) objs += sxe2_avx2_lib.extract_objects('sxe2_txrx_vec_avx2.c') +elif arch_subdir == 'arm' + sources += files('sxe2_txrx_vec_neon.c') endif sources += files( diff --git a/drivers/net/sxe2/sxe2_txrx.c b/drivers/net/sxe2/sxe2_txrx.c index dcfaf7278d..2eb8365457 100644 --- a/drivers/net/sxe2/sxe2_txrx.c +++ b/drivers/net/sxe2/sxe2_txrx.c @@ -176,6 +176,10 @@ void sxe2_tx_mode_func_set(struct rte_eth_dev *dev) if ((0 == (tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK))) tx_mode_flags |= SXE2_TX_MODE_VEC_SSE; +#elif defined(RTE_ARCH_ARM64) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) == 1) { + tx_mode_flags |= (vec_flags | SXE2_TX_MODE_VEC_NEON); + } #endif if (tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) { ret = sxe2_tx_queues_vec_prepare(dev); @@ -228,6 +232,13 @@ void sxe2_tx_mode_func_set(struct rte_eth_dev *dev) dev->tx_pkt_burst = sxe2_tx_pkts_vec_sse_simple; } } +#elif defined(RTE_ARCH_ARM64) + if (adapter->tx_mode_flags & SXE2_TX_MODE_VEC_NEON) { + dev->tx_pkt_prepare = sxe2_tx_pkts_prepare; + dev->tx_pkt_burst = sxe2_tx_pkts_vec_neon; + } else { + dev->tx_pkt_burst = sxe2_tx_pkts_vec_neon_simple; + } } else { #endif if (tx_mode_flags & SXE2_TX_MODE_SIMPLE_BATCH) { @@ -263,6 +274,12 @@ 
static const struct { { sxe2_tx_pkts_vec_sse_simple, "Vector SSE Simple" }, #endif +#ifdef RTE_ARCH_ARM64 + { sxe2_tx_pkts_vec_neon, + "Vector NEON" }, + { sxe2_tx_pkts_vec_neon_simple, + "Vector NEON Simple" }, +#endif }; int32_t sxe2_tx_burst_mode_get(struct rte_eth_dev *dev, @@ -366,6 +383,11 @@ void sxe2_rx_mode_func_set(struct rte_eth_dev *dev) if (((rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) == 0) && rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) rx_mode_flags |= SXE2_RX_MODE_VEC_SSE; + +#elif defined(RTE_ARCH_ARM64) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) == 1) { + rx_mode_flags |= (vec_flags | SXE2_RX_MODE_VEC_NEON); + } #endif if ((rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) != 0) { ret = sxe2_rx_queues_vec_prepare(dev); @@ -397,6 +419,14 @@ void sxe2_rx_mode_func_set(struct rte_eth_dev *dev) } return; } +#elif defined(RTE_ARCH_ARM64) + if (rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) { + if (rx_mode_flags & SXE2_RX_MODE_VEC_OFFLOAD) + dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_neon_offload; + else + dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_neon; + goto l_end; + } #endif if (sxe2_rx_offload_en_check(dev, RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT)) dev->rx_pkt_burst = sxe2_rx_pkts_scattered_split; @@ -426,6 +456,12 @@ static const struct { { sxe2_rx_pkts_scattered_vec_sse_offload, "Vector SSE Scattered" }, #endif +#ifdef RTE_ARCH_ARM64 + { sxe2_rx_pkts_scattered_vec_neon, + "Vector NEON Scattered" }, + { sxe2_rx_pkts_scattered_vec_neon_offload, + "Offload Vector NEON Scattered" }, +#endif }; int32_t sxe2_rx_burst_mode_get(struct rte_eth_dev *dev, diff --git a/drivers/net/sxe2/sxe2_txrx_vec.h b/drivers/net/sxe2/sxe2_txrx_vec.h index d7a0ce6ca5..02b1743e3e 100644 --- a/drivers/net/sxe2/sxe2_txrx_vec.h +++ b/drivers/net/sxe2/sxe2_txrx_vec.h @@ -12,19 +12,23 @@ #define SXE2_RX_MODE_VEC_SSE RTE_BIT32(2) #define SXE2_RX_MODE_VEC_AVX2 RTE_BIT32(3) #define SXE2_RX_MODE_VEC_AVX512 RTE_BIT32(4) +#define SXE2_RX_MODE_VEC_NEON RTE_BIT32(5) #define 
SXE2_RX_MODE_BATCH_ALLOC RTE_BIT32(10) #define SXE2_RX_MODE_VEC_SET_MASK (SXE2_RX_MODE_VEC_SIMPLE | \ SXE2_RX_MODE_VEC_OFFLOAD | SXE2_RX_MODE_VEC_SSE | \ - SXE2_RX_MODE_VEC_AVX2 | SXE2_RX_MODE_VEC_AVX512) + SXE2_RX_MODE_VEC_AVX2 | SXE2_RX_MODE_VEC_AVX512 | \ + SXE2_RX_MODE_VEC_NEON) #define SXE2_TX_MODE_VEC_SIMPLE RTE_BIT32(0) #define SXE2_TX_MODE_VEC_OFFLOAD RTE_BIT32(1) #define SXE2_TX_MODE_VEC_SSE RTE_BIT32(2) #define SXE2_TX_MODE_VEC_AVX2 RTE_BIT32(3) #define SXE2_TX_MODE_VEC_AVX512 RTE_BIT32(4) +#define SXE2_TX_MODE_VEC_NEON RTE_BIT32(5) #define SXE2_TX_MODE_SIMPLE_BATCH RTE_BIT32(10) #define SXE2_TX_MODE_VEC_SET_MASK (SXE2_TX_MODE_VEC_SIMPLE | \ SXE2_TX_MODE_VEC_OFFLOAD | SXE2_TX_MODE_VEC_SSE | \ - SXE2_TX_MODE_VEC_AVX2 | SXE2_TX_MODE_VEC_AVX512) + SXE2_TX_MODE_VEC_AVX2 | SXE2_TX_MODE_VEC_AVX512 | \ + SXE2_TX_MODE_VEC_NEON) #define SXE2_TX_VEC_NO_SUPPORT_OFFLOAD ( \ RTE_ETH_TX_OFFLOAD_MULTI_SEGS | \ RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \ @@ -75,6 +79,14 @@ uint16_t sxe2_rx_pkts_scattered_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); uint16_t sxe2_rx_pkts_scattered_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); + +#elif defined(RTE_ARCH_ARM64) +uint16_t sxe2_rx_pkts_scattered_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); +uint16_t sxe2_rx_pkts_scattered_vec_neon_offload(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); +uint16_t sxe2_tx_pkts_vec_neon_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +uint16_t sxe2_tx_pkts_vec_neon(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); #endif int32_t __rte_cold sxe2_tx_vec_support_check(struct rte_eth_dev *dev, uint32_t *vec_flags); int32_t __rte_cold sxe2_tx_queues_vec_prepare(struct rte_eth_dev *dev); diff --git a/drivers/net/sxe2/sxe2_txrx_vec_common.h b/drivers/net/sxe2/sxe2_txrx_vec_common.h index 138b748f4a..8fce2bb7cc 100644 --- a/drivers/net/sxe2/sxe2_txrx_vec_common.h +++ 
b/drivers/net/sxe2/sxe2_txrx_vec_common.h @@ -4,7 +4,6 @@ #ifndef __SXE2_TXRX_VEC_COMMON_H__ #define __SXE2_TXRX_VEC_COMMON_H__ -#include <rte_atomic.h> #ifdef PCLINT #include "avx_stub.h" #endif diff --git a/drivers/net/sxe2/sxe2_txrx_vec_neon.c b/drivers/net/sxe2/sxe2_txrx_vec_neon.c new file mode 100644 index 0000000000..e50a0b21bf --- /dev/null +++ b/drivers/net/sxe2/sxe2_txrx_vec_neon.c @@ -0,0 +1,707 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C), 2025, Wuxi Stars Micro System Technologies Co., Ltd. + */ + +#ifdef RTE_ARCH_ARM64 +#include <arm_neon.h> +#include <rte_vect.h> + +#include "sxe2_txrx_vec_common.h" +#include "sxe2_txrx_vec.h" +#include "sxe2_common_log.h" + +#define PKTLEN_SHIFT 10 +#define SXE2_UINT16_BIT (CHAR_BIT * sizeof(uint16_t)) + +static __rte_always_inline void +sxe2_tx_desc_fill_one_neon(volatile union sxe2_tx_data_desc *desc, + struct rte_mbuf *pkt, uint64_t desc_cmd, bool with_offloads) +{ + uint64_t desc_qw1; + uint32_t desc_offset; + + desc_qw1 = (SXE2_TX_DESC_DTYPE_DATA | + ((uint64_t)desc_cmd) << SXE2_TX_DATA_DESC_CMD_SHIFT | + ((uint64_t)pkt->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT); + + desc_offset = SXE2_TX_DATA_DESC_MACLEN_VAL(pkt->l2_len); + desc_qw1 |= ((uint64_t)desc_offset) << SXE2_TX_DATA_DESC_OFFSET_SHIFT; + if (with_offloads) + sxe2_tx_desc_fill_offloads(pkt, &desc_qw1); + + uint64x2_t data_desc = { rte_pktmbuf_iova(pkt), desc_qw1 }; + + vst1q_u64((uint64_t *)desc, data_desc); +} + +static __rte_always_inline uint16_t +sxe2_tx_pkts_vec_neon_batch(struct sxe2_tx_queue *txq, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts, bool with_offloads) +{ + volatile union sxe2_tx_data_desc *desc; + struct sxe2_tx_buffer *buffer; + uint16_t next_use; + uint16_t res_num; + uint16_t tx_num; + uint16_t i; + + if (txq->desc_free_num < txq->free_thresh) + (void)sxe2_tx_bufs_free_vec(txq); + + nb_pkts = RTE_MIN(txq->desc_free_num, nb_pkts); + if (unlikely(nb_pkts == 0)) { + PMD_LOG_TX_DEBUG("Tx pkts neon batch: may not 
enough free desc, " + "free_desc=%u, need_tx_pkts=%u", + txq->desc_free_num, nb_pkts); + goto l_end; + } + tx_num = nb_pkts; + + next_use = txq->next_use; + desc = &txq->desc_ring[next_use]; + buffer = &txq->buffer_ring[next_use]; + + txq->desc_free_num -= nb_pkts; + + res_num = txq->ring_depth - txq->next_use; + + if (tx_num >= res_num) { + sxe2_tx_pkts_mbuf_fill(buffer, tx_pkts, res_num); + + for (i = 0; i < res_num - 1; ++i, ++tx_pkts, ++desc) { + sxe2_tx_desc_fill_one_neon(desc, *tx_pkts, + SXE2_TX_DATA_DESC_CMD_EOP, with_offloads); + } + + sxe2_tx_desc_fill_one_neon(desc, *tx_pkts++, + (SXE2_TX_DATA_DESC_CMD_EOP | SXE2_TX_DATA_DESC_CMD_RS), + with_offloads); + + tx_num -= res_num; + + next_use = 0; + txq->next_rs = txq->rs_thresh - 1; + desc = &txq->desc_ring[next_use]; + buffer = &txq->buffer_ring[next_use]; + } + + sxe2_tx_pkts_mbuf_fill(buffer, tx_pkts, tx_num); + + for (i = 0; i < tx_num; ++i, ++tx_pkts, ++desc) { + sxe2_tx_desc_fill_one_neon(desc, *tx_pkts, + SXE2_TX_DATA_DESC_CMD_EOP, with_offloads); + } + + next_use += tx_num; + if (next_use > txq->next_rs) { + txq->desc_ring[txq->next_rs].read.type_cmd_off_bsz_l2t |= + rte_cpu_to_le_64(SXE2_TX_DATA_DESC_CMD_RS_MASK); + + txq->next_rs += txq->rs_thresh; + } + txq->next_use = next_use; + + SXE2_PCI_REG_WRITE_WC(txq->tdt_reg_addr, txq->next_use); + +l_end: + return nb_pkts; +} + +static __rte_always_inline uint16_t +sxe2_tx_pkts_vec_neon_common(struct sxe2_tx_queue *txq, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts, bool with_offloads) +{ + uint16_t tx_done_num = 0; + uint16_t tx_once_num; + uint16_t tx_need_num; + + while (nb_pkts) { + tx_need_num = RTE_MIN(nb_pkts, txq->rs_thresh); + tx_once_num = sxe2_tx_pkts_vec_neon_batch(txq, + tx_pkts + tx_done_num, + tx_need_num, with_offloads); + + nb_pkts -= tx_once_num; + tx_done_num += tx_once_num; + + if (tx_once_num < tx_need_num) + break; + } + + PMD_LOG_TX_DEBUG("Tx pkts neon: port_id=%u, queue_id=%u, " + "nb_pkts=%u, tx_done_num=%u with_offloads=%u", + 
txq->port_id, txq->idx_in_pf, nb_pkts, tx_done_num, with_offloads); + + SXE2_TX_STATS_CNT(txq, tx_pkts_num, tx_done_num); + return tx_done_num; +} + +uint16_t sxe2_tx_pkts_vec_neon_simple(void *tx_queue, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +{ + return sxe2_tx_pkts_vec_neon_common((struct sxe2_tx_queue *)tx_queue, + tx_pkts, nb_pkts, false); +} + +uint16_t sxe2_tx_pkts_vec_neon(void *tx_queue, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +{ + return sxe2_tx_pkts_vec_neon_common((struct sxe2_tx_queue *)tx_queue, + tx_pkts, nb_pkts, true); +} + +static __rte_always_inline void +sxe2_rx_desc_ptype_fill_neon(uint16x8_t staterr, struct rte_mbuf **__rte_restrict rx_pkts, + const uint32_t *__rte_restrict ptype_tbl) +{ + uint16x8_t ptype_mask = { + 0, 0x3FFULL, + 0, 0x3FFULL, + 0, 0x3FFULL, + 0, 0x3FFULL, + }; + uint16x8_t ptype_all; + + ptype_all = vandq_u16(staterr, ptype_mask); + + rx_pkts[3]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 3)]; + rx_pkts[2]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 7)]; + rx_pkts[1]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 1)]; + rx_pkts[0]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 5)]; +} + +static __rte_always_inline uint32x4_t +sxe2_rx_desc_fnav_flags_neon(uint64x2_t descs_arr[4]) +{ + uint32x4_t descs_tmp1, descs_tmp2; + uint32x4_t descs_fnav_vld; + uint32x4_t v_zeros, v_ffff, v_u32_one; + uint32x4_t m_flags; + + const uint32x4_t fdir_flags = vdupq_n_u32(RTE_MBUF_F_RX_FDIR | + RTE_MBUF_F_RX_FDIR_ID); + + { + uint32x4_t d0 = vreinterpretq_u32_u64(descs_arr[0]); + uint32x4_t d1 = vreinterpretq_u32_u64(descs_arr[1]); + uint32x4_t d2 = vreinterpretq_u32_u64(descs_arr[2]); + uint32x4_t d3 = vreinterpretq_u32_u64(descs_arr[3]); + + descs_tmp1 = vzip1q_u32(d1, d + +static __rte_always_inline void +sxe2_rx_desc_offloads_para_fill_neon(struct sxe2_rx_queue *rxq, + volatile union sxe2_rx_desc *desc, + uint64x2_t descs[4], struct rte_mbuf **rx_pkts) +{ + uint32x4_t desc_lo, desc_hi, flags, tmp_flags; 
+ const uint64x2_t mbuf_init = {rxq->mbuf_init_value, 0}; + uint64x2_t rearm0, rearm1, rearm2, rearm3; + + const uint32x4_t desc_msk = { + 0x00001C04, 0x00001C04, 0x00001C04, 0x00001C04}; + + const uint32x4_t rss_msk = { + 0x20000000, 0x20000000, 0x20000000, 0x20000000}; + + const uint32x4_t vlan_msk = { + RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, + RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, + RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, + RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED + }; + const uint8x16_t vlan_flags = { + 0, 0, 0, 0, + RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 + }; + + const uint8x16_t rss_flags = { + 0, 0, 0, 0, + RTE_MBUF_F_RX_RSS_HASH, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 + }; + + const uint32x4_t cksum_mask = { + RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, + RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, + RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, + RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, + }; + + const uint8x16_t cksum_flags = { + ((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + 
((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + 0, 0, 0, 0, 0, 0, 0, 0 + }; + + { + uint32x4_t d0 = vreinterpretq_u32_u64(descs[0]); + uint32x4_t d1 = vreinterpretq_u32_u64(descs[1]); + uint32x4_t d2 = vreinterpretq_u32_u64(descs[2]); + uint32x4_t d3 = vreinterpretq_u32_u64(descs[3]); + uint64x2_t f64, t64; + + flags = vzip2q_u32(d1, d0); + tmp_flags = vzip2q_u32(d3, d2); + f64 = vreinterpretq_u64_u32(flags); + t64 = vreinterpretq_u64_u32(tmp_flags); + desc_lo = vreinterpretq_u32_u64(vcombine_u64(vget_low_u64(f64), + vget_low_u64(t64))); + desc_hi = vreinterpretq_u32_u64(vcombine_u64(vget_high_u64(f64), + vget_high_u64(t64))); + } + + desc_lo = vandq_u32(desc_lo, desc_msk); + desc_hi = vandq_u32(desc_hi, rss_msk); + + tmp_flags = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags, + vreinterpretq_u8_u32(desc_lo))); + flags = vandq_u32(tmp_flags, vlan_msk); + + desc_lo = vshrq_n_u32(desc_lo, 10); + tmp_flags = vreinterpretq_u32_u8(vqtbl1q_u8(cksum_flags, + vreinterpretq_u8_u32(desc_lo))); + tmp_flags = vshlq_n_u32(tmp_flags, 1); + tmp_flags = vandq_u32(tmp_flags, cksum_mask); + flags = vorrq_u32(flags, tmp_flags); + + desc_hi = vshrq_n_u32(desc_hi, 27); + tmp_flags = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags, + vreinterpretq_u8_u32(desc_hi))); + flags = vorrq_u32(flags, tmp_flags); + +#ifndef RTE_LIBRTE_SXE2_16BYTE_RX_DESC + if (rxq->fnav_enable) { + uint32x4_t tmp_fnav_flags = sxe2_rx_desc_fnav_flags_neon(descs); + flags = vorrq_u32(flags, tmp_fnav_flags); + + rx_pkts[0]->hash.fdir.hi = desc[0].wb.fd_filter_id; + rx_pkts[1]->hash.fdir.hi = desc[1].wb.fd_filter_id; + rx_pkts[2]->hash.fdir.hi = desc[2].wb.fd_filter_id; + rx_pkts[3]->hash.fdir.hi = desc[3].wb.fd_filter_id; + } +#endif + + rearm0 = vsetq_lane_u64(vgetq_lane_u32(flags, 0), mbuf_init, 1); + rearm1 = vsetq_lane_u64(vgetq_lane_u32(flags, 1), mbuf_init, 1); + rearm2 = vsetq_lane_u64(vgetq_lane_u32(flags, 2), mbuf_init, 1); + rearm3 = 
vsetq_lane_u64(vgetq_lane_u32(flags, 3), mbuf_init, 1); + + vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0); + vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1); + vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2); + vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3); +} + +static inline void sxe2_rx_queue_rearm_neon(struct sxe2_rx_queue *rxq) +{ + volatile union sxe2_rx_desc *desc; + struct rte_mbuf **buffer; + struct rte_mbuf *mbuf0, *mbuf1; + uint64x2_t dma_addr0, dma_addr1; + uint64x2_t zero = vdupq_n_u64(0); + uint64x2_t virt_addr0, virt_addr1; + uint64x2_t hdr_room = vdupq_n_u64(RTE_PKTMBUF_HEADROOM); + int32_t ret; + uint16_t i; + uint16_t new_tail; + + buffer = &rxq->buffer_ring[rxq->realloc_start]; + desc = &rxq->desc_ring[rxq->realloc_start]; + + ret = rte_mempool_get_bulk(rxq->mb_pool, (void *)buffer, + SXE2_RX_REARM_THRESH_VEC); + if (ret != 0) { + PMD_LOG_RX_INFO("Rx mbuf vec alloc failed port_id=%u " + "queue_id=%u", rxq->port_id, + rxq->idx_in_pf); + + if ((rxq->realloc_num + SXE2_RX_REARM_THRESH_VEC) >= rxq->ring_depth) { + for (i = 0; i < SXE2_RX_NUM_PER_LOOP_NEON; ++i) { + buffer[i] = &rxq->fake_mbuf; + vst1q_u64((uint64_t *)&desc[i].read, zero); + } + } + + rxq->vsi->adapter->dev_info.dev_data->rx_mbuf_alloc_failed += + SXE2_RX_REARM_THRESH_VEC; + goto l_end; + } + + for (i = 0; i < SXE2_RX_REARM_THRESH_VEC; i += 2, buffer += 2) { + mbuf0 = buffer[0]; + mbuf1 = buffer[1]; +#if RTE_IOVA_IN_MBUF + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); +#endif + virt_addr0 = vld1q_u64((uint64_t *)&mbuf0->buf_addr); + virt_addr1 = vld1q_u64((uint64_t *)&mbuf1->buf_addr); + +#if RTE_IOVA_IN_MBUF + dma_addr0 = vdupq_n_u64((uint64_t)vget_high_u64(virt_addr0)); + dma_addr1 = vdupq_n_u64((uint64_t)vget_high_u64(virt_addr1)); +#else + dma_addr0 = vdupq_n_u64((uint64_t)vget_low_u64(virt_addr0)); + dma_addr1 = vdupq_n_u64((uint64_t)vget_low_u64(virt_addr1)); +#endif + dma_addr0 = 
vaddq_u64(dma_addr0, hdr_room); + dma_addr1 = vaddq_u64(dma_addr1, hdr_room); + + vst1q_u64((uint64_t *)&desc++->read, dma_addr0); + vst1q_u64((uint64_t *)&desc++->read, dma_addr1); + } + + rxq->realloc_start += SXE2_RX_REARM_THRESH_VEC; + if (rxq->realloc_start >= rxq->ring_depth) + rxq->realloc_start = 0; + rxq->realloc_num -= SXE2_RX_REARM_THRESH_VEC; + + new_tail = (rxq->realloc_start == 0) ? + (rxq->ring_depth - 1) : (rxq->realloc_start - 1); + + SXE2_PCI_REG_WRITE_WC(rxq->rdt_reg_addr, new_tail); + +l_end: + return; +} + +static __rte_always_inline uint16_t +sxe2_rx_pkts_common_vec_neon(struct sxe2_rx_queue *rxq, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts, uint8_t *split_rxe_flags, uint8_t *umbcast_flags, + bool do_offload) +{ + volatile union sxe2_rx_desc *desc; + struct rte_mbuf **buffer; + uint32_t i; + uint16_t done_num = 0; + const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; + + uint8x16_t rvp_shuf_mask = { + 0xFF, 0xFF, 0xFF, 0xFF, + 12, 13, 0xFF, 0xFF, + 12, 13, + 2, 3, + 4, 5, 6, 7 + }; + + uint16x8_t crc_adjust = { + 0, 0, + rxq->crc_len, + 0, rxq->crc_len, + 0, 0, 0 + }; + + desc = &rxq->desc_ring[rxq->processing_idx]; + rte_prefetch0(desc); + + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, SXE2_RX_NUM_PER_LOOP_NEON); + + if (rxq->realloc_num > SXE2_RX_REARM_THRESH_VEC) + sxe2_rx_queue_rearm_neon(rxq); + + if ((rte_le_to_cpu_64(desc->wb.status_err_ptype_len) & + SXE2_RX_DESC_STATUS_DD_MASK) == 0) { + goto l_end; + } + + buffer = &rxq->buffer_ring[rxq->processing_idx]; + for (i = 0; i < nb_pkts; i += SXE2_RX_NUM_PER_LOOP_NEON, + desc += SXE2_RX_NUM_PER_LOOP_NEON) { + uint64x2_t descs[SXE2_RX_NUM_PER_LOOP_NEON]; + uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; + uint64x2_t mbp1, mbp2; + uint16x8_t staterr; + uint16x8_t tmp; + uint16_t bit_num; + + descs[3] = vld1q_u64((uint64_t *)(desc + 3)); + rte_atomic_thread_fence(rte_memory_order_acquire); + descs[2] = vld1q_u64((uint64_t *)(desc + 2)); + rte_atomic_thread_fence(rte_memory_order_acquire); + 
descs[1] = vld1q_u64((uint64_t *)(desc + 1)); + rte_atomic_thread_fence(rte_memory_order_acquire); + descs[0] = vld1q_u64((uint64_t *)(desc)); + + rte_atomic_thread_fence(rte_memory_order_acquire); + + descs[3] = vld1q_lane_u64((uint64_t *)(desc + 3), descs[3], 0); + descs[2] = vld1q_lane_u64((uint64_t *)(desc + 2), descs[2], 0); + descs[1] = vld1q_lane_u64((uint64_t *)(desc + 1), descs[1], 0); + descs[0] = vld1q_lane_u64((uint64_t *)(desc), descs[0], 0); + + mbp1 = vld1q_u64((uint64_t *)&buffer[i]); + mbp2 = vld1q_u64((uint64_t *)&buffer[i + 2]); + + vst1q_u64((uint64_t *)&rx_pkts[i], mbp1); + vst1q_u64((uint64_t *)&rx_pkts[i + 2], mbp2); + + if (split_rxe_flags) { + rte_mbuf_prefetch_part2(rx_pkts[i]); + rte_mbuf_prefetch_part2(rx_pkts[i + 1]); + rte_mbuf_prefetch_part2(rx_pkts[i + 2]); + rte_mbuf_prefetch_part2(rx_pkts[i + 3]); + } + + pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), rvp_shuf_mask); + pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), rvp_shuf_mask); + pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), rvp_shuf_mask); + pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), rvp_shuf_mask); + + if (do_offload) { + sxe2_rx_desc_offloads_para_fill_neon(rxq, desc, descs, &rx_pkts[i]); + } else { + const uint64x2_t mbuf_init = { + rxq->mbuf_init_value, + 0, + }; + + vst1q_u64((uint64_t *)&rx_pkts[i]->rearm_data, mbuf_init); + vst1q_u64((uint64_t *)&rx_pkts[i + 1]->rearm_data, mbuf_init); + vst1q_u64((uint64_t *)&rx_pkts[i + 2]->rearm_data, mbuf_init); + vst1q_u64((uint64_t *)&rx_pkts[i + 3]->rearm_data, mbuf_init); + } + + tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust); + pkt_mb4 = vreinterpretq_u8_u16(tmp); + tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust); + pkt_mb3 = vreinterpretq_u8_u16(tmp); + tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust); + pkt_mb2 = vreinterpretq_u8_u16(tmp); + tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust); + pkt_mb1 = vreinterpretq_u8_u16(tmp); + + vst1q_u8((void 
*)&rx_pkts[i + 3]->rx_descriptor_fields1, + pkt_mb4); + vst1q_u8((void *)&rx_pkts[i + 2]->rx_descriptor_fields1, + pkt_mb3); + vst1q_u8((void *)&rx_pkts[i + 1]->rx_descriptor_fields1, + pkt_mb2); + vst1q_u8((void *)&rx_pkts[i]->rx_descriptor_fields1, + pkt_mb1); + + if (likely(i + SXE2_RX_NUM_PER_LOOP_NEON < nb_pkts)) + rte_prefetch_non_temporal(desc + SXE2_RX_NUM_PER_LOOP_NEON); + + { + uint32x4_t d0 = vreinterpretq_u32_u64(descs[0]); + uint32x4_t d1 = vreinterpretq_u32_u64(descs[1]); + uint32x4_t d2 = vreinterpretq_u32_u64(descs[2]); + uint32x4_t d3 = vreinterpretq_u32_u64(descs[3]); + uint32x4_t sterr_tmp1 = vzip2q_u32(d1, d0); + uint32x4_t sterr_tmp2 = vzip2q_u32(d3, d2); + uint32x4_t sterr_u32 = vzip1q_u32(sterr_tmp1, sterr_tmp2); + + staterr = vreinterpretq_u16_u32(sterr_u32); + } + + sxe2_rx_desc_ptype_fill_neon(staterr, &rx_pkts[i], ptype_tbl); + + if (umbcast_flags != NULL) { + uint32x4_t umbcast_mask = { + SXE2_RX_DESC_STATUS_UMBCAST_MASK, SXE2_RX_DESC_STATUS_UMBCAST_MASK, + SXE2_RX_DESC_STATUS_UMBCAST_MASK, SXE2_RX_DESC_STATUS_UMBCAST_MASK, + }; + + uint8x16_t umbcast_shuf_mask = { + 0x0B, 0x03, 0x0F, 0x07, + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, + }; + uint8x16_t umbcast_bits = + vreinterpretq_u8_u32(vandq_u32(vreinterpretq_u32_u16(staterr), + umbcast_mask)); + + umbcast_bits = vqtbl1q_u8(umbcast_bits, umbcast_shuf_mask); + vst1q_lane_u32((uint32_t *)umbcast_flags, + vreinterpretq_u32_u8(umbcast_bits), 0); + umbcast_flags += SXE2_RX_NUM_PER_LOOP_NEON; + } + + if (split_rxe_flags) { + uint8x16_t eop_shuf_mask = { + 0x08, 0x00, 0x0C, 0x04, + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF}; + uint8x16_t eop_bits; + uint32x4_t rxe_mask = { + 0x2080, 0x2080, 0x2080, 0x2080 + }; + uint32x4_t rxe_bits; + uint32x4_t eop_mask; + + eop_mask = vshlq_n_u32(vdupq_n_u32(1), SXE2_RX_DESC_STATUS_EOP_SHIFT); + eop_bits = vandq_u8(vmvnq_u8(vreinterpretq_u8_u16(staterr)), + vreinterpretq_u8_u32(eop_mask)); 
+ + rxe_bits = vandq_u32(vreinterpretq_u32_u16(staterr), rxe_mask); + rxe_bits = vshrq_n_u32(rxe_bits, 7); + + eop_bits = vorrq_u8(eop_bits, vreinterpretq_u8_u32(rxe_bits)); + + eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask); + + vst1q_lane_u32((uint32_t *)split_rxe_flags, + vreinterpretq_u32_u8(eop_bits), 0); + split_rxe_flags += SXE2_RX_NUM_PER_LOOP_NEON; + +#ifdef RTE_IOVA_IN_MBUF + rx_pkts[i]->next = NULL; + rx_pkts[i + 1]->next = NULL; + rx_pkts[i + 2]->next = NULL; + rx_pkts[i + 3]->next = NULL; +#endif + } + + { + uint32x4_t dd_mask = vdupq_n_u32(1); + uint32x4_t sterr_dd = vandq_u32(vreinterpretq_u32_u16(staterr), dd_mask); + uint16x4_t packed_lo = vmovn_u32(sterr_dd); + uint64_t dd64 = vget_lane_u64(vreinterpret_u64_u16(packed_lo), 0); + + bit_num = (uint16_t)rte_popcount64(dd64); + } + done_num += bit_num; + if (likely(bit_num != SXE2_RX_NUM_PER_LOOP_NEON)) + break; + } + + rxq->processing_idx += done_num; + rxq->processing_idx &= (rxq->ring_depth - 1); + rxq->realloc_num += done_num; + +l_end: + return done_num; +} + +static __rte_always_inline uint16_t +sxe2_rx_pkts_scattered_batch_vec_neon(struct sxe2_rx_queue *rxq, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts, bool do_offload) +{ + const uint64_t *split_flags64; + uint8_t split_rxe_flags[SXE2_RX_PKTS_BURST_BATCH_NUM_VEC] = {0}; + uint8_t umbcast_flags[SXE2_RX_PKTS_BURST_BATCH_NUM_VEC] = {0}; + uint16_t rx_done_num; + uint16_t rx_pkt_done_num; + + rx_pkt_done_num = 0; + + if (rxq->vsi->adapter->devargs.sw_stats_en) { + rx_done_num = sxe2_rx_pkts_common_vec_neon((struct sxe2_rx_queue *)rxq, + rx_pkts, nb_pkts, split_rxe_flags, umbcast_flags, + do_offload); + } else { + rx_done_num = sxe2_rx_pkts_common_vec_neon((struct sxe2_rx_queue *)rxq, + rx_pkts, nb_pkts, split_rxe_flags, NULL, + do_offload); + } + + if (rx_done_num == 0) + goto l_end; + + if (!rxq->vsi->adapter->devargs.sw_stats_en) { + split_flags64 = (uint64_t *)split_rxe_flags; + + if (rxq->pkt_first_seg == NULL && + split_flags64[0] == 0 && 
split_flags64[1] == 0 && + split_flags64[2] == 0 && split_flags64[3] == 0) { + rx_pkt_done_num = rx_done_num; + goto l_end; + } + + if (rxq->pkt_first_seg == NULL) { + while (rx_pkt_done_num < rx_done_num && + split_rxe_flags[rx_pkt_done_num] == 0) { + rx_pkt_done_num++; + } + + if (rx_pkt_done_num == rx_done_num) + goto l_end; + + rxq->pkt_first_seg = rx_pkts[rx_pkt_done_num]; + } + } + + rx_pkt_done_num += sxe2_rx_pkts_refactor(rxq, &rx_pkts[rx_pkt_done_num], + rx_done_num - rx_pkt_done_num, &split_rxe_flags[rx_pkt_done_num], + &umbcast_flags[rx_pkt_done_num]); + +l_end: + return rx_pkt_done_num; +} + +uint16_t sxe2_rx_pkts_scattered_vec_neon_offload(void *rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + uint16_t done_num = 0; + uint16_t once_num; + + while (nb_pkts > SXE2_RX_PKTS_BURST_BATCH_NUM_VEC) { + once_num = sxe2_rx_pkts_scattered_batch_vec_neon((struct sxe2_rx_queue *)rx_queue, + rx_pkts + done_num, + SXE2_RX_PKTS_BURST_BATCH_NUM_VEC, + true); + + done_num += once_num; + nb_pkts -= once_num; + + if (once_num < SXE2_RX_PKTS_BURST_BATCH_NUM_VEC) + goto l_end; + } + + done_num += sxe2_rx_pkts_scattered_batch_vec_neon((struct sxe2_rx_queue *)rx_queue, + rx_pkts + done_num, + nb_pkts, + true); +l_end: + SXE2_RX_STATS_CNT(rx_queue, rx_pkts_num, done_num); + return done_num; +} + +uint16_t sxe2_rx_pkts_scattered_vec_neon(void *rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + uint16_t done_num = 0; + uint16_t once_num; + + while (nb_pkts > SXE2_RX_PKTS_BURST_BATCH_NUM_VEC) { + once_num = sxe2_rx_pkts_scattered_batch_vec_neon((struct sxe2_rx_queue *)rx_queue, + rx_pkts + done_num, + SXE2_RX_PKTS_BURST_BATCH_NUM_VEC, + false); + + done_num += once_num; + nb_pkts -= once_num; + + if (once_num < SXE2_RX_PKTS_BURST_BATCH_NUM_VEC) + goto l_end; + } + + done_num += sxe2_rx_pkts_scattered_batch_vec_neon((struct sxe2_rx_queue *)rx_queue, + rx_pkts + done_num, + nb_pkts, + false); +l_end: + SXE2_RX_STATS_CNT(rx_queue, rx_pkts_num, done_num); + 
return done_num; +} +#endif -- 2.47.3

