Add the scattered burst function on AArch64 so that we can leverage the NEON-optimised Rx raw burst function to handle scattered packets for the legacy 32B descriptor.
Signed-off-by: Jay Wang <[email protected]> --- drivers/net/intel/iavf/iavf.h | 1 + drivers/net/intel/iavf/iavf_rxtx.c | 16 ++- drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 110 +++++++++++++++++++- drivers/net/intel/iavf/meson.build | 2 +- 4 files changed, 122 insertions(+), 7 deletions(-) diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h index 403c61e2e8..e4936f3566 100644 --- a/drivers/net/intel/iavf/iavf.h +++ b/drivers/net/intel/iavf/iavf.h @@ -334,6 +334,7 @@ enum iavf_rx_func_type { IAVF_RX_BULK_ALLOC, IAVF_RX_BULK_ALLOC_FLEX_RXD, IAVF_RX_NEON, + IAVF_RX_NEON_SCATTERED, IAVF_RX_AVX2, IAVF_RX_AVX2_SCATTERED, IAVF_RX_AVX2_OFFLOAD, diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c index 4ff6c18dc4..15566a0e18 100644 --- a/drivers/net/intel/iavf/iavf_rxtx.c +++ b/drivers/net/intel/iavf/iavf_rxtx.c @@ -3551,16 +3551,26 @@ static const struct ci_rx_path_info iavf_rx_path_infos[] = { } }, #endif -#elif defined RTE_ARCH_ARM +#elif defined(RTE_ARCH_ARM64) [IAVF_RX_NEON] = { .pkt_burst = iavf_recv_pkts_vec, .info = "Vector Neon", .features = { - .rx_offloads = IAVF_RX_SCALAR_OFFLOADS, + .rx_offloads = IAVF_RX_VECTOR_OFFLOADS, .simd_width = RTE_VECT_SIMD_128, .bulk_alloc = true } }, + [IAVF_RX_NEON_SCATTERED] = { + .pkt_burst = iavf_recv_scattered_pkts_vec, + .info = "Vector Scattered Neon", + .features = { + .rx_offloads = IAVF_RX_VECTOR_OFFLOADS | RTE_ETH_RX_OFFLOAD_SCATTER, + .simd_width = RTE_VECT_SIMD_128, + .scattered = true, + .bulk_alloc = true + } + }, #endif }; @@ -3839,7 +3849,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev) if (adapter->rx_bulk_alloc_allowed) { req_features.bulk_alloc = true; default_path = IAVF_RX_BULK_ALLOC; -#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM) +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) if (iavf_rx_vec_dev_check(dev) != -1) req_features.simd_width = iavf_get_max_simd_bitwidth(); #endif diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c 
b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c index 28c90b2a72..45e377d728 100644 --- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c +++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2022 Intel Corporation - * Copyright(c) 2022 Arm Limited + * Copyright(c) 2022-2026 Arm Limited */ #include <stdint.h> @@ -145,8 +145,6 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq, struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { - RTE_SET_USED(split_packet); - volatile union ci_rx_desc *rxdp; struct ci_rx_entry *sw_ring; uint16_t nb_pkts_recd; @@ -164,6 +162,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq, 4, 5, 6, 7 /* octet 4~7, 32bits rss */ }; + uint8x16_t eop_check = { + 0x02, 0x00, 0x02, 0x00, + 0x02, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 + }; + uint16x8_t crc_adjust = { 0, 0, /* ignore pkt_type field */ rxq->crc_len, /* sub crc on pkt_len */ @@ -238,6 +243,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq, vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1); vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2); + if (split_packet) { + rte_mbuf_prefetch_part2(rx_pkts[pos]); + rte_mbuf_prefetch_part2(rx_pkts[pos + 1]); + rte_mbuf_prefetch_part2(rx_pkts[pos + 2]); + rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); + } + /* pkts shift the pktlen field to be 16-bit aligned*/ uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]), len_shl); @@ -306,6 +318,32 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq, staterr = vzipq_u16(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0]; + /* C* extract and record EOP bit */ + if (split_packet) { + uint8x16_t eop_shuf_mask = { + 0x00, 0x02, 0x04, 0x06, + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF + }; + uint8x16_t eop_bits; + + /* and with mask to extract bits, flipping 1-0 */ + eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr)); + eop_bits 
= vandq_u8(eop_bits, eop_check); + /* the staterr values are not in order, as the count + * of dd bits doesn't care. However, for end of + * packet tracking, we do care, so shuffle. This also + * compresses the 32-bit values to 8-bit + */ + eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask); + + /* store the resulting 32-bit value */ + vst1q_lane_u32((uint32_t *)split_packet, + vreinterpretq_u32_u8(eop_bits), 0); + split_packet += IAVF_VPMD_DESCS_PER_LOOP; + } + staterr = vshlq_n_u16(staterr, IAVF_UINT16_BIT - 1); staterr = vreinterpretq_u16_s16( vshrq_n_s16(vreinterpretq_s16_u16(staterr), @@ -341,6 +379,72 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue, return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL); } +/* + * vPMD receive routine that reassembles single burst of 32 scattered + * packets. + * + * Notice: + * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet + */ +static __rte_always_inline uint16_t +iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + struct ci_rx_queue *rxq = rx_queue; + uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0}; + + /* get some new buffers */ + uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts, + split_flags); + + if (nb_bufs == 0) + return 0; + + /* happy day case, full burst + no packets to be assembled */ + const uint64_t *split_fl64 = (uint64_t *)split_flags; + if (!rxq->pkt_first_seg && + split_fl64[0] == 0 && split_fl64[1] == 0 && + split_fl64[2] == 0 && split_fl64[3] == 0) + return nb_bufs; + + /* reassemble any packets that need reassembly */ + unsigned int i = 0; + if (!rxq->pkt_first_seg) { + /* find the first split flag, and only reassemble then */ + while (i < nb_bufs && !split_flags[i]) + i++; + if (i == nb_bufs) + return nb_bufs; + rxq->pkt_first_seg = rx_pkts[i]; + } + return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, + &split_flags[i], &rxq->pkt_first_seg, &rxq->pkt_last_seg, + rxq->crc_len); +} + +/* + * vPMD receive routine that
reassembles scattered packets. + */ +uint16_t +iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + uint16_t retval = 0; + + while (nb_pkts > IAVF_VPMD_RX_BURST) { + uint16_t burst; + burst = iavf_recv_scattered_burst_vec(rx_queue, + rx_pkts + retval, IAVF_VPMD_RX_BURST); + retval += burst; + nb_pkts -= burst; + if (burst < IAVF_VPMD_RX_BURST) + return retval; + } + /* The last one burst or nb_pkts <= IAVF_VPMD_RX_BURST */ + return retval + iavf_recv_scattered_burst_vec(rx_queue, + rx_pkts + retval, nb_pkts); +} + void __rte_cold iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq) { diff --git a/drivers/net/intel/iavf/meson.build b/drivers/net/intel/iavf/meson.build index f9576586f6..50630a88c8 100644 --- a/drivers/net/intel/iavf/meson.build +++ b/drivers/net/intel/iavf/meson.build @@ -29,7 +29,7 @@ sources = files( if arch_subdir == 'x86' sources_avx2 += files('iavf_rxtx_vec_avx2.c') sources_avx512 += files('iavf_rxtx_vec_avx512.c') -elif arch_subdir == 'arm' +elif arch_subdir == 'arm' and dpdk_conf.get('RTE_ARCH_64') sources += files('iavf_rxtx_vec_neon.c') endif -- 2.43.0

