This patch adds the NEON-optimised Tx burst function for the Intel IAVF driver on AArch64.
Signed-off-by: Jay Wang <[email protected]>
---
 drivers/net/intel/iavf/iavf.h               |   1 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  15 ++-
 drivers/net/intel/iavf/iavf_rxtx.h          |   2 -
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 120 ++++++++++++++++++++
 4 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index e4936f3566..3e71d345a9 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -356,6 +356,7 @@ enum iavf_rx_func_type {
 enum iavf_tx_func_type {
 	IAVF_TX_DISABLED,
 	IAVF_TX_DEFAULT,
+	IAVF_TX_NEON,
 	IAVF_TX_AVX2,
 	IAVF_TX_AVX2_OFFLOAD,
 	IAVF_TX_AVX512,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 15566a0e18..645bc5ccf6 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3662,6 +3662,15 @@ static const struct ci_tx_path_info iavf_tx_path_infos[] = {
 		}
 	},
 #endif
+#elif defined(RTE_ARCH_ARM64)
+	[IAVF_TX_NEON] = {
+		.pkt_burst = iavf_xmit_pkts_vec,
+		.info = "Vector Neon",
+		.features = {
+			.tx_offloads = IAVF_TX_VECTOR_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128
+		}
+	},
 #endif
 };
 
@@ -3878,7 +3887,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int mbuf_check = adapter->devargs.mbuf_check;
 	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	struct ci_tx_queue *txq;
 	int i;
 	const struct ci_tx_path_features *selected_features;
@@ -3892,7 +3901,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 	if (dev->data->dev_started)
 		goto out;
 
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	if (iavf_tx_vec_dev_check(dev) != -1)
 		req_features.simd_width = iavf_get_max_simd_bitwidth();
 
@@ -3915,7 +3924,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 			IAVF_TX_DEFAULT);
 
 out:
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	selected_features = &iavf_tx_path_infos[adapter->tx_func_type].features;
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
diff --git a/drivers/net/intel/iavf/iavf_rxtx.h b/drivers/net/intel/iavf/iavf_rxtx.h
index 80b06518b0..8b8e55e66f 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.h
+++ b/drivers/net/intel/iavf/iavf_rxtx.h
@@ -558,8 +558,6 @@ uint16_t iavf_recv_scattered_pkts_vec(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
 					       struct rte_mbuf **rx_pkts,
 					       uint16_t nb_pkts);
-uint16_t iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
-				   uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 				 uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 45e377d728..9c91b6bac1 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -445,6 +445,120 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 				      rx_pkts + retval, nb_pkts);
 }
 
+static __rte_always_inline void
+iavf_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
+	  uint64_t flags)
+{
+	uint64_t high_qw = (CI_TX_DESC_DTYPE_DATA |
+			((uint64_t)flags << CI_TXD_QW1_CMD_S) |
+			((uint64_t)pkt->data_len << CI_TXD_QW1_TX_BUF_SZ_S));
+
+	uint64x2_t descriptor = {rte_pktmbuf_iova(pkt), high_qw};
+	vst1q_u64(RTE_CAST_PTR(uint64_t *, txdp), descriptor);
+}
+
+static __rte_always_inline void
+iavf_vtx(volatile struct ci_tx_desc *txdp, struct rte_mbuf **pkt,
+	 uint16_t nb_pkts, uint64_t flags)
+{
+	int i;
+
+	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+		iavf_vtx1(txdp, *pkt, flags);
+}
+
+static __rte_always_inline uint16_t
+iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+	volatile struct ci_tx_desc *txdp;
+	struct ci_tx_entry_vec *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
+	uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
+	int i;
+
+	/* cross rx_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false);
+
+	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	nb_commit = nb_pkts;
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->ci_tx_ring[tx_id];
+	txep = &txq->sw_ring_vec[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+			iavf_vtx1(txdp, *tx_pkts, flags);
+
+		/* write with RS for the last descriptor in the segment */
+		iavf_vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reach the end of ring */
+		txdp = &txq->ci_tx_ring[tx_id];
+		txep = &txq->sw_ring_vec[tx_id];
+	}
+
+	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+	iavf_vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->ci_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)CI_TX_DESC_CMD_RS) <<
+					 CI_TXD_QW1_CMD_S);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		   uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		/* cross rs_thresh boundary is not allowed */
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = iavf_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx],
+						num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
+
 void __rte_cold
 iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq)
 {
@@ -465,6 +579,12 @@ iavf_rx_vec_dev_check(struct rte_eth_dev *dev)
 	return iavf_rx_vec_dev_check_default(dev);
 }
 
+int __rte_cold
+iavf_tx_vec_dev_check(struct rte_eth_dev *dev)
+{
+	return iavf_tx_vec_dev_check_default(dev);
+}
+
 enum rte_vect_max_simd
 iavf_get_max_simd_bitwidth(void)
 {
-- 
2.43.0

