From: Junfeng Guo <junfeng....@intel.com>

Add support for AVX512 vector data path for single queue model.
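The AVX512 Rx/Tx burst functions are only selected when the CPU, the EAL configuration and the per-queue offloads all allow it. As a rough standalone sketch (not part of the driver code below; the helper name is illustrative only), the CPU/EAL half of the condition checked in idpf_set_rx_function() and idpf_set_tx_function() looks like this:

#include <stdbool.h>
#include <rte_vect.h>
#include <rte_cpuflags.h>

/* Illustrative only: true when the AVX512 paths described in
 * doc/guides/nics/idpf.rst can be chosen on this CPU/EAL setup
 * (x86 builds only, where the AVX512 CPU flags are defined).
 */
static bool
idpf_demo_avx512_path_possible(void)
{
	/* EAL must allow 512-bit SIMD, e.g. --force-max-simd-bitwidth=512 */
	if (rte_vect_get_max_simd_bitwidth() < RTE_VECT_SIMD_512)
		return false;

	/* the CPU must report both AVX512F and AVX512BW */
	return rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
	       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1;
}

The per-queue half of the decision (descriptor count, free/RS thresholds and offload flags) is handled by the checks added in idpf_rxtx_vec_common.h.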
Signed-off-by: Wenjun Wu <wenjun1...@intel.com> Signed-off-by: Junfeng Guo <junfeng....@intel.com> --- doc/guides/nics/idpf.rst | 19 + drivers/net/idpf/idpf_ethdev.c | 3 +- drivers/net/idpf/idpf_ethdev.h | 5 + drivers/net/idpf/idpf_rxtx.c | 145 ++++ drivers/net/idpf/idpf_rxtx.h | 21 + drivers/net/idpf/idpf_rxtx_vec_avx512.c | 857 ++++++++++++++++++++++++ drivers/net/idpf/idpf_rxtx_vec_common.h | 100 +++ drivers/net/idpf/meson.build | 28 + 8 files changed, 1177 insertions(+), 1 deletion(-) create mode 100644 drivers/net/idpf/idpf_rxtx_vec_avx512.c create mode 100644 drivers/net/idpf/idpf_rxtx_vec_common.h diff --git a/doc/guides/nics/idpf.rst b/doc/guides/nics/idpf.rst index c1001d5d0c..3039c61748 100644 --- a/doc/guides/nics/idpf.rst +++ b/doc/guides/nics/idpf.rst @@ -64,3 +64,22 @@ Refer to the document :ref:`compiling and testing a PMD for a NIC <pmd_build_and for details. +Features +-------- + +Vector PMD +~~~~~~~~~~ + +Vector paths for Rx and Tx are selected automatically. The paths +are chosen based on two conditions: + +- ``CPU`` + On the x86 platform, the driver checks if the CPU supports AVX512. + If the CPU supports AVX512 and the EAL argument ``--force-max-simd-bitwidth`` + is set to 512, the AVX512 paths will be chosen. + +- ``Offload features`` + The supported HW offload features are described in the document idpf.ini. + A value "P" means the offload feature is not supported by the vector path. + If any unsupported features are used, the idpf vector PMD is disabled and the + scalar paths are chosen. diff --git a/drivers/net/idpf/idpf_ethdev.c b/drivers/net/idpf/idpf_ethdev.c index 084426260c..cd4ebcc2c6 100644 --- a/drivers/net/idpf/idpf_ethdev.c +++ b/drivers/net/idpf/idpf_ethdev.c @@ -69,7 +69,8 @@ idpf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_TCP_TSO | - RTE_ETH_TX_OFFLOAD_MULTI_SEGS; + RTE_ETH_TX_OFFLOAD_MULTI_SEGS | + RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; dev_info->default_txconf = (struct rte_eth_txconf) { .tx_free_thresh = IDPF_DEFAULT_TX_FREE_THRESH, diff --git a/drivers/net/idpf/idpf_ethdev.h b/drivers/net/idpf/idpf_ethdev.h index 8d0804f603..7d54e5db60 100644 --- a/drivers/net/idpf/idpf_ethdev.h +++ b/drivers/net/idpf/idpf_ethdev.h @@ -162,6 +162,11 @@ struct idpf_adapter { uint32_t max_txq_per_msg; uint32_t ptype_tbl[IDPF_MAX_PKT_TYPE] __rte_cache_min_aligned; + + bool rx_vec_allowed; + bool tx_vec_allowed; + bool rx_use_avx512; + bool tx_use_avx512; }; TAILQ_HEAD(idpf_adapter_list, idpf_adapter); diff --git a/drivers/net/idpf/idpf_rxtx.c b/drivers/net/idpf/idpf_rxtx.c index cc296d7ab1..9e20f2b9d3 100644 --- a/drivers/net/idpf/idpf_rxtx.c +++ b/drivers/net/idpf/idpf_rxtx.c @@ -4,9 +4,11 @@ #include <ethdev_driver.h> #include <rte_net.h> +#include <rte_vect.h> #include "idpf_ethdev.h" #include "idpf_rxtx.h" +#include "idpf_rxtx_vec_common.h" static int check_rx_thresh(uint16_t nb_desc, uint16_t thresh) @@ -252,6 +254,8 @@ reset_single_rx_queue(struct idpf_rx_queue *rxq) rxq->pkt_first_seg = NULL; rxq->pkt_last_seg = NULL; + rxq->rxrearm_start = 0; + rxq->rxrearm_nb = 0; } static void @@ -2073,25 +2077,166 @@ idpf_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts, return i; } +static void __rte_cold +release_rxq_mbufs_vec(struct idpf_rx_queue *rxq) +{ + const uint16_t mask = rxq->nb_rx_desc - 1; + uint16_t i; + + if (rxq->sw_ring == NULL || rxq->rxrearm_nb >= rxq->nb_rx_desc) + return; + + /* free all mbufs that are valid in the ring */ + if (rxq->rxrearm_nb == 0) { + for (i = 0; i < 
rxq->nb_rx_desc; i++) { + if (rxq->sw_ring[i] != NULL) + rte_pktmbuf_free_seg(rxq->sw_ring[i]); + } + } else { + for (i = rxq->rx_tail; i != rxq->rxrearm_start; i = (i + 1) & mask) { + if (rxq->sw_ring[i] != NULL) + rte_pktmbuf_free_seg(rxq->sw_ring[i]); + } + } + + rxq->rxrearm_nb = rxq->nb_rx_desc; + + /* set all entries to NULL */ + memset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc); +} + +static const struct idpf_rxq_ops def_singleq_rx_ops_vec = { + .release_mbufs = release_rxq_mbufs_vec, +}; + +static inline int +idpf_singleq_rx_vec_setup_default(struct idpf_rx_queue *rxq) +{ + uintptr_t p; + struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */ + + mb_def.nb_segs = 1; + mb_def.data_off = RTE_PKTMBUF_HEADROOM; + mb_def.port = rxq->port_id; + rte_mbuf_refcnt_set(&mb_def, 1); + + /* prevent compiler reordering: rearm_data covers previous fields */ + rte_compiler_barrier(); + p = (uintptr_t)&mb_def.rearm_data; + rxq->mbuf_initializer = *(uint64_t *)p; + return 0; +} + +int __rte_cold +idpf_singleq_rx_vec_setup(struct idpf_rx_queue *rxq) +{ + rxq->ops = &def_singleq_rx_ops_vec; + return idpf_singleq_rx_vec_setup_default(rxq); +} + void idpf_set_rx_function(struct rte_eth_dev *dev) { struct idpf_vport *vport = dev->data->dev_private; +#ifdef RTE_ARCH_X86 + struct idpf_adapter *ad = vport->adapter; + struct idpf_rx_queue *rxq; + int i; + + if (idpf_rx_vec_dev_check_default(dev) == IDPF_VECTOR_PATH && + rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { + ad->rx_vec_allowed = true; + + if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512) +#ifdef CC_AVX512_SUPPORT + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1) + ad->rx_use_avx512 = true; +#else + PMD_DRV_LOG(NOTICE, + "AVX512 is not supported in build env"); +#endif /* CC_AVX512_SUPPORT */ + } else { + ad->rx_vec_allowed = false; + } +#endif /* RTE_ARCH_X86 */ + +#ifdef RTE_ARCH_X86 + if (vport->rxq_model == VIRTCHNL2_QUEUE_MODEL_SPLIT) { + dev->rx_pkt_burst = idpf_splitq_recv_pkts; + } else { + if (ad->rx_vec_allowed) { + for (i = 0; i < dev->data->nb_rx_queues; i++) { + rxq = dev->data->rx_queues[i]; + (void)idpf_singleq_rx_vec_setup(rxq); + } +#ifdef CC_AVX512_SUPPORT + if (ad->rx_use_avx512) { + dev->rx_pkt_burst = idpf_singleq_recv_pkts_avx512; + return; + } +#endif /* CC_AVX512_SUPPORT */ + } + dev->rx_pkt_burst = idpf_singleq_recv_pkts; + } +#else if (vport->rxq_model == VIRTCHNL2_QUEUE_MODEL_SPLIT) dev->rx_pkt_burst = idpf_splitq_recv_pkts; else dev->rx_pkt_burst = idpf_singleq_recv_pkts; +#endif /* RTE_ARCH_X86 */ } void idpf_set_tx_function(struct rte_eth_dev *dev) { struct idpf_vport *vport = dev->data->dev_private; +#ifdef RTE_ARCH_X86 + struct idpf_adapter *ad = vport->adapter; +#ifdef CC_AVX512_SUPPORT + struct idpf_tx_queue *txq; + int i; +#endif /* CC_AVX512_SUPPORT */ + + if (idpf_tx_vec_dev_check_default(dev) == IDPF_VECTOR_PATH && + rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { + ad->tx_vec_allowed = true; + if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512) +#ifdef CC_AVX512_SUPPORT + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1) + ad->tx_use_avx512 = true; +#else + PMD_DRV_LOG(NOTICE, + "AVX512 is not supported in build env"); +#endif /* CC_AVX512_SUPPORT */ + } else { + ad->tx_vec_allowed = false; + } +#endif /* RTE_ARCH_X86 */ + if (vport->txq_model == VIRTCHNL2_QUEUE_MODEL_SPLIT) { dev->tx_pkt_burst = idpf_splitq_xmit_pkts;
dev->tx_pkt_prepare = idpf_prep_pkts; } else { +#ifdef RTE_ARCH_X86 + if (ad->tx_vec_allowed) { +#ifdef CC_AVX512_SUPPORT + if (ad->tx_use_avx512) { + for (i = 0; i < dev->data->nb_tx_queues; i++) { + txq = dev->data->tx_queues[i]; + if (txq == NULL) + continue; + idpf_singleq_tx_vec_setup_avx512(txq); + } + dev->tx_pkt_burst = idpf_singleq_xmit_pkts_avx512; + dev->tx_pkt_prepare = idpf_prep_pkts; + return; + } +#endif /* CC_AVX512_SUPPORT */ + } +#endif /* RTE_ARCH_X86 */ dev->tx_pkt_burst = idpf_singleq_xmit_pkts; dev->tx_pkt_prepare = idpf_prep_pkts; } diff --git a/drivers/net/idpf/idpf_rxtx.h b/drivers/net/idpf/idpf_rxtx.h index 54d297aac6..506bf8a57d 100644 --- a/drivers/net/idpf/idpf_rxtx.h +++ b/drivers/net/idpf/idpf_rxtx.h @@ -18,6 +18,12 @@ #define IDPF_RX_MAX_BURST 32 #define IDPF_DEFAULT_RX_FREE_THRESH 32 +/* used for Vector PMD */ +#define IDPF_VPMD_RX_MAX_BURST 32 +#define IDPF_VPMD_TX_MAX_BURST 32 +#define IDPF_VPMD_DESCS_PER_LOOP 4 +#define IDPF_RXQ_REARM_THRESH 64 + #define IDPF_DEFAULT_TX_RS_THRESH 32 #define IDPF_DEFAULT_TX_FREE_THRESH 32 @@ -54,6 +60,11 @@ struct idpf_rx_queue { struct rte_mbuf *pkt_last_seg; /* last segment of current packet */ struct rte_mbuf fake_mbuf; /* dummy mbuf */ + /* used for VPMD */ + uint16_t rxrearm_nb; /* number of remaining to be re-armed */ + uint16_t rxrearm_start; /* the idx we start the re-arming from */ + uint64_t mbuf_initializer; /* value to init mbufs */ + uint16_t rx_nb_avail; uint16_t rx_next_avail; @@ -84,6 +95,10 @@ struct idpf_tx_entry { uint16_t last_id; }; +struct idpf_tx_vec_entry { + struct rte_mbuf *mbuf; +}; + /* Structure associated with each TX queue. */ struct idpf_tx_queue { const struct rte_memzone *mz; /* memzone for Tx ring */ @@ -149,6 +164,7 @@ int idpf_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx, uint16_t nb_desc, unsigned int socket_id, const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mp); +int idpf_singleq_tx_vec_setup_avx512(struct idpf_tx_queue *txq); int idpf_rx_queue_init(struct rte_eth_dev *dev, uint16_t rx_queue_id); int idpf_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id); int idpf_rx_queue_stop(struct rte_eth_dev *dev, uint16_t rx_queue_id); @@ -157,16 +173,21 @@ void idpf_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid); int idpf_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx, uint16_t nb_desc, unsigned int socket_id, const struct rte_eth_txconf *tx_conf); +int idpf_singleq_rx_vec_setup(struct idpf_rx_queue *rxq); int idpf_tx_queue_init(struct rte_eth_dev *dev, uint16_t tx_queue_id); int idpf_tx_queue_start(struct rte_eth_dev *dev, uint16_t tx_queue_id); int idpf_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id); void idpf_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid); uint16_t idpf_singleq_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); +uint16_t idpf_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); uint16_t idpf_splitq_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); uint16_t idpf_singleq_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +uint16_t idpf_singleq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts); uint16_t idpf_splitq_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); uint16_t idpf_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, diff --git a/drivers/net/idpf/idpf_rxtx_vec_avx512.c b/drivers/net/idpf/idpf_rxtx_vec_avx512.c new file mode 
100644 index 0000000000..fb2b6bb53c --- /dev/null +++ b/drivers/net/idpf/idpf_rxtx_vec_avx512.c @@ -0,0 +1,857 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2022 Intel Corporation + */ + +#include "idpf_rxtx_vec_common.h" + +#include <rte_vect.h> + +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif + +#define IDPF_DESCS_PER_LOOP_AVX 8 +#define PKTLEN_SHIFT 10 + +static __rte_always_inline void +idpf_singleq_rearm_common(struct idpf_rx_queue *rxq) +{ + struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start]; + volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring; + uint16_t rx_id; + int i; + + rxdp += rxq->rxrearm_start; + + /* Pull 'n' more MBUFs into the software ring */ + if (rte_mempool_get_bulk(rxq->mp, + (void *)rxp, + IDPF_RXQ_REARM_THRESH) < 0) { + if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= + rxq->nb_rx_desc) { + __m128i dma_addr0; + + dma_addr0 = _mm_setzero_si128(); + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { + rxp[i] = &rxq->fake_mbuf; + _mm_store_si128((__m128i *)&rxdp[i].read, + dma_addr0); + } + } + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += + IDPF_RXQ_REARM_THRESH; + return; + } + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + struct rte_mbuf *mb4, *mb5, *mb6, *mb7; + __m512i dma_addr0_3, dma_addr4_7; + __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); + /* Initialize the mbufs in vector, process 8 mbufs in one loop */ + for (i = 0; i < IDPF_RXQ_REARM_THRESH; + i += 8, rxp += 8, rxdp += 8) { + __m128i vaddr0, vaddr1, vaddr2, vaddr3; + __m128i vaddr4, vaddr5, vaddr6, vaddr7; + __m256i vaddr0_1, vaddr2_3; + __m256i vaddr4_5, vaddr6_7; + __m512i vaddr0_3, vaddr4_7; + + mb0 = rxp[0]; + mb1 = rxp[1]; + mb2 = rxp[2]; + mb3 = rxp[3]; + mb4 = rxp[4]; + mb5 = rxp[5]; + mb6 = rxp[6]; + mb7 = rxp[7]; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr); + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr); + vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr); + vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr); + vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr); + vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr); + + /** + * merge 0 & 1, by casting 0 to 256-bit and inserting 1 + * into the high lanes. Similarly for 2 & 3, and so on. 
+ */ + vaddr0_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0), + vaddr1, 1); + vaddr2_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2), + vaddr3, 1); + vaddr4_5 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4), + vaddr5, 1); + vaddr6_7 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6), + vaddr7, 1); + vaddr0_3 = + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1), + vaddr2_3, 1); + vaddr4_7 = + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5), + vaddr6_7, 1); + + /* convert pa to dma_addr hdr/data */ + dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3); + dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7); + + /* add headroom to pa values */ + dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room); + dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room); + + /* flush desc with pa dma_addr */ + _mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3); + _mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7); + } + + rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH; + if (rxq->rxrearm_start >= rxq->nb_rx_desc) + rxq->rxrearm_start = 0; + + rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; + + rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? + (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); + + /* Update the tail pointer on the NIC */ + IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id); +} + +static __rte_always_inline void +idpf_singleq_rearm(struct idpf_rx_queue *rxq) +{ + int i; + uint16_t rx_id; + volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring; + struct rte_mempool_cache *cache = + rte_mempool_default_cache(rxq->mp, rte_lcore_id()); + struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start]; + + rxdp += rxq->rxrearm_start; + + if (unlikely(cache == NULL)) + return idpf_singleq_rearm_common(rxq); + + /* We need to pull 'n' more MBUFs into the software ring from mempool + * We inline the mempool function here, so we can vectorize the copy + * from the cache into the shadow ring. + */ + + /* Can this be satisfied from the cache? */ + if (cache->len < IDPF_RXQ_REARM_THRESH) { + /* No. Backfill the cache first, and then fill from it */ + uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size - + cache->len); + + /* How many do we require i.e. number to fill the cache + the request */ + int ret = rte_mempool_ops_dequeue_bulk + (rxq->mp, &cache->objs[cache->len], req); + if (ret == 0) { + cache->len += req; + } else { + if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= + rxq->nb_rx_desc) { + __m128i dma_addr0; + + dma_addr0 = _mm_setzero_si128(); + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { + rxp[i] = &rxq->fake_mbuf; + _mm_storeu_si128((__m128i *)&rxdp[i].read, + dma_addr0); + } + } + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += + IDPF_RXQ_REARM_THRESH; + return; + } + } + + const __m512i iova_offsets = _mm512_set1_epi64(offsetof + (struct rte_mbuf, buf_iova)); + const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); + + /* to shuffle the addresses to correct slots. Values 4-7 will contain + * zeros, so use 7 for a zero-value. 
+ */ + const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0); + + /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking + * from mempool cache and populating both shadow and HW rings + */ + for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) { + const __m512i mbuf_ptrs = _mm512_loadu_si512 + (&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]); + _mm512_storeu_si512(rxp, mbuf_ptrs); + + const __m512i iova_base_addrs = _mm512_i64gather_epi64 + (_mm512_add_epi64(mbuf_ptrs, iova_offsets), + 0, /* base */ + 1 /* scale */); + const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs, + headroom); + const __m512i iovas0 = _mm512_castsi256_si512 + (_mm512_extracti64x4_epi64(iova_addrs, 0)); + const __m512i iovas1 = _mm512_castsi256_si512 + (_mm512_extracti64x4_epi64(iova_addrs, 1)); + + /* permute leaves desc 2-3 addresses in header address slots 0-1 + * but these are ignored by driver since header split not + * enabled. Similarly for desc 6 & 7. + */ + const __m512i desc0_1 = _mm512_permutexvar_epi64 + (permute_idx, + iovas0); + const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8); + + const __m512i desc4_5 = _mm512_permutexvar_epi64 + (permute_idx, + iovas1); + const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8); + + _mm512_storeu_si512((void *)rxdp, desc0_1); + _mm512_storeu_si512((void *)(rxdp + 2), desc2_3); + _mm512_storeu_si512((void *)(rxdp + 4), desc4_5); + _mm512_storeu_si512((void *)(rxdp + 6), desc6_7); + + rxp += IDPF_DESCS_PER_LOOP_AVX; + rxdp += IDPF_DESCS_PER_LOOP_AVX; + cache->len -= IDPF_DESCS_PER_LOOP_AVX; + } + + rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH; + if (rxq->rxrearm_start >= rxq->nb_rx_desc) + rxq->rxrearm_start = 0; + + rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; + + rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? + (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); + + /* Update the tail pointer on the NIC */ + IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id); +} + +#define IDPF_RX_LEN_MASK 0x80808080 +static __rte_always_inline uint16_t +_idpf_singleq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq, + struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + const uint32_t *type_table = rxq->adapter->ptype_tbl; + + const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, + rxq->mbuf_initializer); + struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail]; + volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring; + + rxdp += rxq->rx_tail; + + rte_prefetch0(rxdp); + + /* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */ + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX); + + /* See if we need to rearm the RX queue - gives the prefetch a bit + * of time to act + */ + if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH) + idpf_singleq_rearm(rxq); + + /* Before we start moving massive data around, check to see if + * there is actually a packet available + */ + if ((rxdp->flex_nic_wb.status_error0 & + rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)) == 0) + return 0; + + /* 8 packets DD mask, LSB in each 32-bit value */ + const __m256i dd_check = _mm256_set1_epi32(1); + + /* mask to shuffle from desc. 
to mbuf (4 descriptors)*/ + const __m512i shuf_msk = + _mm512_set_epi32 + (/* 1st descriptor */ + 0xFFFFFFFF, /* rss set as unknown */ + 0xFFFF0504, /* vlan_macip set as unknown */ + /* octet 15~14, 16 bits data_len */ + 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ + /* octet 15~14, low 16 bits pkt_len */ + 0xFFFFFFFF, /* pkt_type set as unknown */ + /* 2nd descriptor */ + 0xFFFFFFFF, /* rss set as unknown */ + 0xFFFF0504, /* vlan_macip set as unknown */ + /* octet 15~14, 16 bits data_len */ + 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ + /* octet 15~14, low 16 bits pkt_len */ + 0xFFFFFFFF, /* pkt_type set as unknown */ + /* 3rd descriptor */ + 0xFFFFFFFF, /* rss set as unknown */ + 0xFFFF0504, /* vlan_macip set as unknown */ + /* octet 15~14, 16 bits data_len */ + 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ + /* octet 15~14, low 16 bits pkt_len */ + 0xFFFFFFFF, /* pkt_type set as unknown */ + /* 4th descriptor */ + 0xFFFFFFFF, /* rss set as unknown */ + 0xFFFF0504, /* vlan_macip set as unknown */ + /* octet 15~14, 16 bits data_len */ + 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ + /* octet 15~14, low 16 bits pkt_len */ + 0xFFFFFFFF /* pkt_type set as unknown */ + ); + /** + * compile-time check the shuffle layout is correct. + * NOTE: the first field (lowest address) is given last in set_epi + * calls above. + */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); + + uint16_t i, received; + + for (i = 0, received = 0; i < nb_pkts; + i += IDPF_DESCS_PER_LOOP_AVX, + rxdp += IDPF_DESCS_PER_LOOP_AVX) { + /* step 1, copy over 8 mbuf pointers to rx_pkts array */ + _mm256_storeu_si256((void *)&rx_pkts[i], + _mm256_loadu_si256((void *)&sw_ring[i])); +#ifdef RTE_ARCH_X86_64 + _mm256_storeu_si256 + ((void *)&rx_pkts[i + 4], + _mm256_loadu_si256((void *)&sw_ring[i + 4])); +#endif + + __m512i raw_desc0_3, raw_desc4_7; + const __m128i raw_desc7 = + _mm_load_si128((void *)(rxdp + 7)); + rte_compiler_barrier(); + const __m128i raw_desc6 = + _mm_load_si128((void *)(rxdp + 6)); + rte_compiler_barrier(); + const __m128i raw_desc5 = + _mm_load_si128((void *)(rxdp + 5)); + rte_compiler_barrier(); + const __m128i raw_desc4 = + _mm_load_si128((void *)(rxdp + 4)); + rte_compiler_barrier(); + const __m128i raw_desc3 = + _mm_load_si128((void *)(rxdp + 3)); + rte_compiler_barrier(); + const __m128i raw_desc2 = + _mm_load_si128((void *)(rxdp + 2)); + rte_compiler_barrier(); + const __m128i raw_desc1 = + _mm_load_si128((void *)(rxdp + 1)); + rte_compiler_barrier(); + const __m128i raw_desc0 = + _mm_load_si128((void *)(rxdp + 0)); + + raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4); + raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1); + raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2); + raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3); + raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0); + raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1); + raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2); + raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3); + + /** + * convert descriptors 4-7 into mbufs, adjusting length and + * 
re-arranging fields. Then write into the mbuf + */ + const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7, + PKTLEN_SHIFT); + const __m512i desc4_7 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK, + raw_desc4_7, + len4_7); + __m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk); + + /** + * to get packet types, shift 64-bit values down 30 bits + * and so ptype is in lower 8-bits in each + */ + const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16); + const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1); + const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0); + const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16); + const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0); + const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16); + const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0); + + const __m512i ptype4_7 = _mm512_set_epi32 + (0, 0, 0, type_table[ptype7], + 0, 0, 0, type_table[ptype6], + 0, 0, 0, type_table[ptype5], + 0, 0, 0, type_table[ptype4]); + mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7); + + /** + * convert descriptors 0-3 into mbufs, adjusting length and + * re-arranging fields. Then write into the mbuf + */ + const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3, + PKTLEN_SHIFT); + const __m512i desc0_3 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK, + raw_desc0_3, + len0_3); + __m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk); + + /* get the packet types */ + const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16); + const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1); + const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0); + const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16); + const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0); + const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16); + const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0); + + const __m512i ptype0_3 = _mm512_set_epi32 + (0, 0, 0, type_table[ptype3], + 0, 0, 0, type_table[ptype2], + 0, 0, 0, type_table[ptype1], + 0, 0, 0, type_table[ptype0]); + mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3); + + /** + * use permute/extract to get status content + * After the operations, the packets status flags are in the + * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6] + */ + /* merge the status bits into one register */ + const __m512i status_permute_msk = _mm512_set_epi32 + (0, 0, 0, 0, + 0, 0, 0, 0, + 22, 30, 6, 14, + 18, 26, 2, 10); + const __m512i raw_status0_7 = _mm512_permutex2var_epi32 + (raw_desc4_7, status_permute_msk, raw_desc0_3); + __m256i status0_7 = _mm512_extracti64x4_epi64 + (raw_status0_7, 0); + + /* now do flag manipulation */ + + /** + * At this point, we have the 8 sets of flags in the low 16-bits + * of each 32-bit value. + * We want to extract these, and merge them with the mbuf init + * data so we can do a single write to the mbuf to set the flags + * and all the other initialization fields. Extracting the + * appropriate flags means that we have to do a shift and blend + * for each mbuf before we do the write. 
However, we can also + * add in the previously computed rx_descriptor fields to + * make a single 256-bit write per mbuf + */ + /* check the structure matches expectations */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != + offsetof(struct rte_mbuf, rearm_data) + 8); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != + RTE_ALIGN(offsetof(struct rte_mbuf, + rearm_data), + 16)); + /* build up data and do writes */ + __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, + rearm6, rearm7; + const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0); + const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1); + const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0); + const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1); + + rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20); + rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20); + rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20); + rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20); + + /* write to mbuf */ + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, + rearm6); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, + rearm4); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, + rearm2); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, + rearm0); + + rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0); + rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0); + rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0); + rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0); + + /* again write to mbufs */ + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, + rearm7); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, + rearm5); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, + rearm3); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, + rearm1); + + /* perform dd_check */ + status0_7 = _mm256_and_si256(status0_7, dd_check); + status0_7 = _mm256_packs_epi32(status0_7, + _mm256_setzero_si256()); + + uint64_t burst = __builtin_popcountll + (_mm_cvtsi128_si64 + (_mm256_extracti128_si256 + (status0_7, 1))); + burst += __builtin_popcountll + (_mm_cvtsi128_si64 + (_mm256_castsi256_si128(status0_7))); + received += burst; + if (burst != IDPF_DESCS_PER_LOOP_AVX) + break; + } + + /* update tail pointers */ + rxq->rx_tail += received; + rxq->rx_tail &= (rxq->nb_rx_desc - 1); + if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */ + rxq->rx_tail--; + received--; + } + rxq->rxrearm_nb += received; + return received; +} + +/** + * Notice: + * - nb_pkts < IDPF_DESCS_PER_LOOP, just return no packet + */ +uint16_t +idpf_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts); +} + +static __rte_always_inline int +idpf_tx_free_bufs_avx512(struct idpf_tx_queue *txq) +{ + struct idpf_tx_vec_entry *txep; + uint32_t n; + uint32_t i; + int nb_free = 0; + struct rte_mbuf *m, *free[txq->rs_thresh]; + + /* check DD bits on threshold descriptor */ + if ((txq->tx_ring[txq->next_dd].qw1.cmd_dtype & + rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) != + rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE)) + return 0; + + n = txq->rs_thresh; + + /* first buffer to free from S/W ring is at index + * tx_next_dd - (tx_rs_thresh-1) + */ + txep = (void *)txq->sw_ring; + txep += txq->next_dd - (n - 1); + + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { + struct rte_mempool *mp = txep[0].mbuf->pool; + 
struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, + rte_lcore_id()); + void **cache_objs; + + if (cache == NULL || cache->len == 0) + goto normal; + + cache_objs = &cache->objs[cache->len]; + + if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { + rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); + goto done; + } + + /* The cache follows the following algorithm + * 1. Add the objects to the cache + * 2. Anything greater than the cache min value (if it crosses the + * cache flush threshold) is flushed to the ring. + */ + /* Add elements back into the cache */ + uint32_t copied = 0; + /* n is multiple of 32 */ + while (copied < n) { + const __m512i a = _mm512_loadu_si512(&txep[copied]); + const __m512i b = _mm512_loadu_si512(&txep[copied + 8]); + const __m512i c = _mm512_loadu_si512(&txep[copied + 16]); + const __m512i d = _mm512_loadu_si512(&txep[copied + 24]); + + _mm512_storeu_si512(&cache_objs[copied], a); + _mm512_storeu_si512(&cache_objs[copied + 8], b); + _mm512_storeu_si512(&cache_objs[copied + 16], c); + _mm512_storeu_si512(&cache_objs[copied + 24], d); + copied += 32; + } + cache->len += n; + + if (cache->len >= cache->flushthresh) { + rte_mempool_ops_enqueue_bulk(mp, + &cache->objs[cache->size], + cache->len - cache->size); + cache->len = cache->size; + } + goto done; + } + +normal: + m = rte_pktmbuf_prefree_seg(txep[0].mbuf); + if (likely(m != NULL)) { + free[0] = m; + nb_free = 1; + for (i = 1; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (likely(m != NULL)) { + if (likely(m->pool == free[0]->pool)) { + free[nb_free++] = m; + } else { + rte_mempool_put_bulk(free[0]->pool, + (void *)free, + nb_free); + free[0] = m; + nb_free = 1; + } + } + } + rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); + } else { + for (i = 1; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (m != NULL) + rte_mempool_put(m->pool, m); + } + } + +done: + /* buffers were freed, update counters */ + txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh); + txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh); + if (txq->next_dd >= txq->nb_tx_desc) + txq->next_dd = (uint16_t)(txq->rs_thresh - 1); + + return txq->rs_thresh; +} + +static __rte_always_inline void +tx_backlog_entry_avx512(struct idpf_tx_vec_entry *txep, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +{ + int i; + + for (i = 0; i < (int)nb_pkts; ++i) + txep[i].mbuf = tx_pkts[i]; +} + +#define IDPF_FLEX_TXD_QW1_BUF_SZ_S 48 +static __rte_always_inline void +idpf_vtx1(volatile struct idpf_flex_tx_desc *txdp, + struct rte_mbuf *pkt, uint64_t flags) +{ + uint64_t high_qw = + (IDPF_TX_DESC_DTYPE_FLEX_DATA << IDPF_FLEX_TXD_QW1_DTYPE_S | + ((uint64_t)flags << IDPF_FLEX_TXD_QW1_CMD_S) | + ((uint64_t)pkt->data_len << IDPF_FLEX_TXD_QW1_BUF_SZ_S)); + + __m128i descriptor = _mm_set_epi64x(high_qw, + pkt->buf_iova + pkt->data_off); + _mm_storeu_si128((__m128i *)txdp, descriptor); +} + +#define IDPF_TX_LEN_MASK 0xAA +#define IDPF_TX_OFF_MASK 0x55 +static __rte_always_inline void +idpf_vtx(volatile struct idpf_flex_tx_desc *txdp, + struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) +{ + const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_DATA | + ((uint64_t)flags << IDPF_FLEX_TXD_QW1_CMD_S)); + + /* if unaligned on 32-bit boundary, do one to align */ + if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { + idpf_vtx1(txdp, *pkt, flags); + nb_pkts--, txdp++, pkt++; + } + + /* do 4 at a time while possible, in bursts */ + for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) { + uint64_t hi_qw3 = + hi_qw_tmpl 
| + ((uint64_t)pkt[3]->data_len << + IDPF_FLEX_TXD_QW1_BUF_SZ_S); + uint64_t hi_qw2 = + hi_qw_tmpl | + ((uint64_t)pkt[2]->data_len << + IDPF_FLEX_TXD_QW1_BUF_SZ_S); + uint64_t hi_qw1 = + hi_qw_tmpl | + ((uint64_t)pkt[1]->data_len << + IDPF_FLEX_TXD_QW1_BUF_SZ_S); + uint64_t hi_qw0 = + hi_qw_tmpl | + ((uint64_t)pkt[0]->data_len << + IDPF_FLEX_TXD_QW1_BUF_SZ_S); + + __m512i desc0_3 = + _mm512_set_epi64 + (hi_qw3, + pkt[3]->buf_iova + pkt[3]->data_off, + hi_qw2, + pkt[2]->buf_iova + pkt[2]->data_off, + hi_qw1, + pkt[1]->buf_iova + pkt[1]->data_off, + hi_qw0, + pkt[0]->buf_iova + pkt[0]->data_off); + _mm512_storeu_si512((void *)txdp, desc0_3); + } + + /* do any last ones */ + while (nb_pkts) { + idpf_vtx1(txdp, *pkt, flags); + txdp++, pkt++, nb_pkts--; + } +} + +static __rte_always_inline uint16_t +idpf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + struct idpf_tx_queue *txq = tx_queue; + volatile struct idpf_flex_tx_desc *txdp; + struct idpf_tx_vec_entry *txep; + uint16_t n, nb_commit, tx_id; + uint64_t flags = IDPF_TX_FLEX_DESC_CMD_EOP; + uint64_t rs = IDPF_TX_FLEX_DESC_CMD_RS | flags; + + /* cross rx_thresh boundary is not allowed */ + nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh); + + if (txq->nb_free < txq->free_thresh) + idpf_tx_free_bufs_avx512(txq); + + nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts); + nb_commit = nb_pkts; + if (unlikely(nb_pkts == 0)) + return 0; + + tx_id = txq->tx_tail; + txdp = &txq->tx_ring[tx_id]; + txep = (void *)txq->sw_ring; + txep += tx_id; + + txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts); + + n = (uint16_t)(txq->nb_tx_desc - tx_id); + if (nb_commit >= n) { + tx_backlog_entry_avx512(txep, tx_pkts, n); + + idpf_vtx(txdp, tx_pkts, n - 1, flags); + tx_pkts += (n - 1); + txdp += (n - 1); + + idpf_vtx1(txdp, *tx_pkts++, rs); + + nb_commit = (uint16_t)(nb_commit - n); + + tx_id = 0; + txq->next_rs = (uint16_t)(txq->rs_thresh - 1); + + /* avoid reach the end of ring */ + txdp = &txq->tx_ring[tx_id]; + txep = (void *)txq->sw_ring; + txep += tx_id; + } + + tx_backlog_entry_avx512(txep, tx_pkts, nb_commit); + + idpf_vtx(txdp, tx_pkts, nb_commit, flags); + + tx_id = (uint16_t)(tx_id + nb_commit); + if (tx_id > txq->next_rs) { + txq->tx_ring[txq->next_rs].qw1.cmd_dtype |= + rte_cpu_to_le_64(((uint64_t)IDPF_TX_FLEX_DESC_CMD_RS) << + IDPF_FLEX_TXD_QW1_CMD_S); + txq->next_rs = + (uint16_t)(txq->next_rs + txq->rs_thresh); + } + + txq->tx_tail = tx_id; + + IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); + + return nb_pkts; +} + +static __rte_always_inline uint16_t +idpf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + uint16_t nb_tx = 0; + struct idpf_tx_queue *txq = tx_queue; + + while (nb_pkts) { + uint16_t ret, num; + + num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh); + ret = idpf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx], + num); + nb_tx += ret; + nb_pkts -= ret; + if (ret < num) + break; + } + + return nb_tx; +} + +uint16_t +idpf_singleq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + return idpf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts); +} + +static inline void +idpf_singleq_tx_release_mbufs_avx512(struct idpf_tx_queue *txq) +{ + unsigned int i; + const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1); + struct idpf_tx_vec_entry *swr = (void *)txq->sw_ring; + + if (txq->sw_ring == NULL || txq->nb_free == max_desc) + return; + + i = txq->next_dd - txq->rs_thresh + 1; + if (txq->tx_tail < i) { + for (; i < 
txq->nb_tx_desc; i++) { + rte_pktmbuf_free_seg(swr[i].mbuf); + swr[i].mbuf = NULL; + } + i = 0; + } +} + +static const struct idpf_txq_ops avx512_singleq_tx_vec_ops = { + .release_mbufs = idpf_singleq_tx_release_mbufs_avx512, +}; + +int __rte_cold +idpf_singleq_tx_vec_setup_avx512(struct idpf_tx_queue *txq) +{ + txq->ops = &avx512_singleq_tx_vec_ops; + return 0; +} diff --git a/drivers/net/idpf/idpf_rxtx_vec_common.h b/drivers/net/idpf/idpf_rxtx_vec_common.h new file mode 100644 index 0000000000..0f4e10e154 --- /dev/null +++ b/drivers/net/idpf/idpf_rxtx_vec_common.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2022 Intel Corporation + */ + +#ifndef _IDPF_RXTX_VEC_COMMON_H_ +#define _IDPF_RXTX_VEC_COMMON_H_ +#include <stdint.h> +#include <ethdev_driver.h> +#include <rte_malloc.h> + +#include "idpf_ethdev.h" +#include "idpf_rxtx.h" + +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif + +#define IDPF_VECTOR_PATH 0 +#define ICE_RX_NO_VECTOR_FLAGS ( \ + RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | \ + RTE_ETH_RX_OFFLOAD_UDP_CKSUM | \ + RTE_ETH_RX_OFFLOAD_TCP_CKSUM | \ + RTE_ETH_RX_OFFLOAD_OUTER_IPV4_CKSUM | \ + RTE_ETH_RX_OFFLOAD_TIMESTAMP) +#define ICE_TX_NO_VECTOR_FLAGS ( \ + RTE_ETH_TX_OFFLOAD_TCP_TSO | \ + RTE_ETH_TX_OFFLOAD_MULTI_SEGS) + +static inline int +idpf_rx_vec_queue_default(struct idpf_rx_queue *rxq) +{ + if (rxq == NULL) + return -1; + + if (rte_is_power_of_2(rxq->nb_rx_desc) == 0) + return -1; + + if (rxq->rx_free_thresh < IDPF_VPMD_RX_MAX_BURST) + return -1; + + if ((rxq->nb_rx_desc % rxq->rx_free_thresh) != 0) + return -1; + + if ((rxq->offloads & ICE_RX_NO_VECTOR_FLAGS) != 0) + return -1; + + return IDPF_VECTOR_PATH; +} + +static inline int +idpf_tx_vec_queue_default(struct idpf_tx_queue *txq) +{ + if (txq == NULL) + return -1; + + if (txq->rs_thresh < IDPF_VPMD_TX_MAX_BURST || + (txq->rs_thresh & 3) != 0) + return -1; + + if ((txq->offloads & ICE_TX_NO_VECTOR_FLAGS) != 0) + return -1; + + return IDPF_VECTOR_PATH; +} + +static inline int +idpf_rx_vec_dev_check_default(struct rte_eth_dev *dev) +{ + int i; + struct idpf_rx_queue *rxq; + int ret = 0; + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + rxq = dev->data->rx_queues[i]; + ret = (idpf_rx_vec_queue_default(rxq)); + if (ret < 0) + return -1; + } + + return IDPF_VECTOR_PATH; +} + +static inline int +idpf_tx_vec_dev_check_default(struct rte_eth_dev *dev) +{ + int i; + struct idpf_tx_queue *txq; + int ret = 0; + + for (i = 0; i < dev->data->nb_tx_queues; i++) { + txq = dev->data->tx_queues[i]; + ret = idpf_tx_vec_queue_default(txq); + if (ret < 0) + return -1; + } + + return IDPF_VECTOR_PATH; +} + +#endif /*_IDPF_RXTX_VEC_COMMON_H_*/ diff --git a/drivers/net/idpf/meson.build b/drivers/net/idpf/meson.build index b632b76656..da99c098ab 100644 --- a/drivers/net/idpf/meson.build +++ b/drivers/net/idpf/meson.build @@ -14,3 +14,31 @@ sources = files( 'idpf_rxtx.c', 'idpf_vchnl.c', ) + +if arch_subdir == 'x86' + idpf_avx512_cpu_support = ( + cc.get_define('__AVX512F__', args: machine_args) != '' and + cc.get_define('__AVX512BW__', args: machine_args) != '' + ) + + idpf_avx512_cc_support = ( + not machine_args.contains('-mno-avx512f') and + cc.has_argument('-mavx512f') and + cc.has_argument('-mavx512bw') + ) + + if idpf_avx512_cpu_support == true or idpf_avx512_cc_support == true + cflags += ['-DCC_AVX512_SUPPORT'] + avx512_args = [cflags, '-mavx512f', '-mavx512bw'] + if cc.has_argument('-march=skylake-avx512') + avx512_args += '-march=skylake-avx512' + endif + idpf_avx512_lib = 
static_library('idpf_avx512_lib', + 'idpf_rxtx_vec_avx512.c', + dependencies: [static_rte_common_idpf, static_rte_ethdev, static_rte_bus_pci, + static_rte_kvargs, static_rte_hash], + include_directories: includes, + c_args: avx512_args) + objs += idpf_avx512_lib.extract_objects('idpf_rxtx_vec_avx512.c') + endif +endif -- 2.26.2
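Usage note on the RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE capability advertised in idpf_dev_info_get(): the mempool-cache bulk-free branch in idpf_tx_free_bufs_avx512() is only taken when the application requests that offload at configure time (fast free also assumes mbufs come from a single mempool per Tx queue with reference count 1). A minimal sketch of requesting it, not part of the patch; port_id and the queue counts are placeholders:

#include <rte_ethdev.h>

static int
configure_port_with_fast_free(uint16_t port_id, uint16_t nb_rxq, uint16_t nb_txq)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf = { 0 };

	if (rte_eth_dev_info_get(port_id, &dev_info) != 0)
		return -1;

	/* request fast free only if the PMD reports the capability */
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;

	return rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &port_conf);
}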