Add AVX2 vectorized split queue Rx path.
Some CPUs do not support AVX512; enable AVX2 for them to
get better per-core performance.

Signed-off-by: Shaiq Wani <[email protected]>
---
 drivers/net/intel/idpf/idpf_common_device.h   |   1 +
 drivers/net/intel/idpf/idpf_common_rxtx.c     |  59 ++++++++
 drivers/net/intel/idpf/idpf_common_rxtx.h     |   5 +
 .../net/intel/idpf/idpf_common_rxtx_avx2.c    | 138 ++++++++++++++++++
 .../net/intel/idpf/idpf_common_rxtx_avx512.c  |  56 -------
 5 files changed, 203 insertions(+), 56 deletions(-)

diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h
index bbc969c734..1424046a16 100644
--- a/drivers/net/intel/idpf/idpf_common_device.h
+++ b/drivers/net/intel/idpf/idpf_common_device.h
@@ -70,6 +70,7 @@ enum idpf_rx_func_type {
        IDPF_RX_SINGLEQ,
        IDPF_RX_SINGLEQ_SCATTERED,
        IDPF_RX_SINGLEQ_AVX2,
+       IDPF_RX_AVX2,
        IDPF_RX_AVX512,
        IDPF_RX_SINGLEQ_AVX512,
        IDPF_RX_MAX
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index b8f6418d4a..ead31fd0f8 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -253,6 +253,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
        cq->expected_gen_id = 1;
 }
 
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
+void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+       struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+       volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+       uint16_t rx_id;
+       int i;
+
+       rxdp += rx_bufq->rxrearm_start;
+
+       /* Pull 'n' more MBUFs into the software ring */
+       if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+                       (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
+               if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+                               rx_bufq->nb_rx_desc) {
+                       for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+                               rxp[i] = &rx_bufq->fake_mbuf;
+                               rxdp[i] = (union virtchnl2_rx_buf_desc){0};
+                       }
+               }
+               rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+                       IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
+               return;
+       }
+
+       /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+       for (i = 0; i < IDPF_RXQ_REARM_THRESH;
+                       i += 8, rxp += 8, rxdp += 8) {
+               rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
+               rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
+               rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
+               rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
+               rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
+               rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
+               rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
+               rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
+       }
+
+       rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+       if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+               rx_bufq->rxrearm_start = 0;
+
+       rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+       rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+                            (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+       /* Update the tail pointer on the NIC */
+       IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
+}
+
 RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
 void
 idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
@@ -1506,6 +1558,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
                        .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
                        .simd_width = RTE_VECT_SIMD_256,
                        .single_queue = true}},
+       [IDPF_RX_AVX2] = {
+               .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+               .info = "Split AVX2 Vector",
+               .features = {
+                       .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+                       .simd_width = RTE_VECT_SIMD_256,
+                       }},
 #ifdef CC_AVX512_SUPPORT
        [IDPF_RX_AVX512] = {
                .pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 914cab0f25..256e9ff54c 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -197,6 +197,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq);
 __rte_internal
 void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
 __rte_internal
+void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
+__rte_internal
 void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
 __rte_internal
 void idpf_qc_rx_queue_release(void *rxq);
@@ -249,6 +251,9 @@ __rte_internal
 uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
                                         uint16_t nb_pkts);
 __rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+                                    uint16_t nb_pkts);
+__rte_internal
 uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
                          uint16_t nb_pkts);
 __rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index e228b72fa5..c2f41db9f6 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,144 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
        return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
 }
 
+RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
+uint16_t
+idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+       struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
+       const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
+       struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
+       volatile union virtchnl2_rx_desc *rxdp =
+               (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail;
+       const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer);
+       uint64_t head_gen;
+       uint16_t received = 0;
+       int i;
+
+       /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+        * layout that will be merged into mbuf->rearm_data candidates.
+        */
+       const __m256i shuf = _mm256_set_epi8(
+               /* high 128 bits (desc 3 then desc 2 lanes) */
+               0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+               0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
+               /* low 128 bits (desc 1 then desc 0 lanes) */
+               0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
+               0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
+       );
+
+       /* mask that clears bits 14 and 15 of the packet length word  */
+       const __m256i len_mask = _mm256_set_epi32(
+               0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+               0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+       );
+
+       const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+       rte_prefetch0(rxdp);
+       nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
+
+       if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
+               idpf_splitq_rearm_common(queue->bufq2);
+
+       /* head gen check */
+       head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+       if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+                VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
+               return 0;
+
+       for (i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= IDPF_VPMD_DESCS_PER_LOOP) {
+               rxdp -= IDPF_VPMD_DESCS_PER_LOOP;
+
+               uint64_t g3 = rxdp[3].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+               uint64_t g2 = rxdp[2].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+               uint64_t g1 = rxdp[1].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+               uint64_t g0 = rxdp[0].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+
+               /* Extract DD bits */
+               bool dd3 = (g3 & 1ULL) != 0ULL;
+               bool dd2 = (g2 & 1ULL) != 0ULL;
+               bool dd1 = (g1 & 1ULL) != 0ULL;
+               bool dd0 = (g0 & 1ULL) != 0ULL;
+
+               /* Extract generation bits */
+               uint64_t gen3 = (g3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+                                                       VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+               uint64_t gen2 = (g2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+                                                       VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+               uint64_t gen1 = (g1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+                                                       VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+               uint64_t gen0 = (g0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
+                                                       VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+
+               /* Validate descriptors */
+               bool valid3 = dd3 && (gen3 == queue->expected_gen_id);
+               bool valid2 = dd2 && (gen2 == queue->expected_gen_id);
+               bool valid1 = dd1 && (gen1 == queue->expected_gen_id);
+               bool valid0 = dd0 && (gen0 == queue->expected_gen_id);
+
+               if (!(valid0 && valid1 && valid2 && valid3))
+                       break;
+
+               /* copy mbuf pointers */
+               memcpy(&rx_pkts[i - IDPF_VPMD_DESCS_PER_LOOP],
+                       &sw_ring[i - IDPF_VPMD_DESCS_PER_LOOP],
+                       sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP);
+
+               __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3]));
+               __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2]));
+               __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1]));
+               __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0]));
+
+               __m256i d23 = _mm256_set_m128i(d3, d2);
+               __m256i d01 = _mm256_set_m128i(d1, d0);
+
+               /* mask length and shuffle to build mbuf rearm data */
+               __m256i desc01 = _mm256_and_si256(d01, len_mask);
+               __m256i desc23 = _mm256_and_si256(d23, len_mask);
+               __m256i mb10 = _mm256_shuffle_epi8(desc01, shuf);
+               __m256i mb32 = _mm256_shuffle_epi8(desc23, shuf);
+
+               /* Extract ptypes */
+               __m256i pt10 = _mm256_and_si256(d01, ptype_mask);
+               __m256i pt32 = _mm256_and_si256(d23, ptype_mask);
+
+               uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt10, 1);
+               uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt10, 9);
+               uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt32, 1);
+               uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt32, 9);
+
+               mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype1], 2);
+               mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype0], 0);
+               mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype3], 2);
+               mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype2], 0);
+
+               /* Build rearm data for each mbuf */
+               __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, mb10, 0x20);
+               __m256i rearm1 = _mm256_blend_epi32(mbuf_init, mb10, 0xF0);
+               __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, mb32, 0x20);
+               __m256i rearm3 = _mm256_blend_epi32(mbuf_init, mb32, 0xF0);
+
+               /* Write out mbuf rearm data */
+               _mm256_storeu_si256((__m256i *)&rx_pkts[i - 1]->rearm_data, rearm3);
+               _mm256_storeu_si256((__m256i *)&rx_pkts[i - 2]->rearm_data, rearm2);
+               _mm256_storeu_si256((__m256i *)&rx_pkts[i - 3]->rearm_data, rearm1);
+               _mm256_storeu_si256((__m256i *)&rx_pkts[i - 4]->rearm_data, rearm0);
+
+               received += IDPF_VPMD_DESCS_PER_LOOP;
+       }
+
+       queue->rx_tail += received;
+       queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0);
+       queue->rx_tail &= (queue->nb_rx_desc - 1);
+       if ((queue->rx_tail & 1) == 1 && received > 1) {
+               queue->rx_tail--;
+               received--;
+       }
+       queue->bufq2->rxrearm_nb += received;
+       return received;
+}
+
 static inline void
 idpf_singleq_vtx1(volatile struct ci_tx_desc *txdp,
                  struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
index fe870617bc..eda5f929cf 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
@@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
        return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
 }
 
-static __rte_always_inline void
-idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
-{
-       struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
-       volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
-       uint16_t rx_id;
-       int i;
-
-       rxdp += rx_bufq->rxrearm_start;
-
-       /* Pull 'n' more MBUFs into the software ring */
-       if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
-                                (void *)rxp,
-                                IDPF_RXQ_REARM_THRESH) < 0) {
-               if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
-                   rx_bufq->nb_rx_desc) {
-                       __m128i dma_addr0;
-
-                       dma_addr0 = _mm_setzero_si128();
-                       for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
-                               rxp[i] = &rx_bufq->fake_mbuf;
-                               _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
-                                               dma_addr0);
-                       }
-               }
-       rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
-                          IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
-               return;
-       }
-
-       /* Initialize the mbufs in vector, process 8 mbufs in one loop */
-       for (i = 0; i < IDPF_RXQ_REARM_THRESH;
-                       i += 8, rxp += 8, rxdp += 8) {
-               rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
-               rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
-               rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
-               rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
-               rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
-               rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
-               rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
-               rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
-       }
-
-       rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
-       if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
-               rx_bufq->rxrearm_start = 0;
-
-       rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
-
-       rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
-                            (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
-
-       /* Update the tail pointer on the NIC */
-       IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
-}
-
 static __rte_always_inline void
 idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
 {
-- 
2.34.1

Reply via email to