idpf: enable AVX2 for split queue Rx

Burakov, Anatoly Tue, 30 Sep 2025 06:28:19 -0700

On 9/30/2025 11:07 AM, Shaiq Wani wrote:

In case some CPUs don't support AVX512. Enable AVX2 for them to
get better per-core performance.


In the single queue model, the same descriptor queue is used by SW
to post descriptors to the device and used by device to report completed
descriptors to SW. While as the split queue model separates them into
different queues for parallel processing and improved performance.

Signed-off-by: Shaiq Wani <[email protected]>
---


Hi Shaiq,

<snip>

+
+       /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
+        * layout that will be merged into mbuf->rearm_data candidates.
+        */
+       const __m256i shuf = _mm256_set_epi8(
+               /* high 128 bits (desc 3 then desc 2 lanes) */
+               (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 11, 10, 5, 4,
+               (char)0xFF, (char)0xFF, 5, 4, (char)0xFF, (char)0xFF, 
(char)0xFF, (char)0xFF,
+               /* low 128 bits (desc 1 then desc 0 lanes) */
+               (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 11, 10, 5, 4,
+               (char)0xFF, (char)0xFF, 5, 4, (char)0xFF, (char)0xFF, 
(char)0xFF, (char)0xFF
+       );
+
+       /* mask that clears the high 16 bits of packet length word */
+       const __m256i len_mask = _mm256_set_epi32(
+               0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
+               0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
+       );
+
+       const __m256i ptype_mask = 
_mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
+
+       for (uint16_t i = 0; i < nb_pkts; i += 4, rxdp += 4) {

Same suggestion as in the other patch: I would prefer us using definedconstants rather than raw numbers, as it makes it easier to make changesdown the line.

+               /* Step 1: copy 4 mbuf pointers (64-bit each) into rx_pkts[] */
+               __m128i ptrs_lo = _mm_loadu_si128((const __m128i *)&sw_ring[i]);
+               __m128i ptrs_hi = _mm_loadu_si128((const __m128i *)&sw_ring[i + 
2]);
+               _mm_storeu_si128((__m128i *)&rx_pkts[i], ptrs_lo);
+               _mm_storeu_si128((__m128i *)&rx_pkts[i + 2], ptrs_hi);

Please correct me if I'm wrong here, but pointers are only 64-bit on64-bit platforms, so this code will not work correctly on 32-bit platforms.

+
+               /* Step 2: load four 128-bit descriptors */
+               __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, 
&rxdp[0]));
+               rte_compiler_barrier();
+               __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, 
&rxdp[1]));
+               rte_compiler_barrier();
+               __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, 
&rxdp[2]));
+               rte_compiler_barrier();
+               __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, 
&rxdp[3]));
+
+               /* Build 256-bit descriptor-pairs */
+               __m256i d01 = _mm256_set_m128i(d1, d0); /* low lane: d0, d1 */
+               __m256i d23 = _mm256_set_m128i(d3, d2); /* high lane: d2, d3 */
+
+               /* mask off high pkt_len bits  */
+               __m256i desc01 = _mm256_and_si256(d01, len_mask);
+               __m256i desc23 = _mm256_and_si256(d23, len_mask);
+
+               /* Step 3: shuffle relevant bytes into mbuf rearm candidates */
+               __m256i mb01 = _mm256_shuffle_epi8(desc01, shuf);
+               __m256i mb23 = _mm256_shuffle_epi8(desc23, shuf);
+
+               /* Step 4: extract ptypes from descriptors and translate via 
table */
+               __m256i pt01 = _mm256_and_si256(d01, ptype_mask);
+               __m256i pt23 = _mm256_and_si256(d23, ptype_mask);
+
+               uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt01, 1);
+               uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt01, 9);
+               uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt23, 1);
+               uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt23, 9);
+
+               mb01 = _mm256_insert_epi32(mb01, (int)ptype_tbl[ptype1], 2);
+               mb01 = _mm256_insert_epi32(mb01, (int)ptype_tbl[ptype0], 0);
+               mb23 = _mm256_insert_epi32(mb23, (int)ptype_tbl[ptype3], 2);
+               mb23 = _mm256_insert_epi32(mb23, (int)ptype_tbl[ptype2], 0);
+
+               /* Step 5: build rearm vectors */
+               __m128i mb01_lo = _mm256_castsi256_si128(mb01);
+               __m128i mb01_hi = _mm256_extracti128_si256(mb01, 1);
+               __m128i mb23_lo = _mm256_castsi256_si128(mb23);
+               __m128i mb23_hi = _mm256_extracti128_si256(mb23, 1);
+
+               __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, 
_mm256_set_m128i
+                                                       (mb01_hi, mb01_lo), 
0x20);
+               __m256i rearm1 = _mm256_blend_epi32(mbuf_init, _mm256_set_m128i
+                                                       (mb01_hi, mb01_lo), 
0xF0);
+               __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, 
_mm256_set_m128i
+                                                       (mb23_hi, mb23_lo), 
0x20);
+               __m256i rearm3 = _mm256_blend_epi32(mbuf_init, _mm256_set_m128i
+                                                       (mb23_hi, mb23_lo), 
0xF0);

I don't particularly like the newlines here, I would prefer having_mm256_set_m128i on the same line as its arguments, as this looks verymisleading.

+
+               /* Step 6: per-descriptor scalar validity checks */
+               bool valid0 = false, valid1 = false, valid2 = false, valid3 = 
false;
+               {
+                       uint64_t g0 = 
rxdp[0].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+                       uint64_t g1 = 
rxdp[1].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+                       uint64_t g2 = 
rxdp[2].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+                       uint64_t g3 = 
rxdp[3].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
+
+                       bool dd0 = (g0 & 1ULL) != 0ULL;
+                       bool dd1 = (g1 & 1ULL) != 0ULL;
+                       bool dd2 = (g2 & 1ULL) != 0ULL;
+                       bool dd3 = (g3 & 1ULL) != 0ULL;
+
+                       uint64_t gen0 = (g0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) 
&
+                                               
VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+                       uint64_t gen1 = (g1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) 
&
+                                               
VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+                       uint64_t gen2 = (g2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) 
&
+                                               
VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+                       uint64_t gen3 = (g3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) 
&
+                                               
VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
+
+                       valid0 = dd0 && (gen0 == queue->expected_gen_id);
+                       valid1 = dd1 && (gen1 == queue->expected_gen_id);
+                       valid2 = dd2 && (gen2 == queue->expected_gen_id);
+                       valid3 = dd3 && (gen3 == queue->expected_gen_id);
+               }
+
+               unsigned int mask =     (valid0 ? 1U : 0U) | (valid1 ? 2U : 0U)
+                                               | (valid2 ? 4U : 0U) | (valid3 
? 8U : 0U);


Whitespace is a bit weird here

--
Thanks,
Anatoly

Re: [PATCH v4 1/2] net/idpf: enable AVX2 for split queue Rx

Reply via email to