Replace the clearing of the lower 32 bits of the XMM register with a blend against a zero register. Remove the clearing of the upper 64 bits of tmp1, as it is redundant. tmp1, after having its upper bits cleared, was being XORed into tmp2 before bits 95:64 of tmp2 were returned. That XOR leaves bits 95:64 unchanged, because tmp1 has its upper 64 bits (127:64) cleared to 0. Once the XOR operation is removed, the clearing of the upper 64 bits of tmp1 becomes redundant and can therefore be removed as well.
Signed-off-by: Shreesh Adiga <[email protected]> --- lib/net/net_crc_sse.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c index 112dc94ac1..94d847b301 100644 --- a/lib/net/net_crc_sse.c +++ b/lib/net/net_crc_sse.c @@ -96,23 +96,14 @@ crcr32_reduce_128_to_64(__m128i data128, __m128i precomp) static __rte_always_inline uint32_t crcr32_reduce_64_to_32(__m128i data64, __m128i precomp) { - static const alignas(16) uint32_t mask1[4] = { - 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 - }; - - static const alignas(16) uint32_t mask2[4] = { - 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff - }; __m128i tmp0, tmp1, tmp2; - tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2)); + tmp0 = _mm_blend_epi16(data64, _mm_setzero_si128(), 0x3); tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00); tmp1 = _mm_xor_si128(tmp1, tmp0); - tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1)); tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10); - tmp2 = _mm_xor_si128(tmp2, tmp1); tmp2 = _mm_xor_si128(tmp2, tmp0); return _mm_extract_epi32(tmp2, 2); -- 2.51.0

