Add a 64-byte loop that maintains 4 fold registers and processes 64 bytes at a time. The 4 fold registers are then reduced to a single 16-byte fold, similar to the AVX512 implementation. This technique is described in the Intel paper "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction".
This results in roughly 50% performance improvement due to better ILP for large input sizes like 1024. Signed-off-by: Shreesh Adiga <[email protected]> --- lib/net/net_crc_sse.c | 59 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c index 3b6fbfecac..dfef8ecc59 100644 --- a/lib/net/net_crc_sse.c +++ b/lib/net/net_crc_sse.c @@ -14,6 +14,7 @@ /** PCLMULQDQ CRC computation context structure */ struct crc_pclmulqdq_ctx { __m128i rk1_rk2; + __m128i rk3_rk4; __m128i rk5_rk6; __m128i rk7_rk8; }; @@ -150,9 +151,36 @@ crc32_eth_calc_pclmulqdq( temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0); /** - * Folding all data into single 16 byte data block - * Assumes: fold holds first 16 bytes of data + * Folding all data into 4 parallel 16 byte data block + * Later folds 4 parallel blocks into single fold block */ + if (likely(data_len >= 64)) { + __m128i fold1, fold2, fold3, fold4; + __m128i temp1, temp2, temp3, temp4; + fold1 = _mm_loadu_si128((const __m128i *)(data + 0)); + fold2 = _mm_loadu_si128((const __m128i *)(data + 16)); + fold3 = _mm_loadu_si128((const __m128i *)(data + 32)); + fold4 = _mm_loadu_si128((const __m128i *)(data + 48)); + fold1 = _mm_xor_si128(fold1, temp); + k = params->rk1_rk2; + + for (n = 64; (n + 64) <= data_len; n += 64) { + temp1 = _mm_loadu_si128((const __m128i *)&data[n]); + temp2 = _mm_loadu_si128((const __m128i *)&data[n + 16]); + temp3 = _mm_loadu_si128((const __m128i *)&data[n + 32]); + temp4 = _mm_loadu_si128((const __m128i *)&data[n + 48]); + fold1 = crcr32_folding_round(temp1, k, fold1); + fold2 = crcr32_folding_round(temp2, k, fold2); + fold3 = crcr32_folding_round(temp3, k, fold3); + fold4 = crcr32_folding_round(temp4, k, fold4); + } + + k = params->rk3_rk4; + fold1 = crcr32_folding_round(fold2, k, fold1); + fold1 = crcr32_folding_round(fold3, k, fold1); + fold = crcr32_folding_round(fold4, k, fold1); + goto single_fold_loop; + } if 
(unlikely(data_len < 32)) { if (unlikely(data_len == 16)) { @@ -182,7 +210,7 @@ crc32_eth_calc_pclmulqdq( fold = _mm_loadu_si128((const __m128i *)data); fold = _mm_xor_si128(fold, temp); n = 16; - k = params->rk1_rk2; + k = params->rk3_rk4; goto partial_bytes; } @@ -191,9 +219,12 @@ crc32_eth_calc_pclmulqdq( fold = _mm_loadu_si128((const __m128i *)data); fold = _mm_xor_si128(fold, temp); - /** Main folding loop - the last 16 bytes is processed separately */ - k = params->rk1_rk2; - for (n = 16; (n + 16) <= data_len; n += 16) { + /** Single folding loop - the last 16 bytes is processed separately */ + k = params->rk3_rk4; + n = 16; + +single_fold_loop: + for (; (n + 16) <= data_len; n += 16) { temp = _mm_loadu_si128((const __m128i *)&data[n]); fold = crcr32_folding_round(temp, k, fold); } @@ -236,12 +267,14 @@ crc32_eth_calc_pclmulqdq( void rte_net_crc_sse42_init(void) { - uint64_t k1, k2, k5, k6; + uint64_t k1, k2, k3, k4, k5, k6; uint64_t p = 0, q = 0; /** Initialize CRC16 data */ - k1 = 0x189aeLLU; - k2 = 0x8e10LLU; + k1 = 0x14ff2LLU; + k2 = 0x19a3cLLU; + k3 = 0x189aeLLU; + k4 = 0x8e10LLU; k5 = 0x189aeLLU; k6 = 0x114aaLLU; q = 0x11c581910LLU; @@ -249,12 +282,15 @@ rte_net_crc_sse42_init(void) /** Save the params in context structure */ crc16_ccitt_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1); + crc16_ccitt_pclmulqdq.rk3_rk4 = _mm_set_epi64x(k4, k3); crc16_ccitt_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5); crc16_ccitt_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q); /** Initialize CRC32 data */ - k1 = 0xccaa009eLLU; - k2 = 0x1751997d0LLU; + k1 = 0x1c6e41596LLU; + k2 = 0x154442bd4LLU; + k3 = 0xccaa009eLLU; + k4 = 0x1751997d0LLU; k5 = 0xccaa009eLLU; k6 = 0x163cd6124LLU; q = 0x1f7011640LLU; @@ -262,6 +298,7 @@ rte_net_crc_sse42_init(void) /** Save the params in context structure */ crc32_eth_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1); + crc32_eth_pclmulqdq.rk3_rk4 = _mm_set_epi64x(k4, k3); crc32_eth_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5); crc32_eth_pclmulqdq.rk7_rk8 = 
_mm_set_epi64x(p, q); } -- 2.53.0

