Vector code reorganisation/deduplication: to avoid maintaining two nearly identical implementations of calc_addr() (one for SSE, one for AVX2), replace them with a new macro that suits both the SSE and AVX2 code paths. Also remove the MM_* macros that are no longer needed.
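The deduplication relies on the preprocessor building each intrinsic name from a prefix (P) and a register width (S), so a single macro body expands to either the SSE or the AVX2 intrinsics. A minimal sketch of that idea, for illustration only (the VAND wrapper and the two helper functions below are hypothetical and not part of this patch; only _mm_and_si128/_mm256_and_si256 are real intrinsics, and the AVX2 helper assumes the file is built with AVX2 enabled):

#include <immintrin.h>

/*
 * Token-paste prefix P and width S into one intrinsic name,
 * the same pattern ACL_TR_CALC_ADDR() uses for every operation.
 */
#define VAND(P, S, d, a, b) do { \
	(d) = _##P##_and_si##S((a), (b)); \
} while (0)

static inline __m128i
vand_sse(__m128i a, __m128i b)
{
	__m128i d;

	VAND(mm, 128, d, a, b);		/* expands to _mm_and_si128(a, b) */
	return d;
}

static inline __m256i
vand_avx2(__m256i a, __m256i b)		/* requires -mavx2 (or a target attribute) */
{
	__m256i d;

	VAND(mm256, 256, d, a, b);	/* expands to _mm256_and_si256(a, b) */
	return d;
}

In the patch itself the SSE path invokes ACL_TR_CALC_ADDR(mm, 128, ...) while the AVX2 path invokes ACL_TR_CALC_ADDR(mm256, 256, ...), so the same macro body serves both ISAs.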
Signed-off-by: Konstantin Ananyev <konstantin.ananyev at intel.com>
---
 lib/librte_acl/acl_run_avx2.h                   |  87 +++++-------
 lib/librte_acl/acl_run_sse.h                    | 178 ++++++++----------------
 lib/librte_acl/acl_vect.h                       | 132 ++++++++----------
 lib/librte_eal/common/include/rte_common_vect.h |  12 ++
 4 files changed, 160 insertions(+), 249 deletions(-)

diff --git a/lib/librte_acl/acl_run_avx2.h b/lib/librte_acl/acl_run_avx2.h
index 1688c50..b01a46a 100644
--- a/lib/librte_acl/acl_run_avx2.h
+++ b/lib/librte_acl/acl_run_avx2.h
@@ -73,51 +73,19 @@ static const rte_ymm_t ymm_ones_16 = {
 	},
 };
 
-static inline __attribute__((always_inline)) ymm_t
-calc_addr_avx2(ymm_t index_mask, ymm_t next_input, ymm_t shuffle_input,
-	ymm_t ones_16, ymm_t tr_lo, ymm_t tr_hi)
-{
-	ymm_t in, node_type, r, t;
-	ymm_t dfa_msk, dfa_ofs, quad_ofs;
-	ymm_t addr;
-
-	const ymm_t range_base = _mm256_set_epi32(
-		0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00,
-		0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00);
-
-	t = _mm256_xor_si256(index_mask, index_mask);
-	in = _mm256_shuffle_epi8(next_input, shuffle_input);
-
-	/* Calc node type and node addr */
-	node_type = _mm256_andnot_si256(index_mask, tr_lo);
-	addr = _mm256_and_si256(index_mask, tr_lo);
-
-	/* DFA calculations. */
-
-	dfa_msk = _mm256_cmpeq_epi32(node_type, t);
-
-	r = _mm256_srli_epi32(in, 30);
-	r = _mm256_add_epi8(r, range_base);
-
-	t = _mm256_srli_epi32(in, 24);
-	r = _mm256_shuffle_epi8(tr_hi, r);
-
-	dfa_ofs = _mm256_sub_epi32(t, r);
-
-	/* QUAD/SINGLE caluclations. */
-
-	t = _mm256_cmpgt_epi8(in, tr_hi);
-	t = _mm256_sign_epi8(t, t);
-	t = _mm256_maddubs_epi16(t, t);
-	quad_ofs = _mm256_madd_epi16(t, ones_16);
-
-	/* blend DFA and QUAD/SINGLE. */
-	t = _mm256_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
-
-	addr = _mm256_add_epi32(addr, t);
-	return addr;
-}
+static const rte_ymm_t ymm_range_base = {
+	.u32 = {
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+	},
+};
 
+/*
+ * Process 8 transitions in parallel.
+ * tr_lo contains low 32 bits for 8 transitions.
+ * tr_hi contains high 32 bits for 8 transitions.
+ * next_input contains up to 4 input bytes for 8 flows.
+ */
 static inline __attribute__((always_inline)) ymm_t
 transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
 {
@@ -126,8 +94,10 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
 
 	tr = (const int32_t *)(uintptr_t)trans;
 
-	addr = calc_addr_avx2(ymm_index_mask.y, next_input, ymm_shuffle_input.y,
-		ymm_ones_16.y, *tr_lo, *tr_hi);
+	/* Calculate the address (array index) for all 8 transitions. */
+	ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,
+		ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,
+		*tr_lo, *tr_hi);
 
 	/* load lower 32 bits of 8 transactions at once. */
 	*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
@@ -140,6 +110,11 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
 	return next_input;
 }
 
+/*
+ * Process matches for 8 flows.
+ * tr_lo contains low 32 bits for 8 transitions.
+ * tr_hi contains high 32 bits for 8 transitions.
+ */
 static inline void
 acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
 	struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
@@ -155,6 +130,11 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
 	l0 = _mm256_castsi256_si128(*tr_lo);
 
 	for (i = 0; i != RTE_DIM(tr) / 2; i++) {
+
+		/*
+		 * Extract low 32 bits of each transition.
+		 * That's enough to process the match.
+		 */
 		tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
 		tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
 
@@ -167,12 +147,14 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
 			ctx, parms, flows, resolve_priority_sse);
 	}
 
+	/* Collect new transitions into 2 YMM registers. */
 	t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
 	t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
 
-	lo = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-	hi = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+	/* For each transition: put low 32 into tr_lo and high 32 into tr_hi */
+	ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);
 
+	/* Keep transitions with NOMATCH intact. */
 	*tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
 	*tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
 }
@@ -200,6 +182,9 @@ acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
 	}
 }
 
+/*
+ * Execute trie traversal for up to 16 flows in parallel.
+ */
 static inline int
 search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint32_t *results, uint32_t total_packets, uint32_t categories)
@@ -225,16 +210,14 @@ search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	t1 = _mm256_set_epi64x(index_array[7], index_array[6],
 		index_array[3], index_array[2]);
 
-	tr_lo[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-	tr_hi[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);
 
 	t0 = _mm256_set_epi64x(index_array[13], index_array[12],
 		index_array[9], index_array[8]);
 	t1 = _mm256_set_epi64x(index_array[15], index_array[14],
 		index_array[11], index_array[10]);
 
-	tr_lo[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-	tr_hi[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);
 
 	/* Check for any matches. */
 	acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],
diff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h
index 4a174e9..ad40a67 100644
--- a/lib/librte_acl/acl_run_sse.h
+++ b/lib/librte_acl/acl_run_sse.h
@@ -67,6 +67,12 @@ static const rte_xmm_t xmm_index_mask = {
 	},
 };
 
+static const rte_xmm_t xmm_range_base = {
+	.u32 = {
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+	},
+};
+
 /*
  * Resolve priority for multiple results (sse version).
  * This consists comparing the priority of the current traversal with the
@@ -90,25 +96,28 @@ resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
 			(xmm_t *)(&parms[n].cmplt->priority[x]);
 
 		/* get results and priorities for completed trie */
-		results = MM_LOADU((const xmm_t *)&p[transition].results[x]);
-		priority = MM_LOADU((const xmm_t *)&p[transition].priority[x]);
+		results = _mm_loadu_si128(
+			(const xmm_t *)&p[transition].results[x]);
+		priority = _mm_loadu_si128(
+			(const xmm_t *)&p[transition].priority[x]);
 
 		/* if this is not the first completed trie */
 		if (parms[n].cmplt->count != ctx->num_tries) {
 
 			/* get running best results and their priorities */
-			results1 = MM_LOADU(saved_results);
-			priority1 = MM_LOADU(saved_priority);
+			results1 = _mm_loadu_si128(saved_results);
+			priority1 = _mm_loadu_si128(saved_priority);
 
 			/* select results that are highest priority */
-			selector = MM_CMPGT32(priority1, priority);
-			results = MM_BLENDV8(results, results1, selector);
-			priority = MM_BLENDV8(priority, priority1, selector);
+			selector = _mm_cmpgt_epi32(priority1, priority);
+			results = _mm_blendv_epi8(results, results1, selector);
+			priority = _mm_blendv_epi8(priority, priority1,
+				selector);
 		}
 
 		/* save running best results and their priorities */
-		MM_STOREU(saved_results, results);
-		MM_STOREU(saved_priority, priority);
+		_mm_storeu_si128(saved_results, results);
+		_mm_storeu_si128(saved_priority, priority);
 	}
 }
 
@@ -122,11 +131,11 @@ acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,
 	uint64_t transition1, transition2;
 
 	/* extract transition from low 64 bits. */
-	transition1 = MM_CVT64(*indices);
+	transition1 = _mm_cvtsi128_si64(*indices);
 
 	/* extract transition from high 64 bits. */
-	*indices = MM_SHUFFLE32(*indices, SHUFFLE32_SWAP64);
-	transition2 = MM_CVT64(*indices);
+	*indices = _mm_shuffle_epi32(*indices, SHUFFLE32_SWAP64);
+	transition2 = _mm_cvtsi128_si64(*indices);
 
 	transition1 = acl_match_check(transition1, slot, ctx, parms, flows,
 		resolve_priority_sse);
@@ -134,7 +143,7 @@ acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,
 		parms, flows, resolve_priority_sse);
 
 	/* update indices with new transitions. */
-	*indices = MM_SET64(transition2, transition1);
+	*indices = _mm_set_epi64x(transition2, transition1);
 }
 
 /*
@@ -148,98 +157,24 @@ acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
 	xmm_t temp;
 
 	/* put low 32 bits of each transition into one register */
-	temp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2,
+	temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,
 		0x88);
 	/* test for match node */
-	temp = MM_AND(match_mask, temp);
+	temp = _mm_and_si128(match_mask, temp);
 
-	while (!MM_TESTZ(temp, temp)) {
+	while (!_mm_testz_si128(temp, temp)) {
 		acl_process_matches(indices1, slot, ctx, parms, flows);
 		acl_process_matches(indices2, slot + 2, ctx, parms, flows);
 
-		temp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,
+		temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1,
 			(__m128)*indices2,
 			0x88);
-		temp = MM_AND(match_mask, temp);
+		temp = _mm_and_si128(match_mask, temp);
 	}
 }
 
 /*
- * Calculate the address of the next transition for
- * all types of nodes. Note that only DFA nodes and range
- * nodes actually transition to another node. Match
- * nodes don't move.
- */
-static inline __attribute__((always_inline)) xmm_t
-calc_addr_sse(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
-	xmm_t ones_16, xmm_t tr_lo, xmm_t tr_hi)
-{
-	xmm_t addr, node_types;
-	xmm_t dfa_msk, dfa_ofs, quad_ofs;
-	xmm_t in, r, t;
-
-	const xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,
-		0xffffff04, 0xffffff00);
-
-	/*
-	 * Note that no transition is done for a match
-	 * node and therefore a stream freezes when
-	 * it reaches a match.
-	 */
-
-	t = MM_XOR(index_mask, index_mask);
-
-	/* shuffle input byte to all 4 positions of 32 bit value */
-	in = MM_SHUFFLE8(next_input, shuffle_input);
-
-	/* Calc node type and node addr */
-	node_types = MM_ANDNOT(index_mask, tr_lo);
-	addr = MM_AND(index_mask, tr_lo);
-
-	/*
-	 * Calc addr for DFAs - addr = dfa_index + input_byte
-	 */
-
-	/* mask for DFA type (0) nodes */
-	dfa_msk = MM_CMPEQ32(node_types, t);
-
-	r = _mm_srli_epi32(in, 30);
-	r = _mm_add_epi8(r, range_base);
-
-	t = _mm_srli_epi32(in, 24);
-	r = _mm_shuffle_epi8(tr_hi, r);
-
-	dfa_ofs = _mm_sub_epi32(t, r);
-
-	/*
-	 * Calculate number of range boundaries that are less than the
-	 * input value. Range boundaries for each node are in signed 8 bit,
-	 * ordered from -128 to 127 in the indices2 register.
-	 * This is effectively a popcnt of bytes that are greater than the
-	 * input byte.
-	 */
-
-	/* check ranges */
-	t = MM_CMPGT8(in, tr_hi);
-
-	/* convert -1 to 1 (bytes greater than input byte */
-	t = MM_SIGN8(t, t);
-
-	/* horizontal add pairs of bytes into words */
-	t = MM_MADD8(t, t);
-
-	/* horizontal add pairs of words into dwords */
-	quad_ofs = MM_MADD16(t, ones_16);
-
-	/* blend DFA and QUAD/SINGLE. */
-	t = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
-
-	/* add index into node position */
-	return MM_ADD32(addr, t);
-}
-
-/*
- * Process 4 transitions (in 2 SIMD registers) in parallel
+ * Process 4 transitions (in 2 XMM registers) in parallel
  */
 static inline __attribute__((always_inline)) xmm_t
 transition4(xmm_t next_input, const uint64_t *trans,
@@ -249,39 +184,36 @@ transition4(xmm_t next_input, const uint64_t *trans,
 	uint64_t trans0, trans2;
 
 	/* Shuffle low 32 into tr_lo and high 32 into tr_hi */
-	tr_lo = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,
-		0x88);
-	tr_hi = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,
-		0xdd);
+	ACL_TR_HILO(mm, __m128, *indices1, *indices2, tr_lo, tr_hi);
 
 	/* Calculate the address (array index) for all 4 transitions. */
-
-	addr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x,
-		xmm_ones_16.x, tr_lo, tr_hi);
+	ACL_TR_CALC_ADDR(mm, 128, addr, xmm_index_mask.x, next_input,
+		xmm_shuffle_input.x, xmm_ones_16.x, xmm_range_base.x,
+		tr_lo, tr_hi);
 
 	/* Gather 64 bit transitions and pack back into 2 registers. */
 
-	trans0 = trans[MM_CVT32(addr)];
+	trans0 = trans[_mm_cvtsi128_si32(addr)];
 
 	/* get slot 2 */
 
 	/* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */
-	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT2);
-	trans2 = trans[MM_CVT32(addr)];
+	addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT2);
+	trans2 = trans[_mm_cvtsi128_si32(addr)];
 
 	/* get slot 1 */
 
 	/* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */
-	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
-	*indices1 = MM_SET64(trans[MM_CVT32(addr)], trans0);
+	addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT1);
+	*indices1 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans0);
 
 	/* get slot 3 */
 
 	/* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */
-	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT3);
-	*indices2 = MM_SET64(trans[MM_CVT32(addr)], trans2);
+	addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT3);
+	*indices2 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans2);
 
-	return MM_SRL32(next_input, CHAR_BIT);
+	return _mm_srli_epi32(next_input, CHAR_BIT);
 }
 
 /*
@@ -314,11 +246,11 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	 * indices4 contains index_array[6,7]
 	 */
 
-	indices1 = MM_LOADU((xmm_t *) &index_array[0]);
-	indices2 = MM_LOADU((xmm_t *) &index_array[2]);
+	indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]);
+	indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]);
 
-	indices3 = MM_LOADU((xmm_t *) &index_array[4]);
-	indices4 = MM_LOADU((xmm_t *) &index_array[6]);
+	indices3 = _mm_loadu_si128((xmm_t *) &index_array[4]);
+	indices4 = _mm_loadu_si128((xmm_t *) &index_array[6]);
 
 	/* Check for any matches. */
 	acl_match_check_x4(0, ctx, parms, &flows,
@@ -332,14 +264,14 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		input0 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
 		input1 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 4));
 
-		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
-		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 5), 1);
+		input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 1), 1);
+		input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 5), 1);
 
-		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 2), 2);
-		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 6), 2);
+		input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 2), 2);
+		input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 6), 2);
 
-		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 3), 3);
-		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 7), 3);
+		input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 3), 3);
+		input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 7), 3);
 
 		/* Process the 4 bytes of input on each stream. */
 
@@ -395,8 +327,8 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
 	}
 
-	indices1 = MM_LOADU((xmm_t *) &index_array[0]);
-	indices2 = MM_LOADU((xmm_t *) &index_array[2]);
+	indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]);
+	indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]);
 
 	/* Check for any matches. */
 	acl_match_check_x4(0, ctx, parms, &flows,
@@ -406,9 +338,9 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 		/* Gather 4 bytes of input data for each stream. */
 		input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
-		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
-		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
-		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);
+		input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 1), 1);
+		input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 2), 2);
+		input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 3), 3);
 
 		/* Process the 4 bytes of input on each stream. */
 		input = transition4(input, flows.trans, &indices1, &indices2);
diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h
index d813600..6cc1999 100644
--- a/lib/librte_acl/acl_vect.h
+++ b/lib/librte_acl/acl_vect.h
@@ -44,86 +44,70 @@ extern "C" {
 #endif
 
-#define MM_ADD16(a, b)		_mm_add_epi16(a, b)
-#define MM_ADD32(a, b)		_mm_add_epi32(a, b)
-#define MM_ALIGNR8(a, b, c)	_mm_alignr_epi8(a, b, c)
-#define MM_AND(a, b)		_mm_and_si128(a, b)
-#define MM_ANDNOT(a, b)		_mm_andnot_si128(a, b)
-#define MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
-#define MM_CMPEQ16(a, b)	_mm_cmpeq_epi16(a, b)
-#define MM_CMPEQ32(a, b)	_mm_cmpeq_epi32(a, b)
-#define MM_CMPEQ8(a, b)		_mm_cmpeq_epi8(a, b)
-#define MM_CMPGT32(a, b)	_mm_cmpgt_epi32(a, b)
-#define MM_CMPGT8(a, b)		_mm_cmpgt_epi8(a, b)
-#define MM_CVT(a)		_mm_cvtsi32_si128(a)
-#define MM_CVT32(a)		_mm_cvtsi128_si32(a)
-#define MM_CVTU32(a)		_mm_cvtsi32_si128(a)
-#define MM_INSERT16(a, c, b)	_mm_insert_epi16(a, c, b)
-#define MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
-#define MM_LOAD(a)		_mm_load_si128(a)
-#define MM_LOADH_PI(a, b)	_mm_loadh_pi(a, b)
-#define MM_LOADU(a)		_mm_loadu_si128(a)
-#define MM_MADD16(a, b)		_mm_madd_epi16(a, b)
-#define MM_MADD8(a, b)		_mm_maddubs_epi16(a, b)
-#define MM_MOVEMASK8(a)		_mm_movemask_epi8(a)
-#define MM_OR(a, b)		_mm_or_si128(a, b)
-#define MM_SET1_16(a)		_mm_set1_epi16(a)
-#define MM_SET1_32(a)		_mm_set1_epi32(a)
-#define MM_SET1_64(a)		_mm_set1_epi64(a)
-#define MM_SET1_8(a)		_mm_set1_epi8(a)
-#define MM_SET32(a, b, c, d)	_mm_set_epi32(a, b, c, d)
-#define MM_SHUFFLE32(a, b)	_mm_shuffle_epi32(a, b)
-#define MM_SHUFFLE8(a, b)	_mm_shuffle_epi8(a, b)
-#define MM_SHUFFLEPS(a, b, c)	_mm_shuffle_ps(a, b, c)
-#define MM_SIGN8(a, b)		_mm_sign_epi8(a, b)
-#define MM_SLL64(a, b)		_mm_sll_epi64(a, b)
-#define MM_SRL128(a, b)		_mm_srli_si128(a, b)
-#define MM_SRL16(a, b)		_mm_srli_epi16(a, b)
-#define MM_SRL32(a, b)		_mm_srli_epi32(a, b)
-#define MM_STORE(a, b)		_mm_store_si128(a, b)
-#define MM_STOREU(a, b)		_mm_storeu_si128(a, b)
-#define MM_TESTZ(a, b)		_mm_testz_si128(a, b)
-#define MM_XOR(a, b)		_mm_xor_si128(a, b)
-
-#define MM_SET16(a, b, c, d, e, f, g, h) \
-	_mm_set_epi16(a, b, c, d, e, f, g, h)
-
-#define MM_SET8(c0, c1, c2, c3, c4, c5, c6, c7, \
-	c8, c9, cA, cB, cC, cD, cE, cF) \
-	_mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7, \
-		c8, c9, cA, cB, cC, cD, cE, cF)
-
-#ifdef RTE_ARCH_X86_64
-
-#define MM_CVT64(a) _mm_cvtsi128_si64(a)
-
-#else
-
-#define MM_CVT64(a) ({ \
-	rte_xmm_t m;    \
-	m.m = (a);      \
-	(m.u64[0]);     \
-})
-
-#endif /*RTE_ARCH_X86_64 */
 
 /*
- * Prior to version 12.1 icc doesn't support _mm_set_epi64x.
+ * Takes 2 SIMD registers containing N transitions each (tr0, tr1).
+ * Shuffles them into a different representation:
+ * lo - contains low 32 bits of given N transitions.
+ * hi - contains high 32 bits of given N transitions.
  */
-#if (defined(__ICC) && __ICC < 1210)
+#define ACL_TR_HILO(P, TC, tr0, tr1, lo, hi) do { \
+	lo = (typeof(lo))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0x88); \
+	hi = (typeof(hi))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0xdd); \
+} while (0)
 
-#define MM_SET64(a, b) ({ \
-	rte_xmm_t m;    \
-	m.u64[0] = b;   \
-	m.u64[1] = a;   \
-	(m.m);          \
-})
-#else
-
-#define MM_SET64(a, b) _mm_set_epi64x(a, b)
 
+/*
+ * Calculate the address of the next transition for
+ * all types of nodes. Note that only DFA nodes and range
+ * nodes actually transition to another node. Match
+ * nodes are not supposed to be encountered here.
+ * For quad range nodes:
+ * Calculate number of range boundaries that are less than the
+ * input value. Range boundaries for each node are in signed 8 bit,
+ * ordered from -128 to 127.
+ * This is effectively a popcnt of bytes that are greater than the
+ * input byte.
+ * Single nodes are processed in the same way as quad range nodes.
+*/
+#define ACL_TR_CALC_ADDR(P, S, \
+	addr, index_mask, next_input, shuffle_input, \
+	ones_16, range_base, tr_lo, tr_hi) do { \
+	\
+	typeof(addr) in, node_type, r, t; \
+	typeof(addr) dfa_msk, dfa_ofs, quad_ofs; \
+	\
+	t = _##P##_xor_si##S(index_mask, index_mask); \
+	in = _##P##_shuffle_epi8(next_input, shuffle_input); \
+	\
+	/* Calc node type and node addr */ \
+	node_type = _##P##_andnot_si##S(index_mask, tr_lo); \
+	addr = _##P##_and_si##S(index_mask, tr_lo); \
+	\
+	/* mask for DFA type(0) nodes */ \
+	dfa_msk = _##P##_cmpeq_epi32(node_type, t); \
+	\
+	/* DFA calculations. */ \
+	r = _##P##_srli_epi32(in, 30); \
+	r = _##P##_add_epi8(r, range_base); \
+	t = _##P##_srli_epi32(in, 24); \
+	r = _##P##_shuffle_epi8(tr_hi, r); \
+	\
+	dfa_ofs = _##P##_sub_epi32(t, r); \
+	\
+	/* QUAD/SINGLE calculations. */ \
+	t = _##P##_cmpgt_epi8(in, tr_hi); \
+	t = _##P##_sign_epi8(t, t); \
+	t = _##P##_maddubs_epi16(t, t); \
+	quad_ofs = _##P##_madd_epi16(t, ones_16); \
+	\
+	/* blend DFA and QUAD/SINGLE. */ \
+	t = _##P##_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk); \
+	\
+	/* calculate address for next transitions. */ \
+	addr = _##P##_add_epi32(addr, t); \
+} while (0)
 
-#endif /* (defined(__ICC) && __ICC < 1210) */
 
 #ifdef __cplusplus
 }
diff --git a/lib/librte_eal/common/include/rte_common_vect.h b/lib/librte_eal/common/include/rte_common_vect.h
index 617470b..54ec70f 100644
--- a/lib/librte_eal/common/include/rte_common_vect.h
+++ b/lib/librte_eal/common/include/rte_common_vect.h
@@ -109,6 +109,18 @@ typedef union rte_ymm {
 })
 #endif
 
+/*
+ * Prior to version 12.1 icc doesn't support _mm_set_epi64x.
+ */
+#if (defined(__ICC) && __ICC < 1210)
+#define _mm_set_epi64x(a, b) ({ \
+	rte_xmm_t m;    \
+	m.u64[0] = b;   \
+	m.u64[1] = a;   \
+	(m.x);          \
+})
+#endif /* (defined(__ICC) && __ICC < 1210) */
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.5.3
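For readers who want to see the ACL_TR_HILO() split in isolation, here is a small self-contained sketch (not part of the patch; the sample values and the main() test harness are illustrative only, and it assumes GCC/clang vector-cast and typeof extensions, as the patch itself does). Two XMM registers hold four 64-bit transitions; _mm_shuffle_ps with selector 0x88 collects the four low 32-bit halves and 0xdd the four high halves:

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

/* Same macro body as the patch introduces in acl_vect.h. */
#define ACL_TR_HILO(P, TC, tr0, tr1, lo, hi) do { \
	lo = (typeof(lo))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0x88); \
	hi = (typeof(hi))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0xdd); \
} while (0)

int
main(void)
{
	/* four 64-bit "transitions" with distinct low/high halves */
	__m128i t01 = _mm_set_epi64x(0x1111111100000001ULL, 0x2222222200000002ULL);
	__m128i t23 = _mm_set_epi64x(0x3333333300000003ULL, 0x4444444400000004ULL);
	__m128i lo, hi;
	uint32_t v[4];

	ACL_TR_HILO(mm, __m128, t01, t23, lo, hi);

	_mm_storeu_si128((__m128i *)v, lo);
	printf("lo: %x %x %x %x\n", v[0], v[1], v[2], v[3]);
	/* prints: 2 1 4 3 (low 32 bits of each transition) */

	_mm_storeu_si128((__m128i *)v, hi);
	printf("hi: %x %x %x %x\n", v[0], v[1], v[2], v[3]);
	/* prints: 22222222 11111111 44444444 33333333 (high 32 bits) */
	return 0;
}

The selectors 0x88 and 0xdd pick the even and odd 32-bit elements of the two source registers, which is why the same two-line pattern is reused unchanged for the mm256 variant in acl_run_avx2.h.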