Hi,
> Current hitmask includes padding due to Intel's SIMD > implementation detail. This patch allows non Intel SIMD > implementations to benefit from a dense hitmask. > In addition, the new dense hitmask interweave the primary > and secondary matches which allow a better cache usage and > enable future improvements for the SIMD implementations > > Signed-off-by: Yoan Picchi <yoan.pic...@arm.com> > Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com> > Reviewed-by: Nathan Brown <nathan.br...@arm.com> > --- > .mailmap | 2 + > lib/hash/arch/arm/compare_signatures.h | 61 +++++++ > lib/hash/arch/common/compare_signatures.h | 38 +++++ > lib/hash/arch/x86/compare_signatures.h | 53 ++++++ > lib/hash/rte_cuckoo_hash.c | 192 ++++++++++++---------- > 5 files changed, 255 insertions(+), 91 deletions(-) > create mode 100644 lib/hash/arch/arm/compare_signatures.h > create mode 100644 lib/hash/arch/common/compare_signatures.h > create mode 100644 lib/hash/arch/x86/compare_signatures.h > > diff --git a/.mailmap b/.mailmap > index 66ebc20666..00b50414d3 100644 > --- a/.mailmap > +++ b/.mailmap > @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vem...@intel.com> > Harini Ramakrishnan <harini.ramakrish...@microsoft.com> > Hariprasad Govindharajan <hariprasad.govindhara...@intel.com> > Harish Patil <harish.pa...@cavium.com> <harish.pa...@qlogic.com> > +Harjot Singh <harjot.si...@arm.com> > Harman Kalra <hka...@marvell.com> > Harneet Singh <harneet.si...@intel.com> > Harold Huang <baymaxhu...@gmail.com> > @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.w...@intel.com> > Yi Yang <yangy...@inspur.com> <yi.y.y...@intel.com> > Yi Zhang <zhang.y...@zte.com.cn> > Yoann Desmouceaux <ydesm...@cisco.com> > +Yoan Picchi <yoan.pic...@arm.com> > Yogesh Jangra <yogesh.jan...@intel.com> > Yogev Chaimovich <yo...@cgstowernetworks.com> > Yongjie Gu <yongjiex...@intel.com> > diff --git a/lib/hash/arch/arm/compare_signatures.h > b/lib/hash/arch/arm/compare_signatures.h > new file mode 100644 > index 0000000000..1af6ba8190 > --- /dev/null > +++ b/lib/hash/arch/arm/compare_signatures.h > @@ -0,0 +1,61 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2010-2016 Intel Corporation > + * Copyright(c) 2018-2024 Arm Limited > + */ > + > +/* > + * Arm's version uses a densely packed hitmask buffer: > + * Every bit is in use. > + */ > + > +#include <inttypes.h> > +#include <rte_common.h> > +#include <rte_vect.h> > +#include "rte_cuckoo_hash.h" > + > +#define DENSE_HASH_BULK_LOOKUP 1 > + > +static inline void > +compare_signatures_dense(uint16_t *hitmask_buffer, > + const uint16_t *prim_bucket_sigs, > + const uint16_t *sec_bucket_sigs, > + uint16_t sig, > + enum rte_hash_sig_compare_function sig_cmp_fn) > +{ > + > + static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8), > + "The hitmask must be exactly wide enough to accept the whole hitmask if > it is dense"); > + > + /* For match mask every bits indicates the match */ > + switch (sig_cmp_fn) { > +#if RTE_HASH_BUCKET_ENTRIES <= 8 > + case RTE_HASH_COMPARE_NEON: { > + uint16x8_t vmat, vsig, x; > + int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; > + uint16_t low, high; > + > + vsig = vld1q_dup_u16((uint16_t const *)&sig); > + /* Compare all signatures in the primary bucket */ > + vmat = vceqq_u16(vsig, > + vld1q_u16((uint16_t const *)prim_bucket_sigs)); > + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift); > + low = (uint16_t)(vaddvq_u16(x)); > + /* Compare all signatures in the secondary bucket */ > + vmat = vceqq_u16(vsig, > + vld1q_u16((uint16_t const *)sec_bucket_sigs)); > + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift); > + high = (uint16_t)(vaddvq_u16(x)); > + *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES; > + > + } > + break; > +#endif > + default: > + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { > + *hitmask_buffer |= > + ((sig == prim_bucket_sigs[i]) << i); > + *hitmask_buffer |= > + ((sig == sec_bucket_sigs[i]) << i) << > RTE_HASH_BUCKET_ENTRIES; > + } > + } > +} > diff --git a/lib/hash/arch/common/compare_signatures.h > b/lib/hash/arch/common/compare_signatures.h > new file mode 100644 > index 0000000000..dcf9444032 > --- /dev/null > +++ b/lib/hash/arch/common/compare_signatures.h > @@ -0,0 +1,38 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2010-2016 Intel Corporation > + * Copyright(c) 2018-2024 Arm Limited > + */ > + > +/* > + * The generic version could use either a dense or sparsely packed hitmask > buffer, > + * but the dense one is slightly faster. > + */ > + > +#include <inttypes.h> > +#include <rte_common.h> > +#include <rte_vect.h> > +#include "rte_cuckoo_hash.h" > + > +#define DENSE_HASH_BULK_LOOKUP 1 > + > +static inline void > +compare_signatures_dense(uint16_t *hitmask_buffer, > + const uint16_t *prim_bucket_sigs, > + const uint16_t *sec_bucket_sigs, > + uint16_t sig, > + enum rte_hash_sig_compare_function sig_cmp_fn) > +{ > + (void) sig_cmp_fn; > + > + static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8), > + "The hitmask must be exactly wide enough to accept the whole hitmask if > it is dense"); > + > + /* For match mask every bits indicates the match */ > + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { > + *hitmask_buffer |= > + ((sig == prim_bucket_sigs[i]) << i); > + *hitmask_buffer |= > + ((sig == sec_bucket_sigs[i]) << i) << > RTE_HASH_BUCKET_ENTRIES; > + } > + > +} Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way. One question I have - does it mean that now for x86 we always use 'sparse' while for all other ARM and non-ARM platforms we switch to 'dense'? > diff --git a/lib/hash/arch/x86/compare_signatures.h > b/lib/hash/arch/x86/compare_signatures.h > new file mode 100644 > index 0000000000..7eec499e1f > --- /dev/null > +++ b/lib/hash/arch/x86/compare_signatures.h > @@ -0,0 +1,53 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2010-2016 Intel Corporation > + * Copyright(c) 2018-2024 Arm Limited > + */ > + > +/* > + * x86's version uses a sparsely packed hitmask buffer: > + * Every other bit is padding. > + */ > + > +#include <inttypes.h> > +#include <rte_common.h> > +#include <rte_vect.h> > +#include "rte_cuckoo_hash.h" > + > +#define DENSE_HASH_BULK_LOOKUP 0 > + > +static inline void > +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t > *sec_hash_matches, > + const struct rte_hash_bucket *prim_bkt, > + const struct rte_hash_bucket *sec_bkt, > + uint16_t sig, > + enum rte_hash_sig_compare_function sig_cmp_fn) > +{ > + /* For match mask the first bit of every two bits indicates the match */ > + switch (sig_cmp_fn) { > +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8 > + case RTE_HASH_COMPARE_SSE: > + /* Compare all signatures in the bucket */ > + *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( > + _mm_load_si128( > + (__m128i const *)prim_bkt->sig_current), > + _mm_set1_epi16(sig))); > + /* Extract the even-index bits only */ > + *prim_hash_matches &= 0x5555; > + /* Compare all signatures in the bucket */ > + *sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( > + _mm_load_si128( > + (__m128i const *)sec_bkt->sig_current), > + _mm_set1_epi16(sig))); > + /* Extract the even-index bits only */ > + *sec_hash_matches &= 0x5555; > + break; > +#endif /* defined(__SSE2__) */ > + default: > + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { > + *prim_hash_matches |= > + ((sig == prim_bkt->sig_current[i]) << (i << 1)); > + *sec_hash_matches |= > + ((sig == sec_bkt->sig_current[i]) << (i << 1)); > + } > + } > +}