Hello Yoan, On Wed, Jul 3, 2024 at 7:13 PM Yoan Picchi <yoan.pic...@arm.com> wrote: > > Current hitmask includes padding due to Intel's SIMD > implementation detail. This patch allows non Intel SIMD > implementations to benefit from a dense hitmask. > In addition, the new dense hitmask interweave the primary > and secondary matches which allow a better cache usage and > enable future improvements for the SIMD implementations > The default non SIMD path now use this dense mask. > > Signed-off-by: Yoan Picchi <yoan.pic...@arm.com> > Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com> > Reviewed-by: Nathan Brown <nathan.br...@arm.com>
This patch does too many things at the same time. There is code movement and behavior modifications all mixed in. As there was still no review from the lib maintainer... I am going a bit more in depth this time. Please split this patch to make it less hard to understand. I can see the need for at least one patch for isolating the change on sig_cmp_fn from the exposed API, then one patch for moving the code to per arch headers with *no behavior change*, and one patch for introducing/switching to "dense hitmask". More comments below. > --- > .mailmap | 1 + > lib/hash/compare_signatures_arm_pvt.h | 60 +++++++ > lib/hash/compare_signatures_generic_pvt.h | 37 +++++ > lib/hash/compare_signatures_x86_pvt.h | 49 ++++++ > lib/hash/hash_sig_cmp_func_pvt.h | 20 +++ > lib/hash/rte_cuckoo_hash.c | 190 +++++++++++----------- > lib/hash/rte_cuckoo_hash.h | 10 +- > 7 files changed, 267 insertions(+), 100 deletions(-) > create mode 100644 lib/hash/compare_signatures_arm_pvt.h > create mode 100644 lib/hash/compare_signatures_generic_pvt.h > create mode 100644 lib/hash/compare_signatures_x86_pvt.h > create mode 100644 lib/hash/hash_sig_cmp_func_pvt.h > > diff --git a/.mailmap b/.mailmap > index f76037213d..ec525981fe 100644 > --- a/.mailmap > +++ b/.mailmap > @@ -1661,6 +1661,7 @@ Yixue Wang <yixue.w...@intel.com> > Yi Yang <yangy...@inspur.com> <yi.y.y...@intel.com> > Yi Zhang <zhang.y...@zte.com.cn> > Yoann Desmouceaux <ydesm...@cisco.com> > +Yoan Picchi <yoan.pic...@arm.com> > Yogesh Jangra <yogesh.jan...@intel.com> > Yogev Chaimovich <yo...@cgstowernetworks.com> > Yongjie Gu <yongjiex...@intel.com> > diff --git a/lib/hash/compare_signatures_arm_pvt.h > b/lib/hash/compare_signatures_arm_pvt.h > new file mode 100644 > index 0000000000..e83bae9912 > --- /dev/null > +++ b/lib/hash/compare_signatures_arm_pvt.h I guess pvt stands for private. No need for such suffix, this header won't be exported in any case. > @@ -0,0 +1,60 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2010-2016 Intel Corporation > + * Copyright(c) 2018-2024 Arm Limited > + */ > + > +/* > + * Arm's version uses a densely packed hitmask buffer: > + * Every bit is in use. > + */ Please put a header guard. #ifndef <UPPERCASE_HEADER_NAME>_H #define <UPPERCASE_HEADER_NAME>_H > + > +#include <inttypes.h> > +#include <rte_common.h> > +#include <rte_vect.h> > + > +#include "rte_cuckoo_hash.h" > +#include "hash_sig_cmp_func_pvt.h" > + > +#define DENSE_HASH_BULK_LOOKUP 1 > + > +static inline void > +compare_signatures_dense(uint16_t *hitmask_buffer, > + const uint16_t *prim_bucket_sigs, > + const uint16_t *sec_bucket_sigs, > + uint16_t sig, > + enum rte_hash_sig_compare_function sig_cmp_fn) > +{ > + > + static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES > / 8), > + "hitmask_buffer must be wide enough to fit a dense hitmask"); > + > + /* For match mask every bits indicates the match */ > + switch (sig_cmp_fn) { > +#if RTE_HASH_BUCKET_ENTRIES <= 8 > + case RTE_HASH_COMPARE_NEON: { > + uint16x8_t vmat, vsig, x; > + int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; > + uint16_t low, high; > + > + vsig = vld1q_dup_u16((uint16_t const *)&sig); > + /* Compare all signatures in the primary bucket */ > + vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const > *)prim_bucket_sigs)); > + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift); > + low = (uint16_t)(vaddvq_u16(x)); > + /* Compare all signatures in the secondary bucket */ > + vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const > *)sec_bucket_sigs)); > + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift); > + high = (uint16_t)(vaddvq_u16(x)); > + *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES; > + > + } > + break; > +#endif > + default: > + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { > + *hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i; > + *hitmask_buffer |= > + ((sig == sec_bucket_sigs[i]) << i) << > RTE_HASH_BUCKET_ENTRIES; > + } > + } > +} IIRC, this code is copied in all three headers. It is a common scalar version, so the ARM code could simply call the "generic" implementation rather than copy/paste. [snip] > diff --git a/lib/hash/compare_signatures_x86_pvt.h > b/lib/hash/compare_signatures_x86_pvt.h > new file mode 100644 > index 0000000000..932912ba19 > --- /dev/null > +++ b/lib/hash/compare_signatures_x86_pvt.h > @@ -0,0 +1,49 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2010-2016 Intel Corporation > + * Copyright(c) 2018-2024 Arm Limited > + */ > + > +/* > + * x86's version uses a sparsely packed hitmask buffer: > + * Every other bit is padding. > + */ > + > +#include <inttypes.h> > +#include <rte_common.h> > +#include <rte_vect.h> > + > +#include "rte_cuckoo_hash.h" > +#include "hash_sig_cmp_func_pvt.h" > + > +#define DENSE_HASH_BULK_LOOKUP 0 > + > +static inline void > +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t > *sec_hash_matches, > + const struct rte_hash_bucket *prim_bkt, > + const struct rte_hash_bucket *sec_bkt, > + uint16_t sig, > + enum rte_hash_sig_compare_function sig_cmp_fn) > +{ > + /* For match mask the first bit of every two bits indicates the match > */ > + switch (sig_cmp_fn) { > +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8 The check on RTE_HASH_BUCKET_ENTRIES <= 8 seems new. It was not present in the previous implementation for SSE2, and this difference is not explained. > + case RTE_HASH_COMPARE_SSE: > + /* Compare all signatures in the bucket */ > + *prim_hash_matches = > _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128( > + (__m128i const *)prim_bkt->sig_current), > _mm_set1_epi16(sig))); > + /* Extract the even-index bits only */ > + *prim_hash_matches &= 0x5555; > + /* Compare all signatures in the bucket */ > + *sec_hash_matches = > _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128( > + (__m128i const *)sec_bkt->sig_current), > _mm_set1_epi16(sig))); > + /* Extract the even-index bits only */ > + *sec_hash_matches &= 0x5555; > + break; > +#endif /* defined(__SSE2__) */ > + default: > + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { > + *prim_hash_matches |= (sig == > prim_bkt->sig_current[i]) << (i << 1); > + *sec_hash_matches |= (sig == sec_bkt->sig_current[i]) > << (i << 1); > + } > + } > +} > diff --git a/lib/hash/hash_sig_cmp_func_pvt.h > b/lib/hash/hash_sig_cmp_func_pvt.h > new file mode 100644 > index 0000000000..d8d2fbffaf > --- /dev/null > +++ b/lib/hash/hash_sig_cmp_func_pvt.h Please rename as compare_signatures.h or maybe a simpler option is to move this enum declaration in rte_cuckoo_hash.c before including the per arch headers. > @@ -0,0 +1,20 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2024 Arm Limited > + */ > + > +#ifndef _SIG_CMP_FUNC_H_ > +#define _SIG_CMP_FUNC_H_ If keeping a header, this guard must reflect the file name. > + > +/** Enum used to select the implementation of the signature comparison > function to use /* is enough, doxygen only parses public headers. > + * eg: A system supporting SVE might want to use a NEON implementation. > + * Those may change and are for internal use only > + */ > +enum rte_hash_sig_compare_function { > + RTE_HASH_COMPARE_SCALAR = 0, > + RTE_HASH_COMPARE_SSE, > + RTE_HASH_COMPARE_NEON, > + RTE_HASH_COMPARE_SVE, > + RTE_HASH_COMPARE_NUM > +}; > + > +#endif [snip] > diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h > index a528f1d1a0..26a992419a 100644 > --- a/lib/hash/rte_cuckoo_hash.h > +++ b/lib/hash/rte_cuckoo_hash.h > @@ -134,14 +134,6 @@ struct rte_hash_key { > char key[0]; > }; > > -/* All different signature compare functions */ > -enum rte_hash_sig_compare_function { > - RTE_HASH_COMPARE_SCALAR = 0, > - RTE_HASH_COMPARE_SSE, > - RTE_HASH_COMPARE_NEON, > - RTE_HASH_COMPARE_NUM > -}; > - > /** Bucket structure */ > struct __rte_cache_aligned rte_hash_bucket { > uint16_t sig_current[RTE_HASH_BUCKET_ENTRIES]; > @@ -199,7 +191,7 @@ struct __rte_cache_aligned rte_hash { > /**< Custom function used to compare keys. */ > enum cmp_jump_table_case cmp_jump_table_idx; > /**< Indicates which compare function to use. */ > - enum rte_hash_sig_compare_function sig_cmp_fn; > + unsigned int sig_cmp_fn; >From an ABI perspective, it looks ok. We may be breaking users that would inspect this public object, but I think it is ok. In any case, put this change in a separate patch so it is more visible. > /**< Indicates which signature compare function to use. */ > uint32_t bucket_bitmask; > /**< Bitmask for getting bucket index from hash signature. */ > -- > 2.25.1 > -- David Marchand