On Tue, Apr 30, 2024 at 6:28 PM Yoan Picchi <yoan.pic...@arm.com> wrote: > > - Implemented SVE code for comparing signatures in bulk lookup. > - New SVE code is ~5% slower than optimized NEON for N2 processor for > 128b vectors. > > Signed-off-by: Yoan Picchi <yoan.pic...@arm.com> > Signed-off-by: Harjot Singh <harjot.si...@arm.com> > Reviewed-by: Nathan Brown <nathan.br...@arm.com> > Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com> > --- > lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++ > lib/hash/rte_cuckoo_hash.c | 7 +++- > lib/hash/rte_cuckoo_hash.h | 1 + > 3 files changed, 65 insertions(+), 1 deletion(-) > > diff --git a/lib/hash/arch/arm/compare_signatures.h > b/lib/hash/arch/arm/compare_signatures.h > index 72bd171484..b4b4cf04e9 100644 > --- a/lib/hash/arch/arm/compare_signatures.h > +++ b/lib/hash/arch/arm/compare_signatures.h > @@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer, > *hitmask_buffer = vaddvq_u16(hit2); > } > break; > +#endif > +#if defined(RTE_HAS_SVE_ACLE) > + case RTE_HASH_COMPARE_SVE: { > + svuint16_t vsign, shift, sv_matches; > + svbool_t pred, match, bucket_wide_pred; > + int i = 0; > + uint64_t vl = svcnth(); > + > + vsign = svdup_u16(sig); > + shift = svindex_u16(0, 1); > + > + if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && > RTE_HASH_BUCKET_ENTRIES <= 8) { > + svuint16_t primary_array_vect, secondary_array_vect; > + bucket_wide_pred = svwhilelt_b16(0, > RTE_HASH_BUCKET_ENTRIES); > + primary_array_vect = svld1_u16(bucket_wide_pred, > prim_bucket_sigs); > + secondary_array_vect = svld1_u16(bucket_wide_pred, > sec_bucket_sigs); > + > + /* We merged the two vectors so we can do both > comparisons at once */ > + primary_array_vect = svsplice_u16(bucket_wide_pred, > + primary_array_vect, > + secondary_array_vect); > + pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES); > + > + /* Compare all signatures in the buckets */ > + match = svcmpeq_u16(pred, vsign, primary_array_vect); > + if 
(svptest_any(svptrue_b16(), match)) { > + sv_matches = svdup_u16(1); > + sv_matches = svlsl_u16_z(match, sv_matches, > shift); > + *hitmask_buffer = svorv_u16(svptrue_b16(), > sv_matches); > + } > + } else { > + do { > + pred = svwhilelt_b16(i, > RTE_HASH_BUCKET_ENTRIES); > + uint16_t lower_half = 0; > + uint16_t upper_half = 0; > + /* Compare all signatures in the primary > bucket */ > + match = svcmpeq_u16(pred, vsign, > svld1_u16(pred, > + > &prim_bucket_sigs[i])); > + if (svptest_any(svptrue_b16(), match)) { > + sv_matches = svdup_u16(1); > + sv_matches = svlsl_u16_z(match, > sv_matches, shift); > + lower_half = svorv_u16(svptrue_b16(), > sv_matches); > + } > + /* Compare all signatures in the secondary > bucket */ > + match = svcmpeq_u16(pred, vsign, > svld1_u16(pred, > + &sec_bucket_sigs[i])); > + if (svptest_any(svptrue_b16(), match)) { > + sv_matches = svdup_u16(1); > + sv_matches = svlsl_u16_z(match, > sv_matches, shift); > + upper_half = svorv_u16(svptrue_b16(), > sv_matches) > + << RTE_HASH_BUCKET_ENTRIES; > + } > + hitmask_buffer[i / 8] = upper_half | > lower_half; > + i += vl; > + } while (i < RTE_HASH_BUCKET_ENTRIES); > + } > + } > + break; > #endif > default: > for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { > diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c > index 0697743cdf..75f555ba2c 100644 > --- a/lib/hash/rte_cuckoo_hash.c > +++ b/lib/hash/rte_cuckoo_hash.c > @@ -450,8 +450,13 @@ rte_hash_create(const struct rte_hash_parameters *params) > h->sig_cmp_fn = RTE_HASH_COMPARE_SSE; > else > #elif defined(RTE_ARCH_ARM64) > - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) { > h->sig_cmp_fn = RTE_HASH_COMPARE_NEON; > +#if defined(RTE_HAS_SVE_ACLE) > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE)) > + h->sig_cmp_fn = RTE_HASH_COMPARE_SVE; > +#endif > + } > else > #endif > h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; > diff --git a/lib/hash/rte_cuckoo_hash.h 
b/lib/hash/rte_cuckoo_hash.h > index a528f1d1a0..01ad01c258 100644 > --- a/lib/hash/rte_cuckoo_hash.h > +++ b/lib/hash/rte_cuckoo_hash.h > @@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function { > RTE_HASH_COMPARE_SCALAR = 0, > RTE_HASH_COMPARE_SSE, > RTE_HASH_COMPARE_NEON, > + RTE_HASH_COMPARE_SVE, > RTE_HASH_COMPARE_NUM > };
I am surprised the ABI check does not complain about this change. RTE_HASH_COMPARE_NUM is not used, and knowing the number of compare function implementations should not be of interest to an application. But it still seems like an ABI breakage to me. RTE_HASH_COMPARE_NUM can be removed in v24.11. And ideally, sig_cmp_fn should be made opaque (or moved to an opaque struct out of the rte_hash public struct). -- David Marchand