On Mon, Nov 27, 2017 at 03:49:45PM +0800, Herbert Guan wrote: > This patch provides an option to do rte_memcpy() using 'restrict' > qualifier, which can induce GCC to do optimizations by using more > efficient instructions, providing some performance gain over memcpy() > on some AArch64 platforms/enviroments. > > The memory copy performance differs between different AArch64 > platforms. And a more recent glibc (e.g. 2.23 or later) > can provide a better memcpy() performance compared to old glibc > versions. It's always suggested to use a more recent glibc if > possible, from which the entire system can get benefit. If for some > reason an old glibc has to be used, this patch is provided for an > alternative. > > This implementation can improve memory copy on some AArch64 > platforms, when an old glibc (e.g. 2.19, 2.17...) is being used. > It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY" > defined to activate. It's not always proving better performance > than memcpy() so users need to run DPDK unit test > "memcpy_perf_autotest" and customize parameters in "customization > section" in rte_memcpy_64.h for best performance. > > Compiler version will also impact the rte_memcpy() performance. > It's observed on some platforms and with the same code, GCC 7.2.0 > compiled binary can provide better performance than GCC 4.8.5. It's > suggested to use GCC 5.4.0 or later. > > Signed-off-by: Herbert Guan <herbert.g...@arm.com> > --- > .../common/include/arch/arm/rte_memcpy_64.h | 193 > +++++++++++++++++++++ > 1 file changed, 193 insertions(+) > > diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h > b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h > index b80d8ba..1f42b3c 100644 > --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h > +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h > @@ -42,6 +42,197 @@ > > #include "generic/rte_memcpy.h" > > +#ifdef RTE_ARCH_ARM64_MEMCPY
There is an existing flag for arm32 to enable neon based memcpy RTE_ARCH_ARM_NEON_MEMCPY we could reuse that here as restrict does the same. > +#include <rte_common.h> > +#include <rte_branch_prediction.h> > + > +/******************************************************************************* > + * The memory copy performance differs on different AArch64 > micro-architectures. > + * And the most recent glibc (e.g. 2.23 or later) can provide a better > memcpy() > + * performance compared to old glibc versions. It's always suggested to use a > + * more recent glibc if possible, from which the entire system can get > benefit. > + * > + * This implementation improves memory copy on some aarch64 > micro-architectures, > + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by > + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not > + * always providing better performance than memcpy() so users need to run > unit > + * test "memcpy_perf_autotest" and customize parameters in customization > section > + * below for best performance. > + * > + * Compiler version will also impact the rte_memcpy() performance. It's > observed > + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can > + * provide better performance than GCC 4.8.5 compiled binaries. > + > ******************************************************************************/ > + > +/************************************** > + * Beginning of customization section > + **************************************/ > +#define ALIGNMENT_MASK 0x0F > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN > +// Only src unalignment will be treaed as unaligned copy > +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK) We can use existing `rte_is_aligned` function instead. > +#else > +// Both dst and src unalignment will be treated as unaligned copy > +#define IS_UNALIGNED_COPY(dst, src) \ > + (((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK) > +#endif > + > + > +// If copy size is larger than threshold, memcpy() will be used. > +// Run "memcpy_perf_autotest" to determine the proper threshold. > +#define ALIGNED_THRESHOLD ((size_t)(0xffffffff)) > +#define UNALIGNED_THRESHOLD ((size_t)(0xffffffff)) > + > + > +/************************************** > + * End of customization section > + **************************************/ > +#ifdef RTE_TOOLCHAIN_GCC > +#if (GCC_VERSION < 50400) > +#warning "The GCC version is quite old, which may result in sub-optimal \ > +performance of the compiled code. It is suggested that at least GCC 5.4.0 \ > +be used." > +#endif > +#endif > + > +static inline void __attribute__ ((__always_inline__)) use __rte_always_inline instead. > +rte_mov16(uint8_t *restrict dst, const uint8_t *restrict src) > +{ > + __int128 * restrict dst128 = (__int128 * restrict)dst; > + const __int128 * restrict src128 = (const __int128 * restrict)src; > + *dst128 = *src128; > +} > + > +static inline void __attribute__ ((__always_inline__)) > +rte_mov64(uint8_t *restrict dst, const uint8_t *restrict src) > +{ > + __int128 * restrict dst128 = (__int128 * restrict)dst; ISO C does not support ‘__int128’ please use '__int128_t' or '__uint128_t'. > + const __int128 * restrict src128 = (const __int128 * restrict)src; > + dst128[0] = src128[0]; > + dst128[1] = src128[1]; > + dst128[2] = src128[2]; > + dst128[3] = src128[3]; > +} > + <snip> Would doing this still benifit if size is compile time constant? i.e. when __builtin_constant_p(n) is true. > + > +static inline void *__attribute__ ((__always_inline__)) > +rte_memcpy(void *restrict dst, const void *restrict src, size_t n) > +{ > + if (n < 16) { > + rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n); > + return dst; > + } > + if (n < 64) { > + rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n); > + return dst; > + } > + __builtin_prefetch(src, 0, 0); > + __builtin_prefetch(dst, 1, 0); > + if (likely( > + (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD) > + || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD) > + )) { > + rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n); > + return dst; > + } else > + return memcpy(dst, src, n); > +} > + > + > +#else > static inline void > rte_mov16(uint8_t *dst, const uint8_t *src) > { > @@ -80,6 +271,8 @@ > > #define rte_memcpy(d, s, n) memcpy((d), (s), (n)) > > +#endif > + > #ifdef __cplusplus > } > #endif > -- > 1.8.3.1 > Regards, Pavan.