[AMD Official Use Only - AMD Internal Distribution Only]

Hi All,
I was occupied with some other work in Dec 2025. Let me look at this over the weekend.

> -----Original Message-----
> From: Morten Brørup <[email protected]>
> Sent: Saturday, January 3, 2026 11:23 PM
> To: [email protected]; Bruce Richardson <[email protected]>; Konstantin Ananyev <[email protected]>; Varghese, Vipin <[email protected]>
> Cc: Stephen Hemminger <[email protected]>
> Subject: RE: [PATCH v5] eal/x86: optimize memcpy of small sizes
>
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> PING for review.
>
> Regardless of whether you - for good and valid reasons - generally prefer memcpy() over rte_memcpy(), rte_memcpy() is not deprecated or being phased out, so improvements should be allowed.
>
>
> > From: Morten Brørup [mailto:[email protected]]
> > Sent: Monday, 1 December 2025 16.55
> >
> > eal/x86: optimize memcpy of small sizes
> >
> > The implementation for copying up to 64 bytes does not depend on
> > address alignment with the size of the CPU's vector registers, so the
> > code handling this was moved from the various implementations to the
> > common function.
> >
> > Furthermore, the function for copying less than 16 bytes was replaced
> > with a smarter implementation using fewer branches and potentially
> > fewer load/store operations.
> > This function was also extended to handle copying of up to 16 bytes,
> > instead of up to 15 bytes. This small extension reduces the code path
> > for copying two pointers.
> >
> > These changes provide two benefits:
> > 1. The memory footprint of the copy function is reduced.
> >    Previously there were two instances of the compiled code to copy up to
> >    64 bytes, one in the "aligned" code path, and one in the "generic" code
> >    path. Now there is only one instance, in the "common" code path.
> > 2. The performance for copying up to 64 bytes is improved.
> >    The memcpy performance test shows cache-to-cache copying of up to 32
> >    bytes now typically only takes 2 cycles (4 cycles for 64 bytes) versus
> >    ca. 6.5 cycles before this patch.
> >
> > And finally, the missing implementation of rte_mov48() was added.
> >
> > Signed-off-by: Morten Brørup <[email protected]>
> > ---
> > v5:
> > * Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> >   It was slower.
> > * Improved some comments. (Konstantin Ananyev)
> > * Moved the size range 17..32 inside the size <= 64 branch, so when
> >   building for SSE, the generated code can start copying the first
> >   16 bytes before comparing if the size is greater than 32 or not.
> > * Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
> > v4:
> > * Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> > v3:
> > * Fixed typo in comment.
> > v2:
> > * Updated patch title to reflect that the performance is improved.
> > * Use the design pattern of two overlapping stores for small copies too.
> > * Expanded first branch from size < 16 to size <= 16.
> > * Handle more build time constant copy sizes.
> > ---
> >  lib/eal/x86/include/rte_memcpy.h | 346 +++++++++++++++++--------------
> >  1 file changed, 186 insertions(+), 160 deletions(-)
> >
> > diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> > index 46d34b8081..04d1a474d1 100644
> > --- a/lib/eal/x86/include/rte_memcpy.h
> > +++ b/lib/eal/x86/include/rte_memcpy.h
> > @@ -55,52 +55,6 @@ extern "C" {
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n);
> >
> > -/**
> > - * Copy bytes from one location to another,
> > - * locations should not overlap.
> > - * Use with n <= 15.
> > - */
> > -static __rte_always_inline void *
> > -rte_mov15_or_less(void *dst, const void *src, size_t n)
> > -{
> > -	/**
> > -	 * Use the following structs to avoid violating C standard
> > -	 * alignment requirements and to avoid strict aliasing bugs
> > -	 */
> > -	struct __rte_packed_begin rte_uint64_alias {
> > -		uint64_t val;
> > -	} __rte_packed_end __rte_may_alias;
> > -	struct __rte_packed_begin rte_uint32_alias {
> > -		uint32_t val;
> > -	} __rte_packed_end __rte_may_alias;
> > -	struct __rte_packed_begin rte_uint16_alias {
> > -		uint16_t val;
> > -	} __rte_packed_end __rte_may_alias;
> > -
> > -	void *ret = dst;
> > -	if (n & 8) {
> > -		((struct rte_uint64_alias *)dst)->val =
> > -			((const struct rte_uint64_alias *)src)->val;
> > -		src = (const uint64_t *)src + 1;
> > -		dst = (uint64_t *)dst + 1;
> > -	}
> > -	if (n & 4) {
> > -		((struct rte_uint32_alias *)dst)->val =
> > -			((const struct rte_uint32_alias *)src)->val;
> > -		src = (const uint32_t *)src + 1;
> > -		dst = (uint32_t *)dst + 1;
> > -	}
> > -	if (n & 2) {
> > -		((struct rte_uint16_alias *)dst)->val =
> > -			((const struct rte_uint16_alias *)src)->val;
> > -		src = (const uint16_t *)src + 1;
> > -		dst = (uint16_t *)dst + 1;
> > -	}
> > -	if (n & 1)
> > -		*(uint8_t *)dst = *(const uint8_t *)src;
> > -	return ret;
> > -}
> > -
> >  /**
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >  #endif
> >  }
> >
> > +/**
> > + * Copy 48 bytes from one location to another,
> > + * locations should not overlap.
> > + */
> > +static __rte_always_inline void
> > +rte_mov48(uint8_t *dst, const uint8_t *src)
> > +{
> > +#if defined RTE_MEMCPY_AVX
> > +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
> > +#else /* SSE implementation */
> > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +#endif
> > +}
> > +
> >  /**
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -172,6 +143,137 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
> >  	rte_mov128(dst + 1 * 128, src + 1 * 128);
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n <= 16.
> > + *
> > + * Note: Copying uninitialized memory is perfectly acceptable.
> > + * Using e.g. memcpy(dst, src, 8) instead of
> > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > + * avoids compiler warnings about source data may be uninitialized
> > + * [-Wmaybe-uninitialized].
> > + */
> > +static __rte_always_inline void *
> > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 2)
> > +			return memcpy(dst, src, 2);
> > +		if (n == 4)
> > +			return memcpy(dst, src, 4);
> > +		if (n == 6) /* 4 + 2 */
> > +			return memcpy(dst, src, 6);
> > +		if (n == 8)
> > +			return memcpy(dst, src, 8);
> > +		if (n == 10) /* 8 + 2 */
> > +			return memcpy(dst, src, 10);
> > +		if (n == 12) /* 8 + 4 */
> > +			return memcpy(dst, src, 12);
> > +		if (n == 16) {
> > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * Note: Using "n & X" generates 3-byte "test" instructions,
> > +	 * instead of "n >= X", which would generate 4-byte "cmp" instructions.
> > +	 */
> > +	if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18. */
> > +		/* Copy 8 ~ 16 bytes. */
> > +		memcpy(dst, src, 8);
> > +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
> > +	} else if (n & 0x4) {
> > +		/* Copy 4 ~ 7 bytes. */
> > +		memcpy(dst, src, 4);
> > +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
> > +	} else if (n & 0x2) {
> > +		/* Copy 2 ~ 3 bytes. */
> > +		memcpy(dst, src, 2);
> > +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
> > +	} else if (n & 0x1) {
> > +		/* Copy 1 byte. */
> > +		memcpy(dst, src, 1);
> > +	}
> > +	return dst;
> > +}
> > +
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with 17 (or 16) < n <= 32.
> > + */
> > +static __rte_always_inline void *
> > +rte_mov17_to_32(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 16) {
> > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +		if (n == 18) /* 16 + 2 */
> > +			return memcpy(dst, src, 18);
> > +		if (n == 20) /* 16 + 4 */
> > +			return memcpy(dst, src, 20);
> > +		if (n == 24) /* 16 + 8 */
> > +			return memcpy(dst, src, 24);
> > +		if (n == 32) {
> > +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +	}
> > +
> > +	/* Copy 17 (or 16) ~ 32 bytes. */
> > +	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +	return dst;
> > +}
> > +
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with 33 (or 32) < n <= 64.
> > + */
> > +static __rte_always_inline void *
> > +rte_mov33_to_64(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 32) {
> > +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +		if (n == 36) /* 32 + 4 */
> > +			return memcpy(dst, src, 36);
> > +		if (n == 40) /* 32 + 8 */
> > +			return memcpy(dst, src, 40);
> > +		if (n == 48) {
> > +			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +		if (n == 64) {
> > +			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +	}
> > +
> > +	/* Copy 33 (or 32) ~ 64 bytes. */
> > +#if defined RTE_MEMCPY_AVX
> > +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
> > +#else /* SSE implementation */
> > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	if (n > 48)
> > +		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +#endif
> > +	return dst;
> > +}
> > +
> >  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> >
> >  /**
> > @@ -232,45 +334,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >  	}
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	void *ret = dst;
> >  	size_t dstofss;
> >  	size_t bits;
> >
> > -	/**
> > -	 * Copy less than 16 bytes
> > -	 */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> > -	if (__rte_constant(n) && n == 32) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > -				(const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> > -	if (__rte_constant(n) && n == 64) {
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > -				(const uint8_t *)src - 32 + n);
> > -		return ret;
> > -	}
> >  	if (n <= 512) {
> >  		if (n >= 256) {
> >  			n -= 256;
> > @@ -381,41 +459,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >  	}
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	void *ret = dst;
> >  	size_t dstofss;
> >  	size_t bits;
> >
> > -	/**
> > -	 * Copy less than 16 bytes
> > -	 */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 256 bytes
> >  	 */
> > -	if (__rte_constant(n) && n == 32) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > -				(const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > -				(const uint8_t *)src - 32 + n);
> > -		return ret;
> > -	}
> >  	if (n <= 256) {
> >  		if (n >= 128) {
> >  			n -= 128;
> > @@ -573,38 +631,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	}						\
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> >  	void *ret = dst;
> >  	size_t dstofss;
> >  	size_t srcofs;
> >
> > -	/**
> > -	 * Copy less than 16 bytes
> > -	 */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		if (n > 48)
> > -			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> >  	if (n <= 128) {
> >  		goto COPY_BLOCK_128_BACK15;
> >  	}
> > @@ -696,44 +738,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >
> >  #endif /* __AVX512F__ */
> >
> > +/**
> > + * Copy bytes from one vector register size aligned location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> > +rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	void *ret = dst;
> >
> > -	/* Copy size < 16 bytes */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> > -	/* Copy 16 <= size <= 32 bytes */
> > -	if (__rte_constant(n) && n == 32) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > -				(const uint8_t *)src - 16 + n);
> > -
> > -		return ret;
> > -	}
> > -
> > -	/* Copy 32 < size <= 64 bytes */
> > -	if (__rte_constant(n) && n == 64) {
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > -				(const uint8_t *)src - 32 + n);
> > -
> > -		return ret;
> > -	}
> > -
> >  	/* Copy 64 bytes blocks */
> >  	for (; n > 64; n -= 64) {
> >  		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > @@ -751,10 +765,22 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n)
> >  {
> > +	/* Common implementation for size <= 64 bytes. */
> > +	if (n <= 16)
> > +		return rte_mov16_or_less(dst, src, n);
> > +	if (n <= 64) {
> > +		/* Copy 17 ~ 64 bytes using vector instructions. */
> > +		if (n <= 32)
> > +			return rte_mov17_to_32(dst, src, n);
> > +		else
> > +			return rte_mov33_to_64(dst, src, n);
> > +	}
> > +
> > +	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
> >  	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > -		return rte_memcpy_aligned(dst, src, n);
> > +		return rte_memcpy_aligned_more_than_64(dst, src, n);
> >  	else
> > -		return rte_memcpy_generic(dst, src, n);
> > +		return rte_memcpy_generic_more_than_64(dst, src, n);
> >  }
> >
> >  #undef ALIGNMENT_MASK
> > --
> > 2.43.0
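
For readers skimming the thread: the core trick in the small-copy helpers above is the "two overlapping stores" pattern mentioned in the v2 changelog. A minimal standalone sketch of just that pattern (the function name and the 8..16 size range label are illustrative, not code from the patch):

#include <stdint.h>
#include <string.h>

/*
 * Sketch only, not the patch code: copy 8..16 bytes with exactly two
 * fixed-size 8-byte copies, one anchored at the start and one anchored
 * at the end. The two regions may overlap, which replaces the per-bit
 * branching of the old rte_mov15_or_less() with branch-free code.
 */
static inline void
copy_8_to_16_sketch(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, 8);
	memcpy((uint8_t *)dst + n - 8, (const uint8_t *)src + n - 8, 8);
}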
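
The "n & 0x18" branch test in rte_mov16_or_less() depends on n already being at most 16 at that point. A quick self-contained check of that equivalence (the test harness itself is illustrative only):

#include <assert.h>
#include <stddef.h>

int
main(void)
{
	/*
	 * For 0 <= n <= 16, (n & 0x18) is non-zero exactly when n >= 8:
	 * bit 3 covers 8..15 and bit 4 covers 16.
	 */
	for (size_t n = 0; n <= 16; n++)
		assert(((n & 0x18) != 0) == (n >= 8));
	return 0;
}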
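
And a usage sketch of how rte_memcpy() dispatches after this patch; the sizes are arbitrary examples and the sketch assumes a DPDK build environment providing rte_memcpy.h:

#include <rte_memcpy.h>

void
copy_examples(uint8_t *dst, const uint8_t *src)
{
	rte_memcpy(dst, src, 12);   /* n <= 16: handled by rte_mov16_or_less() */
	rte_memcpy(dst, src, 24);   /* 17..32: handled by rte_mov17_to_32() */
	rte_memcpy(dst, src, 50);   /* 33..64: handled by rte_mov33_to_64() */
	rte_memcpy(dst, src, 200);  /* n > 64: aligned or generic path, chosen via ALIGNMENT_MASK */
}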

