[AMD Official Use Only - AMD Internal Distribution Only]

Hi All,
I was occupied with some other work in Dec 2025. Let me look at this over the weekend.

> -----Original Message-----
> From: Morten Brørup <[email protected]>
> Sent: Saturday, January 3, 2026 11:23 PM
> To: [email protected]; Bruce Richardson <[email protected]>; Konstantin Ananyev <[email protected]>; Varghese, Vipin <[email protected]>
> Cc: Stephen Hemminger <[email protected]>
> Subject: RE: [PATCH v5] eal/x86: optimize memcpy of small sizes
>
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> PING for review.
>
> Regardless of whether you - for good and valid reasons - generally prefer memcpy() over rte_memcpy(), rte_memcpy() is not deprecated or being phased out, so improvements should be allowed.
>
>
> > From: Morten Brørup [mailto:[email protected]]
> > Sent: Monday, 1 December 2025 16.55
> >
> > eal/x86: optimize memcpy of small sizes
> >
> > The implementation for copying up to 64 bytes does not depend on
> > address alignment with the size of the CPU's vector registers, so the
> > code handling this was moved from the various implementations to the
> > common function.
> >
> > Furthermore, the function for copying less than 16 bytes was replaced
> > with a smarter implementation using fewer branches and potentially
> > fewer load/store operations.
> > This function was also extended to handle copying of up to 16 bytes,
> > instead of up to 15 bytes. This small extension reduces the code path
> > for copying two pointers.
> >
> > These changes provide two benefits:
> > 1. The memory footprint of the copy function is reduced.
> >    Previously there were two instances of the compiled code to copy up to
> >    64 bytes, one in the "aligned" code path, and one in the "generic" code
> >    path. Now there is only one instance, in the "common" code path.
> > 2. The performance for copying up to 64 bytes is improved.
> >    The memcpy performance test shows cache-to-cache copying of up to 32
> >    bytes now typically only takes 2 cycles (4 cycles for 64 bytes) versus
> >    ca. 6.5 cycles before this patch.
> >
> > And finally, the missing implementation of rte_mov48() was added.
> >
> > Signed-off-by: Morten Brørup <[email protected]>
> > ---
> > v5:
> > * Reverted v4: Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> >   It was slower.
> > * Improved some comments. (Konstantin Ananyev)
> > * Moved the size range 17..32 inside the size <= 64 branch, so when
> >   building for SSE, the generated code can start copying the first
> >   16 bytes before comparing if the size is greater than 32 or not.
> > * Just require RTE_MEMCPY_AVX for using rte_mov32() in rte_mov33_to_64().
> > v4:
> > * Replace SSE2 _mm_loadu_si128() with SSE3 _mm_lddqu_si128().
> > v3:
> > * Fixed typo in comment.
> > v2:
> > * Updated patch title to reflect that the performance is improved.
> > * Use the design pattern of two overlapping stores for small copies too.
> > * Expanded first branch from size < 16 to size <= 16.
> > * Handle more build time constant copy sizes.
> > ---
> >  lib/eal/x86/include/rte_memcpy.h | 346 +++++++++++++++++--------------
> >  1 file changed, 186 insertions(+), 160 deletions(-)
> >
> > diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> > index 46d34b8081..04d1a474d1 100644
> > --- a/lib/eal/x86/include/rte_memcpy.h
> > +++ b/lib/eal/x86/include/rte_memcpy.h
> > @@ -55,52 +55,6 @@ extern "C" {
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n);
> >
> > -/**
> > - * Copy bytes from one location to another,
> > - * locations should not overlap.
> > - * Use with n <= 15.
> > - */
> > -static __rte_always_inline void *
> > -rte_mov15_or_less(void *dst, const void *src, size_t n)
> > -{
> > -	/**
> > -	 * Use the following structs to avoid violating C standard
> > -	 * alignment requirements and to avoid strict aliasing bugs
> > -	 */
> > -	struct __rte_packed_begin rte_uint64_alias {
> > -		uint64_t val;
> > -	} __rte_packed_end __rte_may_alias;
> > -	struct __rte_packed_begin rte_uint32_alias {
> > -		uint32_t val;
> > -	} __rte_packed_end __rte_may_alias;
> > -	struct __rte_packed_begin rte_uint16_alias {
> > -		uint16_t val;
> > -	} __rte_packed_end __rte_may_alias;
> > -
> > -	void *ret = dst;
> > -	if (n & 8) {
> > -		((struct rte_uint64_alias *)dst)->val =
> > -			((const struct rte_uint64_alias *)src)->val;
> > -		src = (const uint64_t *)src + 1;
> > -		dst = (uint64_t *)dst + 1;
> > -	}
> > -	if (n & 4) {
> > -		((struct rte_uint32_alias *)dst)->val =
> > -			((const struct rte_uint32_alias *)src)->val;
> > -		src = (const uint32_t *)src + 1;
> > -		dst = (uint32_t *)dst + 1;
> > -	}
> > -	if (n & 2) {
> > -		((struct rte_uint16_alias *)dst)->val =
> > -			((const struct rte_uint16_alias *)src)->val;
> > -		src = (const uint16_t *)src + 1;
> > -		dst = (uint16_t *)dst + 1;
> > -	}
> > -	if (n & 1)
> > -		*(uint8_t *)dst = *(const uint8_t *)src;
> > -	return ret;
> > -}
> > -
> >  /**
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -132,6 +86,23 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >  #endif
> >  }
> >
> > +/**
> > + * Copy 48 bytes from one location to another,
> > + * locations should not overlap.
> > + */
> > +static __rte_always_inline void
> > +rte_mov48(uint8_t *dst, const uint8_t *src)
> > +{
> > +#if defined RTE_MEMCPY_AVX
> > +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +	rte_mov32((uint8_t *)dst - 32 + 48, (const uint8_t *)src - 32 + 48);
> > +#else /* SSE implementation */
> > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +#endif
> > +}
> > +
> >  /**
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -172,6 +143,137 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
> >  	rte_mov128(dst + 1 * 128, src + 1 * 128);
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n <= 16.
> > + *
> > + * Note: Copying uninitialized memory is perfectly acceptable.
> > + * Using e.g. memcpy(dst, src, 8) instead of
> > + * *(unaligned_uint64_t*) = *(const unaligned_uint64_t *)src
> > + * avoids compiler warnings about source data may be uninitialized
> > + * [-Wmaybe-uninitialized].
> > + */
> > +static __rte_always_inline void *
> > +rte_mov16_or_less(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 2)
> > +			return memcpy(dst, src, 2);
> > +		if (n == 4)
> > +			return memcpy(dst, src, 4);
> > +		if (n == 6) /* 4 + 2 */
> > +			return memcpy(dst, src, 6);
> > +		if (n == 8)
> > +			return memcpy(dst, src, 8);
> > +		if (n == 10) /* 8 + 2 */
> > +			return memcpy(dst, src, 10);
> > +		if (n == 12) /* 8 + 4 */
> > +			return memcpy(dst, src, 12);
> > +		if (n == 16) {
> > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * Note: Using "n & X" generates 3-byte "test" instructions,
> > +	 * instead of "n >= X", which would generate 4-byte "cmp" instructions.
> > +	 */
> > +	if (n & 0x18) { /* n >= 8, including n == 0x10, hence n & 0x18. */
> > +		/* Copy 8 ~ 16 bytes. */
> > +		memcpy(dst, src, 8);
> > +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
> > +	} else if (n & 0x4) {
> > +		/* Copy 4 ~ 7 bytes. */
> > +		memcpy(dst, src, 4);
> > +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
> > +	} else if (n & 0x2) {
> > +		/* Copy 2 ~ 3 bytes. */
> > +		memcpy(dst, src, 2);
> > +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
> > +	} else if (n & 0x1) {
> > +		/* Copy 1 byte. */
> > +		memcpy(dst, src, 1);
> > +	}
> > +	return dst;
> > +}
> > +
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with 17 (or 16) < n <= 32.
> > + */
> > +static __rte_always_inline void *
> > +rte_mov17_to_32(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 16) {
> > +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +		if (n == 18) /* 16 + 2 */
> > +			return memcpy(dst, src, 18);
> > +		if (n == 20) /* 16 + 4 */
> > +			return memcpy(dst, src, 20);
> > +		if (n == 24) /* 16 + 8 */
> > +			return memcpy(dst, src, 24);
> > +		if (n == 32) {
> > +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +	}
> > +
> > +	/* Copy 17 (or 16) ~ 32 bytes. */
> > +	rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +	return dst;
> > +}
> > +
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with 33 (or 32) < n <= 64.
> > + */
> > +static __rte_always_inline void *
> > +rte_mov33_to_64(void *dst, const void *src, size_t n)
> > +{
> > +	/* Faster way when size is known at build time. */
> > +	if (__rte_constant(n)) {
> > +		if (n == 32) {
> > +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +		if (n == 36) /* 32 + 4 */
> > +			return memcpy(dst, src, 36);
> > +		if (n == 40) /* 32 + 8 */
> > +			return memcpy(dst, src, 40);
> > +		if (n == 48) {
> > +			rte_mov48((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +		if (n == 64) {
> > +			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +			return dst;
> > +		}
> > +	}
> > +
> > +	/* Copy 33 (or 32) ~ 64 bytes. */
> > +#if defined RTE_MEMCPY_AVX
> > +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +	rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
> > +#else /* SSE implementation */
> > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	if (n > 48)
> > +		rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +	rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +#endif
> > +	return dst;
> > +}
> > +
> >  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> >
> >  /**
> > @@ -232,45 +334,21 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >  	}
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	void *ret = dst;
> >  	size_t dstofss;
> >  	size_t bits;
> >
> > -	/**
> > -	 * Copy less than 16 bytes
> > -	 */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> > -	if (__rte_constant(n) && n == 32) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > -				(const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> > -	if (__rte_constant(n) && n == 64) {
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > -				(const uint8_t *)src - 32 + n);
> > -		return ret;
> > -	}
> >  	if (n <= 512) {
> >  		if (n >= 256) {
> >  			n -= 256;
> > @@ -381,41 +459,21 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >  	}
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	void *ret = dst;
> >  	size_t dstofss;
> >  	size_t bits;
> >
> > -	/**
> > -	 * Copy less than 16 bytes
> > -	 */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 256 bytes
> >  	 */
> > -	if (__rte_constant(n) && n == 32) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > -				(const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > -				(const uint8_t *)src - 32 + n);
> > -		return ret;
> > -	}
> >  	if (n <= 256) {
> >  		if (n >= 128) {
> >  			n -= 128;
> > @@ -573,38 +631,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	}						\
> >  }
> >
> > +/**
> > + * Copy bytes from one location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> >  	void *ret = dst;
> >  	size_t dstofss;
> >  	size_t srcofs;
> >
> > -	/**
> > -	 * Copy less than 16 bytes
> > -	 */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		if (n > 48)
> > -			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > -		return ret;
> > -	}
> >  	if (n <= 128) {
> >  		goto COPY_BLOCK_128_BACK15;
> >  	}
> > @@ -696,44 +738,16 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >
> >  #endif /* __AVX512F__ */
> >
> > +/**
> > + * Copy bytes from one vector register size aligned location to another,
> > + * locations should not overlap.
> > + * Use with n > 64.
> > + */
> >  static __rte_always_inline void *
> > -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> > +rte_memcpy_aligned_more_than_64(void *dst, const void *src, size_t n)
> >  {
> >  	void *ret = dst;
> >
> > -	/* Copy size < 16 bytes */
> > -	if (n < 16) {
> > -		return rte_mov15_or_less(dst, src, n);
> > -	}
> > -
> > -	/* Copy 16 <= size <= 32 bytes */
> > -	if (__rte_constant(n) && n == 32) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		if (__rte_constant(n) && n == 16)
> > -			return ret; /* avoid (harmless) duplicate copy */
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > -				(const uint8_t *)src - 16 + n);
> > -
> > -		return ret;
> > -	}
> > -
> > -	/* Copy 32 < size <= 64 bytes */
> > -	if (__rte_constant(n) && n == 64) {
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -		return ret;
> > -	}
> > -	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > -				(const uint8_t *)src - 32 + n);
> > -
> > -		return ret;
> > -	}
> > -
> >  	/* Copy 64 bytes blocks */
> >  	for (; n > 64; n -= 64) {
> >  		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > @@ -751,10 +765,22 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n)
> >  {
> > +	/* Common implementation for size <= 64 bytes. */
> > +	if (n <= 16)
> > +		return rte_mov16_or_less(dst, src, n);
> > +	if (n <= 64) {
> > +		/* Copy 17 ~ 64 bytes using vector instructions. */
> > +		if (n <= 32)
> > +			return rte_mov17_to_32(dst, src, n);
> > +		else
> > +			return rte_mov33_to_64(dst, src, n);
> > +	}
> > +
> > +	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
> >  	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > -		return rte_memcpy_aligned(dst, src, n);
> > +		return rte_memcpy_aligned_more_than_64(dst, src, n);
> >  	else
> > -		return rte_memcpy_generic(dst, src, n);
> > +		return rte_memcpy_generic_more_than_64(dst, src, n);
> >  }
> >
> >  #undef ALIGNMENT_MASK
> > --
> > 2.43.0
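
For readers skimming the thread: the core trick in the small-copy helpers above is the "two overlapping stores" pattern mentioned in the v2 changelog. A minimal standalone sketch of just that pattern (the function name and the 8..16 size range label are illustrative, not code from the patch):

#include <stdint.h>
#include <string.h>

/*
 * Sketch only, not the patch code: copy 8..16 bytes with exactly two
 * fixed-size 8-byte copies, one anchored at the start and one anchored
 * at the end. The two regions may overlap, which replaces the per-bit
 * branching of the old rte_mov15_or_less() with branch-free code.
 */
static inline void
copy_8_to_16_sketch(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, 8);
	memcpy((uint8_t *)dst + n - 8, (const uint8_t *)src + n - 8, 8);
}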
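
The "n & 0x18" branch test in rte_mov16_or_less() depends on n already being at most 16 at that point. A quick self-contained check of that equivalence (the test harness itself is illustrative only):

#include <assert.h>
#include <stddef.h>

int
main(void)
{
	/*
	 * For 0 <= n <= 16, (n & 0x18) is non-zero exactly when n >= 8:
	 * bit 3 covers 8..15 and bit 4 covers 16.
	 */
	for (size_t n = 0; n <= 16; n++)
		assert(((n & 0x18) != 0) == (n >= 8));
	return 0;
}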
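
And a usage sketch of how rte_memcpy() dispatches after this patch; the sizes are arbitrary examples and the sketch assumes a DPDK build environment providing rte_memcpy.h:

#include <rte_memcpy.h>

void
copy_examples(uint8_t *dst, const uint8_t *src)
{
	rte_memcpy(dst, src, 12);   /* n <= 16: handled by rte_mov16_or_less() */
	rte_memcpy(dst, src, 24);   /* 17..32: handled by rte_mov17_to_32() */
	rte_memcpy(dst, src, 50);   /* 33..64: handled by rte_mov33_to_64() */
	rte_memcpy(dst, src, 200);  /* n > 64: aligned or generic path, chosen via ALIGNMENT_MASK */
}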

