On Wed, Jul 01, 2026 at 05:05:52PM +0800, Li Zhe wrote:
> The new x86 memcpy_nt() helper in this series maps to
> memcpy_flushcache(), and the ZONE_DEVICE fast path uses that primitive
> for constant-sized struct page template copies.
> 
> memcpy_flushcache() currently inlines only the 4, 8, and 16-byte
> cases. Larger constant-sized copies fall back to __memcpy_flushcache()
> even when the destination is naturally aligned. Extend the inline
> movnti coverage to 32, 48, 64, 80, and 96 bytes so the struct

Why exactly those sizes and not bigger?

Any numbers to back this up?

> page-sized copies used by that path can stay on the inline
> non-temporal store path instead of dropping into the out-of-line
> helper.
> 
> Factor the store sequences into 8/16/32/64-byte helpers, keep the
> existing 4/8/16-byte cases handled directly in memcpy_flushcache(),
> issue the stores in ascending address order, and leave all other sizes
> on __memcpy_flushcache().
> 
> Signed-off-by: Li Zhe <[email protected]>
> ---
>  arch/x86/include/asm/string_64.h | 80 +++++++++++++++++++++++++++++++-
>  1 file changed, 79 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/string_64.h 
> b/arch/x86/include/asm/string_64.h
> index 6f36abedc56a..95ef2d481418 100644
> --- a/arch/x86/include/asm/string_64.h
> +++ b/arch/x86/include/asm/string_64.h
> @@ -4,6 +4,7 @@
>  
>  #ifdef __KERNEL__
>  #include <linux/jump_label.h>
> +#include <linux/align.h>
>  
>  /* Written 2002 by Andi Kleen */
>  
> @@ -82,8 +83,81 @@ int strcmp(const char *cs, const char *ct);
>  #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
>  #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
>  void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
> -static __always_inline void memcpy_flushcache(void *dst, const void *src, 
> size_t cnt)
> +
> +static __always_inline void memcpy_flushcache_8(void *dst, const void *src)

If this is calling MOVNTI, you might as well call it that way. Only leave the
externally exposed wrapper called "flushcache".

> +{
> +     asm volatile("movntiq %1, %0"
> +                  : "=m"(*(u64 *)dst)
> +                  : "r"(*(const u64 *)src)
> +                  : "memory");
> +}
> +
> +static __always_inline void memcpy_flushcache_16(void *dst,
> +                                              const void *src)
> +{
> +     memcpy_flushcache_8(dst, src);
> +     memcpy_flushcache_8(dst + 8, src + 8);
> +}
> +
> +static __always_inline void memcpy_flushcache_32(void *dst,
> +                                              const void *src)
> +{
> +     memcpy_flushcache_16(dst, src);
> +     memcpy_flushcache_16(dst + 16, src + 16);
> +}
> +
> +static __always_inline void memcpy_flushcache_64(void *dst,
> +                                              const void *src)
>  {
> +     memcpy_flushcache_32(dst, src);
> +     memcpy_flushcache_32(dst + 32, src + 32);
> +}
> +
> +/*
> + * Keep the additional aligned fixed-size cases on the inline movnti path.
> + * Leave the existing 4/8/16-byte cases handled directly in
> + * memcpy_flushcache() so their code generation stays unchanged.
> + */
> +static __always_inline int memcpy_flushcache_large(void *dst,
> +                                                const void *src,
> +                                                size_t cnt)
> +{
> +     char *dptr = dst;
> +     const char *sptr = src;
> +
> +     if (!IS_ALIGNED((unsigned long)dst, 8))
> +             return 0;
> +
> +     switch (cnt) {
> +     case 32:
> +             memcpy_flushcache_32(dptr, sptr);
> +             return 1;
> +     case 48:
> +             memcpy_flushcache_32(dptr, sptr);
> +             memcpy_flushcache_16(dptr + 32, sptr + 32);
> +             return 1;
> +     case 64:
> +             memcpy_flushcache_64(dptr, sptr);
> +             return 1;
> +     case 80:
> +             memcpy_flushcache_64(dptr, sptr);
> +             memcpy_flushcache_16(dptr + 64, sptr + 64);
> +             return 1;
> +     case 96:
> +             memcpy_flushcache_64(dptr, sptr);
> +             memcpy_flushcache_32(dptr + 64, sptr + 64);
> +             return 1;
> +     }
> +
> +     return 0;
> +}
> +
> +static __always_inline void memcpy_flushcache(void *dst, const void *src,
> +                                           size_t cnt)
> +{
> +     if (!cnt)
> +             return;
> +
>       if (__builtin_constant_p(cnt)) {
>               switch (cnt) {
>                       case 4:
> @@ -97,7 +171,11 @@ static __always_inline void memcpy_flushcache(void *dst, 
> const void *src, size_t
>                               asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) 
> : "r"(*(u64 *)(src + 8)));
>                               return;
>               }
> +
> +             if (memcpy_flushcache_large(dst, src, cnt))

Why is this a separate function and not extending the switch-case?

I betcha it has to do with the alignment check but nothing explains to me why.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Reply via email to