The new x86 memcpy_nt() helper in this series maps to memcpy_flushcache(), and the ZONE_DEVICE fast path uses that primitive for constant-sized struct page template copies.
memcpy_flushcache() currently inlines only the 4, 8, and 16-byte cases. Larger constant-sized copies fall back to __memcpy_flushcache() even when the destination is naturally aligned. Extend the inline movnti coverage to 32, 48, 64, 80, and 96 bytes so the struct page-sized copies used by that path can stay on the inline non-temporal store path instead of dropping into the out-of-line helper. Factor the store sequences into 8/16/32/64-byte helpers, keep the existing 4/8/16-byte cases handled directly in memcpy_flushcache(), issue the stores in ascending address order, and leave all other sizes on __memcpy_flushcache(). The new inline cases are taken only when cnt is a compile-time constant and dst is 8-byte aligned; any other combination still goes through __memcpy_flushcache(). memcpy_flushcache() also gains an early return for cnt == 0. Signed-off-by: Li Zhe <[email protected]> --- arch/x86/include/asm/string_64.h | 80 +++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 6f36abedc56a..95ef2d481418 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -4,6 +4,7 @@ #ifdef __KERNEL__ #include <linux/jump_label.h> +#include <linux/align.h> /* Written 2002 by Andi Kleen */ @@ -82,8 +83,81 @@ int strcmp(const char *cs, const char *ct); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 void __memcpy_flushcache(void *dst, const void *src, size_t cnt); -static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) + +static __always_inline void memcpy_flushcache_8(void *dst, const void *src) +{ + asm volatile("movntiq %1, %0" + : "=m"(*(u64 *)dst) + : "r"(*(const u64 *)src) + : "memory"); +} + +static __always_inline void memcpy_flushcache_16(void *dst, + const void *src) +{ + memcpy_flushcache_8(dst, src); + memcpy_flushcache_8(dst + 8, src + 8); +} + +static __always_inline void memcpy_flushcache_32(void *dst, + const void *src) +{ + memcpy_flushcache_16(dst, src); + memcpy_flushcache_16(dst + 16, src + 16); +} + +static __always_inline void memcpy_flushcache_64(void *dst, + const void 
*src) { + memcpy_flushcache_32(dst, src); + memcpy_flushcache_32(dst + 32, src + 32); +} + +/* + * Keep the additional aligned fixed-size cases on the inline movnti path. + * Leave the existing 4/8/16-byte cases handled directly in + * memcpy_flushcache() so their code generation stays unchanged. + */ +static __always_inline int memcpy_flushcache_large(void *dst, + const void *src, + size_t cnt) +{ + char *dptr = dst; + const char *sptr = src; + + if (!IS_ALIGNED((unsigned long)dst, 8)) + return 0; + + switch (cnt) { + case 32: + memcpy_flushcache_32(dptr, sptr); + return 1; + case 48: + memcpy_flushcache_32(dptr, sptr); + memcpy_flushcache_16(dptr + 32, sptr + 32); + return 1; + case 64: + memcpy_flushcache_64(dptr, sptr); + return 1; + case 80: + memcpy_flushcache_64(dptr, sptr); + memcpy_flushcache_16(dptr + 64, sptr + 64); + return 1; + case 96: + memcpy_flushcache_64(dptr, sptr); + memcpy_flushcache_32(dptr + 64, sptr + 64); + return 1; + } + + return 0; +} + +static __always_inline void memcpy_flushcache(void *dst, const void *src, + size_t cnt) +{ + if (!cnt) + return; + if (__builtin_constant_p(cnt)) { switch (cnt) { case 4: @@ -97,7 +171,11 @@ static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8))); return; } + + if (memcpy_flushcache_large(dst, src, cnt)) + return; } + __memcpy_flushcache(dst, src, cnt); } -- 2.20.1

