The copy_mc_to_kernel() helper is a memory copy implementation that
handles source exceptions. It can be used in memory copy scenarios that
tolerate hardware memory errors (e.g. pmem_read/dax_copy_to_iter).

Currently, only x86 and ppc support this helper. Now that arm64 supports
the machine check safe framework, add an arm64 implementation of
copy_mc_to_kernel().
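
A minimal usage sketch (hypothetical caller; example_mc_read() and the
-EIO mapping are illustrative only, loosely modeled on a pmem-style read
path, and are not part of this patch):

	/*
	 * copy_mc_to_kernel() returns 0 on success, or the requested
	 * size if a source exception was consumed during the copy.
	 */
	static int example_mc_read(void *dst, const void *src, size_t len)
	{
		unsigned long rem;

		rem = copy_mc_to_kernel(dst, src, len);
		if (rem)
			return -EIO;	/* poison consumed, report I/O error */

		return 0;
	}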

Signed-off-by: Tong Tiangen <tongtian...@huawei.com>
---
 arch/arm64/include/asm/string.h  |   5 +
 arch/arm64/include/asm/uaccess.h |  21 +++
 arch/arm64/lib/Makefile          |   2 +-
 arch/arm64/lib/memcpy_mc.S       | 257 +++++++++++++++++++++++++++++++
 mm/kasan/shadow.c                |  12 ++
 5 files changed, 296 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/memcpy_mc.S

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index 3a3264ff47b9..995b63c26e99 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -35,6 +35,10 @@ extern void *memchr(const void *, int, __kernel_size_t);
 extern void *memcpy(void *, const void *, __kernel_size_t);
 extern void *__memcpy(void *, const void *, __kernel_size_t);
 
+#define __HAVE_ARCH_MEMCPY_MC
+extern int memcpy_mcs(void *, const void *, __kernel_size_t);
+extern int __memcpy_mcs(void *, const void *, __kernel_size_t);
+
 #define __HAVE_ARCH_MEMMOVE
 extern void *memmove(void *, const void *, __kernel_size_t);
 extern void *__memmove(void *, const void *, __kernel_size_t);
@@ -57,6 +61,7 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt);
  */
 
 #define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memcpy_mcs(dst, src, len) __memcpy_mcs(dst, src, len)
 #define memmove(dst, src, len) __memmove(dst, src, len)
 #define memset(s, c, n) __memset(s, c, n)
 
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 14be5000c5a0..61e28ef2112a 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -425,4 +425,25 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr,
 
 #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */
 
+#ifdef CONFIG_ARCH_HAS_COPY_MC
+/**
+ * copy_mc_to_kernel - memory copy that handles source exceptions
+ *
+ * @to:        destination address
+ * @from:      source address
+ * @size:      number of bytes to copy
+ *
+ * Return 0 for success, or @size if there was an exception.
+ */
+static inline unsigned long __must_check
+copy_mc_to_kernel(void *to, const void *from, unsigned long size)
+{
+       int ret;
+
+       ret = memcpy_mcs(to, from, size);
+       return (ret == -EFAULT) ? size : 0;
+}
+#define copy_mc_to_kernel copy_mc_to_kernel
+#endif
+
 #endif /* __ASM_UACCESS_H */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index a2fd865b816d..899d6ae9698c 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -3,7 +3,7 @@ lib-y           := clear_user.o delay.o copy_from_user.o        \
                   copy_to_user.o copy_page.o                           \
                   clear_page.o csum.o insn.o memchr.o memcpy.o         \
                   memset.o memcmp.o strcmp.o strncmp.o strlen.o        \
-                  strnlen.o strchr.o strrchr.o tishift.o
+                  strnlen.o strchr.o strrchr.o tishift.o memcpy_mc.o
 
 ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS)       += xor-neon.o
diff --git a/arch/arm64/lib/memcpy_mc.S b/arch/arm64/lib/memcpy_mc.S
new file mode 100644
index 000000000000..7076b500d154
--- /dev/null
+++ b/arch/arm64/lib/memcpy_mc.S
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2012-2021, Arm Limited.
+ *
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define L(label) .L ## label
+
+#define dstin  x0
+#define src    x1
+#define count  x2
+#define dst    x3
+#define srcend x4
+#define dstend x5
+#define A_l    x6
+#define A_lw   w6
+#define A_h    x7
+#define B_l    x8
+#define B_lw   w8
+#define B_h    x9
+#define C_l    x10
+#define C_lw   w10
+#define C_h    x11
+#define D_l    x12
+#define D_h    x13
+#define E_l    x14
+#define E_h    x15
+#define F_l    x16
+#define F_h    x17
+#define G_l    count
+#define G_h    dst
+#define H_l    src
+#define H_h    srcend
+#define tmp1   x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+SYM_FUNC_START(__pi_memcpy_mcs)
+       add     srcend, src, count
+       add     dstend, dstin, count
+       cmp     count, 128
+       b.hi    L(copy_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)
+
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
+       CPY_MC(9998f, ldp       A_l, A_h, [src])
+       CPY_MC(9998f, ldp       D_l, D_h, [srcend, -16])
+       CPY_MC(9998f, stp       A_l, A_h, [dstin])
+       CPY_MC(9998f, stp       D_l, D_h, [dstend, -16])
+       mov x0, #0
+       ret
+
+       /* Copy 8-15 bytes.  */
+L(copy16):
+       tbz     count, 3, L(copy8)
+       CPY_MC(9998f, ldr       A_l, [src])
+       CPY_MC(9998f, ldr       A_h, [srcend, -8])
+       CPY_MC(9998f, str       A_l, [dstin])
+       CPY_MC(9998f, str       A_h, [dstend, -8])
+       mov x0, #0
+       ret
+
+       .p2align 3
+       /* Copy 4-7 bytes.  */
+L(copy8):
+       tbz     count, 2, L(copy4)
+       CPY_MC(9998f, ldr       A_lw, [src])
+       CPY_MC(9998f, ldr       B_lw, [srcend, -4])
+       CPY_MC(9998f, str       A_lw, [dstin])
+       CPY_MC(9998f, str       B_lw, [dstend, -4])
+       mov x0, #0
+       ret
+
+       /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+       cbz     count, L(copy0)
+       lsr     tmp1, count, 1
+       CPY_MC(9998f, ldrb      A_lw, [src])
+       CPY_MC(9998f, ldrb      C_lw, [srcend, -1])
+       CPY_MC(9998f, ldrb      B_lw, [src, tmp1])
+       CPY_MC(9998f, strb      A_lw, [dstin])
+       CPY_MC(9998f, strb      B_lw, [dstin, tmp1])
+       CPY_MC(9998f, strb      C_lw, [dstend, -1])
+L(copy0):
+       mov x0, #0
+       ret
+
+       .p2align 4
+       /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+       CPY_MC(9998f, ldp       A_l, A_h, [src])
+       CPY_MC(9998f, ldp       B_l, B_h, [src, 16])
+       CPY_MC(9998f, ldp       C_l, C_h, [srcend, -32])
+       CPY_MC(9998f, ldp       D_l, D_h, [srcend, -16])
+       cmp     count, 64
+       b.hi    L(copy128)
+       CPY_MC(9998f, stp       A_l, A_h, [dstin])
+       CPY_MC(9998f, stp       B_l, B_h, [dstin, 16])
+       CPY_MC(9998f, stp       C_l, C_h, [dstend, -32])
+       CPY_MC(9998f, stp       D_l, D_h, [dstend, -16])
+       mov x0, #0
+       ret
+
+       .p2align 4
+       /* Copy 65..128 bytes.  */
+L(copy128):
+       CPY_MC(9998f, ldp       E_l, E_h, [src, 32])
+       CPY_MC(9998f, ldp       F_l, F_h, [src, 48])
+       cmp     count, 96
+       b.ls    L(copy96)
+       CPY_MC(9998f, ldp       G_l, G_h, [srcend, -64])
+       CPY_MC(9998f, ldp       H_l, H_h, [srcend, -48])
+       CPY_MC(9998f, stp       G_l, G_h, [dstend, -64])
+       CPY_MC(9998f, stp       H_l, H_h, [dstend, -48])
+L(copy96):
+       CPY_MC(9998f, stp       A_l, A_h, [dstin])
+       CPY_MC(9998f, stp       B_l, B_h, [dstin, 16])
+       CPY_MC(9998f, stp       E_l, E_h, [dstin, 32])
+       CPY_MC(9998f, stp       F_l, F_h, [dstin, 48])
+       CPY_MC(9998f, stp       C_l, C_h, [dstend, -32])
+       CPY_MC(9998f, stp       D_l, D_h, [dstend, -16])
+       mov x0, #0
+       ret
+
+       .p2align 4
+       /* Copy more than 128 bytes.  */
+L(copy_long):
+       /* Use backwards copy if there is an overlap.  */
+       sub     tmp1, dstin, src
+       cbz     tmp1, L(copy0)
+       cmp     tmp1, count
+       b.lo    L(copy_long_backwards)
+
+       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+       CPY_MC(9998f, ldp       D_l, D_h, [src])
+       and     tmp1, dstin, 15
+       bic     dst, dstin, 15
+       sub     src, src, tmp1
+       add     count, count, tmp1      /* Count is now 16 too large.  */
+       CPY_MC(9998f, ldp       A_l, A_h, [src, 16])
+       CPY_MC(9998f, stp       D_l, D_h, [dstin])
+       CPY_MC(9998f, ldp       B_l, B_h, [src, 32])
+       CPY_MC(9998f, ldp       C_l, C_h, [src, 48])
+       CPY_MC(9998f, ldp       D_l, D_h, [src, 64]!)
+       subs    count, count, 128 + 16  /* Test and readjust count.  */
+       b.ls    L(copy64_from_end)
+
+L(loop64):
+       CPY_MC(9998f, stp       A_l, A_h, [dst, 16])
+       CPY_MC(9998f, ldp       A_l, A_h, [src, 16])
+       CPY_MC(9998f, stp       B_l, B_h, [dst, 32])
+       CPY_MC(9998f, ldp       B_l, B_h, [src, 32])
+       CPY_MC(9998f, stp       C_l, C_h, [dst, 48])
+       CPY_MC(9998f, ldp       C_l, C_h, [src, 48])
+       CPY_MC(9998f, stp       D_l, D_h, [dst, 64]!)
+       CPY_MC(9998f, ldp       D_l, D_h, [src, 64]!)
+       subs    count, count, 64
+       b.hi    L(loop64)
+
+       /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+       CPY_MC(9998f, ldp       E_l, E_h, [srcend, -64])
+       CPY_MC(9998f, stp       A_l, A_h, [dst, 16])
+       CPY_MC(9998f, ldp       A_l, A_h, [srcend, -48])
+       CPY_MC(9998f, stp       B_l, B_h, [dst, 32])
+       CPY_MC(9998f, ldp       B_l, B_h, [srcend, -32])
+       CPY_MC(9998f, stp       C_l, C_h, [dst, 48])
+       CPY_MC(9998f, ldp       C_l, C_h, [srcend, -16])
+       CPY_MC(9998f, stp       D_l, D_h, [dst, 64])
+       CPY_MC(9998f, stp       E_l, E_h, [dstend, -64])
+       CPY_MC(9998f, stp       A_l, A_h, [dstend, -48])
+       CPY_MC(9998f, stp       B_l, B_h, [dstend, -32])
+       CPY_MC(9998f, stp       C_l, C_h, [dstend, -16])
+       mov x0, #0
+       ret
+
+       .p2align 4
+
+       /* Large backwards copy for overlapping copies.
+          Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+       CPY_MC(9998f, ldp       D_l, D_h, [srcend, -16])
+       and     tmp1, dstend, 15
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       CPY_MC(9998f, ldp       A_l, A_h, [srcend, -16])
+       CPY_MC(9998f, stp       D_l, D_h, [dstend, -16])
+       CPY_MC(9998f, ldp       B_l, B_h, [srcend, -32])
+       CPY_MC(9998f, ldp       C_l, C_h, [srcend, -48])
+       CPY_MC(9998f, ldp       D_l, D_h, [srcend, -64]!)
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+       CPY_MC(9998f, stp       A_l, A_h, [dstend, -16])
+       CPY_MC(9998f, ldp       A_l, A_h, [srcend, -16])
+       CPY_MC(9998f, stp       B_l, B_h, [dstend, -32])
+       CPY_MC(9998f, ldp       B_l, B_h, [srcend, -32])
+       CPY_MC(9998f, stp       C_l, C_h, [dstend, -48])
+       CPY_MC(9998f, ldp       C_l, C_h, [srcend, -48])
+       CPY_MC(9998f, stp       D_l, D_h, [dstend, -64]!)
+       CPY_MC(9998f, ldp       D_l, D_h, [srcend, -64]!)
+       subs    count, count, 64
+       b.hi    L(loop64_backwards)
+
+       /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+       CPY_MC(9998f, ldp       G_l, G_h, [src, 48])
+       CPY_MC(9998f, stp       A_l, A_h, [dstend, -16])
+       CPY_MC(9998f, ldp       A_l, A_h, [src, 32])
+       CPY_MC(9998f, stp       B_l, B_h, [dstend, -32])
+       CPY_MC(9998f, ldp       B_l, B_h, [src, 16])
+       CPY_MC(9998f, stp       C_l, C_h, [dstend, -48])
+       CPY_MC(9998f, ldp       C_l, C_h, [src])
+       CPY_MC(9998f, stp       D_l, D_h, [dstend, -64])
+       CPY_MC(9998f, stp       G_l, G_h, [dstin, 48])
+       CPY_MC(9998f, stp       A_l, A_h, [dstin, 32])
+       CPY_MC(9998f, stp       B_l, B_h, [dstin, 16])
+       CPY_MC(9998f, stp       C_l, C_h, [dstin])
+       mov x0, #0
+       ret
+
+9998:  mov x0, #-EFAULT
+       ret
+SYM_FUNC_END(__pi_memcpy_mcs)
+
+SYM_FUNC_ALIAS(__memcpy_mcs, __pi_memcpy_mcs)
+EXPORT_SYMBOL(__memcpy_mcs)
+SYM_FUNC_ALIAS_WEAK(memcpy_mcs, __memcpy_mcs)
+EXPORT_SYMBOL(memcpy_mcs)
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 9ef84f31833f..e6519fd329b2 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -79,6 +79,18 @@ void *memcpy(void *dest, const void *src, size_t len)
 }
 #endif
 
+#ifdef __HAVE_ARCH_MEMCPY_MC
+#undef memcpy_mcs
+int memcpy_mcs(void *dest, const void *src, size_t len)
+{
+       if (!kasan_check_range(src, len, false, _RET_IP_) ||
+           !kasan_check_range(dest, len, true, _RET_IP_))
+               return (unsigned long)len;
+
+       return __memcpy_mcs(dest, src, len);
+}
+#endif
+
 void *__asan_memset(void *addr, int c, ssize_t len)
 {
        if (!kasan_check_range(addr, len, true, _RET_IP_))
-- 
2.25.1
