Until now there has been no assembler-optimized version of memmove() for
ARM. Add one for both ARM32 and ARM64, taken from Linux-6.10, and also
update the ARM64 memcpy() to the Linux-6.10 version.
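
For reference, a minimal C sketch (illustrative only, not part of the patch;
the helper name is made up) of the overlap handling the assembler routines
implement: both the ARM32 entry (subs/cmphi/bls __memcpy) and the AArch64
copy_long path take the forward, memcpy-style route unless the destination
lies inside the source buffer, in which case the copy runs downwards.

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical C model of the dispatch done in assembler. */
    static void *memmove_model(void *dest, const void *src, size_t n)
    {
            unsigned char *d = dest;
            const unsigned char *s = src;

            /* A forward copy is safe when dest <= src or when dest is at
             * least n bytes above src (no destructive overlap). */
            if ((uintptr_t)d <= (uintptr_t)s ||
                (uintptr_t)d - (uintptr_t)s >= n) {
                    while (n--)
                            *d++ = *s++;    /* memcpy-like forward copy */
            } else {
                    d += n;                 /* overlapping: copy downwards */
                    s += n;
                    while (n--)
                            *--d = *--s;
            }

            return dest;
    }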

Reviewed-by: Ahmad Fatoum <a.fat...@pengutronix.de>
Signed-off-by: Sascha Hauer <s.ha...@pengutronix.de>
---
 arch/arm/include/asm/cache.h   |   8 ++
 arch/arm/include/asm/string.h  |   4 +-
 arch/arm/lib32/Makefile        |   1 +
 arch/arm/lib32/memmove.S       | 206 +++++++++++++++++++++++++++++++
 arch/arm/lib64/copy_template.S | 180 ---------------------------
 arch/arm/lib64/memcpy.S        | 274 ++++++++++++++++++++++++++++++++++-------
 arch/arm/lib64/memset.S        |  18 +--
 arch/arm/lib64/string.c        |  17 +++
 include/string.h               |   2 +
 lib/string.c                   |   1 -
 10 files changed, 478 insertions(+), 233 deletions(-)

diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
index 261c30129a..dd022c1f23 100644
--- a/arch/arm/include/asm/cache.h
+++ b/arch/arm/include/asm/cache.h
@@ -3,6 +3,13 @@
 #ifndef __ASM_CACHE_H
 #define __ASM_CACHE_H
 
+#ifdef CONFIG_CPU_64
+#define L1_CACHE_SHIFT         (6)
+#define L1_CACHE_BYTES         (1 << L1_CACHE_SHIFT)
+#endif
+
+#ifndef __ASSEMBLY__
+
 void v8_invalidate_icache_all(void);
 void v8_flush_dcache_all(void);
 void v8_invalidate_dcache_all(void);
@@ -25,5 +32,6 @@ void arm_early_mmu_cache_invalidate(void);
 void sync_caches_for_execution(void);
 
 #include <asm-generic/cache.h>
+#endif
 
 #endif
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index 2322b846b2..f79392e53d 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -9,10 +9,12 @@
 extern void *memcpy(void *, const void *, __kernel_size_t);
 #define __HAVE_ARCH_MEMSET
 extern void *memset(void *, int, __kernel_size_t);
-
+#define __HAVE_ARCH_MEMMOVE
+extern void *memmove(void *, const void *, __kernel_size_t);
 #endif
 
 extern void *__memcpy(void *, const void *, __kernel_size_t);
 extern void *__memset(void *, int, __kernel_size_t);
+extern void *__memmove(void *, const void *, __kernel_size_t);
 
 #endif
diff --git a/arch/arm/lib32/Makefile b/arch/arm/lib32/Makefile
index 511a029062..a139a80fb8 100644
--- a/arch/arm/lib32/Makefile
+++ b/arch/arm/lib32/Makefile
@@ -21,6 +21,7 @@ obj-y += lshrdi3.o
 obj-y  += runtime-offset.o
 pbl-y  += runtime-offset.o
 obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS)    += memcpy.o
+obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS)    += memmove.o
 obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS)    += memset.o
 obj-$(CONFIG_ARM_UNWIND) += unwind.o
 obj-$(CONFIG_MODULES) += module.o
diff --git a/arch/arm/lib32/memmove.S b/arch/arm/lib32/memmove.S
new file mode 100644
index 0000000000..6410554039
--- /dev/null
+++ b/arch/arm/lib32/memmove.S
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *  linux/arch/arm/lib/memmove.S
+ *
+ *  Author:    Nicolas Pitre
+ *  Created:   Sep 28, 2005
+ *  Copyright: (C) MontaVista Software Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/unwind.h>
+
+               .text
+
+/*
+ * Prototype: void *memmove(void *dest, const void *src, size_t n);
+ *
+ * Note:
+ *
+ * If the memory regions don't overlap, we simply branch to memcpy which is
+ * normally a bit faster. Otherwise the copy is done going downwards.  This
+ * is a transposition of the code from copy_template.S but with the copy
+ * occurring in the opposite direction.
+ */
+
+ENTRY(__memmove)
+WEAK(memmove)
+       UNWIND( .fnstart                        )
+
+               subs    ip, r0, r1
+               cmphi   r2, ip
+               bls     __memcpy
+       UNWIND( .fnend                          )
+
+       UNWIND( .fnstart                        )
+       UNWIND( .save   {r0, r4, fpreg, lr}     )
+               stmfd   sp!, {r0, r4, UNWIND(fpreg,) lr}
+       UNWIND( .setfp  fpreg, sp               )
+       UNWIND( mov     fpreg, sp               )
+               add     r1, r1, r2
+               add     r0, r0, r2
+               subs    r2, r2, #4
+               blt     8f
+               ands    ip, r0, #3
+       PLD(    pld     [r1, #-4]               )
+               bne     9f
+               ands    ip, r1, #3
+               bne     10f
+
+1:             subs    r2, r2, #(28)
+               stmfd   sp!, {r5, r6, r8, r9}
+               blt     5f
+
+       CALGN(  ands    ip, r0, #31             )
+       CALGN(  sbcsne  r4, ip, r2              )  @ C is always set here
+       CALGN(  bcs     2f                      )
+       CALGN(  adr     r4, 6f                  )
+       CALGN(  subs    r2, r2, ip              )  @ C is set here
+       CALGN(  rsb     ip, ip, #32             )
+       CALGN(  add     pc, r4, ip              )
+
+       PLD(    pld     [r1, #-4]               )
+2:     PLD(    subs    r2, r2, #96             )
+       PLD(    pld     [r1, #-32]              )
+       PLD(    blt     4f                      )
+       PLD(    pld     [r1, #-64]              )
+       PLD(    pld     [r1, #-96]              )
+
+3:     PLD(    pld     [r1, #-128]             )
+4:             ldmdb   r1!, {r3, r4, r5, r6, r8, r9, ip, lr}
+               subs    r2, r2, #32
+               stmdb   r0!, {r3, r4, r5, r6, r8, r9, ip, lr}
+               bge     3b
+       PLD(    cmn     r2, #96                 )
+       PLD(    bge     4b                      )
+
+5:             ands    ip, r2, #28
+               rsb     ip, ip, #32
+               addne   pc, pc, ip              @ C is always clear here
+               b       7f
+6:             W(nop)
+               W(ldr)  r3, [r1, #-4]!
+               W(ldr)  r4, [r1, #-4]!
+               W(ldr)  r5, [r1, #-4]!
+               W(ldr)  r6, [r1, #-4]!
+               W(ldr)  r8, [r1, #-4]!
+               W(ldr)  r9, [r1, #-4]!
+               W(ldr)  lr, [r1, #-4]!
+
+               add     pc, pc, ip
+               nop
+               W(nop)
+               W(str)  r3, [r0, #-4]!
+               W(str)  r4, [r0, #-4]!
+               W(str)  r5, [r0, #-4]!
+               W(str)  r6, [r0, #-4]!
+               W(str)  r8, [r0, #-4]!
+               W(str)  r9, [r0, #-4]!
+               W(str)  lr, [r0, #-4]!
+
+       CALGN(  bcs     2b                      )
+
+7:             ldmfd   sp!, {r5, r6, r8, r9}
+
+8:             movs    r2, r2, lsl #31
+               ldrbne  r3, [r1, #-1]!
+               ldrbcs  r4, [r1, #-1]!
+               ldrbcs  ip, [r1, #-1]
+               strbne  r3, [r0, #-1]!
+               strbcs  r4, [r0, #-1]!
+               strbcs  ip, [r0, #-1]
+               ldmfd   sp!, {r0, r4, UNWIND(fpreg,) pc}
+
+9:             cmp     ip, #2
+               ldrbgt  r3, [r1, #-1]!
+               ldrbge  r4, [r1, #-1]!
+               ldrb    lr, [r1, #-1]!
+               strbgt  r3, [r0, #-1]!
+               strbge  r4, [r0, #-1]!
+               subs    r2, r2, ip
+               strb    lr, [r0, #-1]!
+               blt     8b
+               ands    ip, r1, #3
+               beq     1b
+
+10:            bic     r1, r1, #3
+               cmp     ip, #2
+               ldr     r3, [r1, #0]
+               beq     17f
+               blt     18f
+
+
+               .macro  backward_copy_shift push pull
+
+               subs    r2, r2, #28
+               blt     14f
+
+       CALGN(  ands    ip, r0, #31             )
+       CALGN(  sbcsne  r4, ip, r2              )  @ C is always set here
+       CALGN(  subcc   r2, r2, ip              )
+       CALGN(  bcc     15f                     )
+
+11:            stmfd   sp!, {r5, r6, r8 - r10}
+
+       PLD(    pld     [r1, #-4]               )
+       PLD(    subs    r2, r2, #96             )
+       PLD(    pld     [r1, #-32]              )
+       PLD(    blt     13f                     )
+       PLD(    pld     [r1, #-64]              )
+       PLD(    pld     [r1, #-96]              )
+
+12:    PLD(    pld     [r1, #-128]             )
+13:            ldmdb   r1!, {r8, r9, r10, ip}
+               mov     lr, r3, lspush #\push
+               subs    r2, r2, #32
+               ldmdb   r1!, {r3, r4, r5, r6}
+               orr     lr, lr, ip, lspull #\pull
+               mov     ip, ip, lspush #\push
+               orr     ip, ip, r10, lspull #\pull
+               mov     r10, r10, lspush #\push
+               orr     r10, r10, r9, lspull #\pull
+               mov     r9, r9, lspush #\push
+               orr     r9, r9, r8, lspull #\pull
+               mov     r8, r8, lspush #\push
+               orr     r8, r8, r6, lspull #\pull
+               mov     r6, r6, lspush #\push
+               orr     r6, r6, r5, lspull #\pull
+               mov     r5, r5, lspush #\push
+               orr     r5, r5, r4, lspull #\pull
+               mov     r4, r4, lspush #\push
+               orr     r4, r4, r3, lspull #\pull
+               stmdb   r0!, {r4 - r6, r8 - r10, ip, lr}
+               bge     12b
+       PLD(    cmn     r2, #96                 )
+       PLD(    bge     13b                     )
+
+               ldmfd   sp!, {r5, r6, r8 - r10}
+
+14:            ands    ip, r2, #28
+               beq     16f
+
+15:            mov     lr, r3, lspush #\push
+               ldr     r3, [r1, #-4]!
+               subs    ip, ip, #4
+               orr     lr, lr, r3, lspull #\pull
+               str     lr, [r0, #-4]!
+               bgt     15b
+       CALGN(  cmp     r2, #0                  )
+       CALGN(  bge     11b                     )
+
+16:            add     r1, r1, #(\pull / 8)
+               b       8b
+
+               .endm
+
+
+               backward_copy_shift     push=8  pull=24
+
+17:            backward_copy_shift     push=16 pull=16
+
+18:            backward_copy_shift     push=24 pull=8
+
+       UNWIND( .fnend                          )
+ENDPROC(memmove)
+ENDPROC(__memmove)
diff --git a/arch/arm/lib64/copy_template.S b/arch/arm/lib64/copy_template.S
deleted file mode 100644
index 8e4ff059d1..0000000000
--- a/arch/arm/lib64/copy_template.S
+++ /dev/null
@@ -1,180 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
-/* SPDX-FileCopyrightText: 2013 Linaro */
-
-/*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * and re-licensed under GPLv2 for the Linux kernel. The original code can
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
- */
-
-/*
- * Copy a buffer from src to dest (alignment handled by the hardware)
- *
- * Parameters:
- *     x0 - dest
- *     x1 - src
- *     x2 - n
- * Returns:
- *     x0 - dest
- */
-dstin  .req    x0
-src    .req    x1
-count  .req    x2
-tmp1   .req    x3
-tmp1w  .req    w3
-tmp2   .req    x4
-tmp2w  .req    w4
-dst    .req    x6
-
-A_l    .req    x7
-A_h    .req    x8
-B_l    .req    x9
-B_h    .req    x10
-C_l    .req    x11
-C_h    .req    x12
-D_l    .req    x13
-D_h    .req    x14
-
-       mov     dst, dstin
-       cmp     count, #16
-       /*When memory length is less than 16, the accessed are not aligned.*/
-       b.lo    .Ltiny15
-
-       neg     tmp2, src
-       ands    tmp2, tmp2, #15/* Bytes to reach alignment. */
-       b.eq    .LSrcAligned
-       sub     count, count, tmp2
-       /*
-       * Copy the leading memory data from src to dst in an increasing
-       * address order.By this way,the risk of overwritting the source
-       * memory data is eliminated when the distance between src and
-       * dst is less than 16. The memory accesses here are alignment.
-       */
-       tbz     tmp2, #0, 1f
-       ldrb1   tmp1w, src, #1
-       strb1   tmp1w, dst, #1
-1:
-       tbz     tmp2, #1, 2f
-       ldrh1   tmp1w, src, #2
-       strh1   tmp1w, dst, #2
-2:
-       tbz     tmp2, #2, 3f
-       ldr1    tmp1w, src, #4
-       str1    tmp1w, dst, #4
-3:
-       tbz     tmp2, #3, .LSrcAligned
-       ldr1    tmp1, src, #8
-       str1    tmp1, dst, #8
-
-.LSrcAligned:
-       cmp     count, #64
-       b.ge    .Lcpy_over64
-       /*
-       * Deal with small copies quickly by dropping straight into the
-       * exit block.
-       */
-.Ltail63:
-       /*
-       * Copy up to 48 bytes of data. At this point we only need the
-       * bottom 6 bits of count to be accurate.
-       */
-       ands    tmp1, count, #0x30
-       b.eq    .Ltiny15
-       cmp     tmp1w, #0x20
-       b.eq    1f
-       b.lt    2f
-       ldp1    A_l, A_h, src, #16
-       stp1    A_l, A_h, dst, #16
-1:
-       ldp1    A_l, A_h, src, #16
-       stp1    A_l, A_h, dst, #16
-2:
-       ldp1    A_l, A_h, src, #16
-       stp1    A_l, A_h, dst, #16
-.Ltiny15:
-       /*
-       * Prefer to break one ldp/stp into several load/store to access
-       * memory in an increasing address order,rather than to load/store 16
-       * bytes from (src-16) to (dst-16) and to backward the src to aligned
-       * address,which way is used in original cortex memcpy. If keeping
-       * the original memcpy process here, memmove need to satisfy the
-       * precondition that src address is at least 16 bytes bigger than dst
-       * address,otherwise some source data will be overwritten when memove
-       * call memcpy directly. To make memmove simpler and decouple the
-       * memcpy's dependency on memmove, withdrew the original process.
-       */
-       tbz     count, #3, 1f
-       ldr1    tmp1, src, #8
-       str1    tmp1, dst, #8
-1:
-       tbz     count, #2, 2f
-       ldr1    tmp1w, src, #4
-       str1    tmp1w, dst, #4
-2:
-       tbz     count, #1, 3f
-       ldrh1   tmp1w, src, #2
-       strh1   tmp1w, dst, #2
-3:
-       tbz     count, #0, .Lexitfunc
-       ldrb1   tmp1w, src, #1
-       strb1   tmp1w, dst, #1
-
-       b       .Lexitfunc
-
-.Lcpy_over64:
-       subs    count, count, #128
-       b.ge    .Lcpy_body_large
-       /*
-       * Less than 128 bytes to copy, so handle 64 here and then jump
-       * to the tail.
-       */
-       ldp1    A_l, A_h, src, #16
-       stp1    A_l, A_h, dst, #16
-       ldp1    B_l, B_h, src, #16
-       ldp1    C_l, C_h, src, #16
-       stp1    B_l, B_h, dst, #16
-       stp1    C_l, C_h, dst, #16
-       ldp1    D_l, D_h, src, #16
-       stp1    D_l, D_h, dst, #16
-
-       tst     count, #0x3f
-       b.ne    .Ltail63
-       b       .Lexitfunc
-
-       /*
-       * Critical loop.  Start at a new cache line boundary.  Assuming
-       * 64 bytes per line this ensures the entire loop is in one line.
-       */
-.Lcpy_body_large:
-       /* pre-get 64 bytes data. */
-       ldp1    A_l, A_h, src, #16
-       ldp1    B_l, B_h, src, #16
-       ldp1    C_l, C_h, src, #16
-       ldp1    D_l, D_h, src, #16
-1:
-       /*
-       * interlace the load of next 64 bytes data block with store of the last
-       * loaded 64 bytes data.
-       */
-       stp1    A_l, A_h, dst, #16
-       ldp1    A_l, A_h, src, #16
-       stp1    B_l, B_h, dst, #16
-       ldp1    B_l, B_h, src, #16
-       stp1    C_l, C_h, dst, #16
-       ldp1    C_l, C_h, src, #16
-       stp1    D_l, D_h, dst, #16
-       ldp1    D_l, D_h, src, #16
-       subs    count, count, #64
-       b.ge    1b
-       stp1    A_l, A_h, dst, #16
-       stp1    B_l, B_h, dst, #16
-       stp1    C_l, C_h, dst, #16
-       stp1    D_l, D_h, dst, #16
-
-       tst     count, #0x3f
-       b.ne    .Ltail63
-.Lexitfunc:
diff --git a/arch/arm/lib64/memcpy.S b/arch/arm/lib64/memcpy.S
index 92845b25a6..98b453d3fd 100644
--- a/arch/arm/lib64/memcpy.S
+++ b/arch/arm/lib64/memcpy.S
@@ -1,63 +1,249 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
-/* SPDX-FileCopyrightText: 2013 Linaro */
-
 /*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * and re-licensed under GPLv2 for the Linux kernel. The original code can
- * be found @
+ * Copyright (c) 2012-2021, Arm Limited.
  *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-/*
- * Copy a buffer from src to dest (alignment handled by the hardware)
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
  *
- * Parameters:
- *     x0 - dest
- *     x1 - src
- *     x2 - n
- * Returns:
- *     x0 - dest
  */
-       .macro ldrb1 ptr, regB, val
-       ldrb  \ptr, [\regB], \val
-       .endm
 
-       .macro strb1 ptr, regB, val
-       strb \ptr, [\regB], \val
-       .endm
+#define L(label) .L ## label
+
+#define dstin  x0
+#define src    x1
+#define count  x2
+#define dst    x3
+#define srcend x4
+#define dstend x5
+#define A_l    x6
+#define A_lw   w6
+#define A_h    x7
+#define B_l    x8
+#define B_lw   w8
+#define B_h    x9
+#define C_l    x10
+#define C_lw   w10
+#define C_h    x11
+#define D_l    x12
+#define D_h    x13
+#define E_l    x14
+#define E_h    x15
+#define F_l    x16
+#define F_h    x17
+#define G_l    count
+#define G_h    dst
+#define H_l    src
+#define H_h    srcend
+#define tmp1   x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+SYM_FUNC_START(__pi_memcpy)
+       add     srcend, src, count
+       add     dstend, dstin, count
+       cmp     count, 128
+       b.hi    L(copy_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)
+
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
+       ldp     A_l, A_h, [src]
+       ldp     D_l, D_h, [srcend, -16]
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       /* Copy 8-15 bytes.  */
+L(copy16):
+       tbz     count, 3, L(copy8)
+       ldr     A_l, [src]
+       ldr     A_h, [srcend, -8]
+       str     A_l, [dstin]
+       str     A_h, [dstend, -8]
+       ret
+
+       .p2align 3
+       /* Copy 4-7 bytes.  */
+L(copy8):
+       tbz     count, 2, L(copy4)
+       ldr     A_lw, [src]
+       ldr     B_lw, [srcend, -4]
+       str     A_lw, [dstin]
+       str     B_lw, [dstend, -4]
+       ret
 
-       .macro ldrh1 ptr, regB, val
-       ldrh  \ptr, [\regB], \val
-       .endm
+       /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+       cbz     count, L(copy0)
+       lsr     tmp1, count, 1
+       ldrb    A_lw, [src]
+       ldrb    C_lw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    C_lw, [dstend, -1]
+L(copy0):
+       ret
 
-       .macro strh1 ptr, regB, val
-       strh \ptr, [\regB], \val
-       .endm
+       .p2align 4
+       /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+       ldp     A_l, A_h, [src]
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       ldp     D_l, D_h, [srcend, -16]
+       cmp     count, 64
+       b.hi    L(copy128)
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret
 
-       .macro ldr1 ptr, regB, val
-       ldr \ptr, [\regB], \val
-       .endm
+       .p2align 4
+       /* Copy 65..128 bytes.  */
+L(copy128):
+       ldp     E_l, E_h, [src, 32]
+       ldp     F_l, F_h, [src, 48]
+       cmp     count, 96
+       b.ls    L(copy96)
+       ldp     G_l, G_h, [srcend, -64]
+       ldp     H_l, H_h, [srcend, -48]
+       stp     G_l, G_h, [dstend, -64]
+       stp     H_l, H_h, [dstend, -48]
+L(copy96):
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     E_l, E_h, [dstin, 32]
+       stp     F_l, F_h, [dstin, 48]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret
 
-       .macro str1 ptr, regB, val
-       str \ptr, [\regB], \val
-       .endm
+       .p2align 4
+       /* Copy more than 128 bytes.  */
+L(copy_long):
+       /* Use backwards copy if there is an overlap.  */
+       sub     tmp1, dstin, src
+       cbz     tmp1, L(copy0)
+       cmp     tmp1, count
+       b.lo    L(copy_long_backwards)
 
-       .macro ldp1 ptr, regB, regC, val
-       ldp \ptr, \regB, [\regC], \val
-       .endm
+       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
 
-       .macro stp1 ptr, regB, regC, val
-       stp \ptr, \regB, [\regC], \val
-       .endm
+       ldp     D_l, D_h, [src]
+       and     tmp1, dstin, 15
+       bic     dst, dstin, 15
+       sub     src, src, tmp1
+       add     count, count, tmp1      /* Count is now 16 too large.  */
+       ldp     A_l, A_h, [src, 16]
+       stp     D_l, D_h, [dstin]
+       ldp     B_l, B_h, [src, 32]
+       ldp     C_l, C_h, [src, 48]
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 128 + 16  /* Test and readjust count.  */
+       b.ls    L(copy64_from_end)
 
-       .weak __arch_memcpy
-ENTRY(__arch_memcpy)
-#include "copy_template.S"
+L(loop64):
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [src, 16]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [src, 32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [src, 48]
+       stp     D_l, D_h, [dst, 64]!
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 64
+       b.hi    L(loop64)
+
+       /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+       ldp     E_l, E_h, [srcend, -64]
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [srcend, -48]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [srcend, -16]
+       stp     D_l, D_h, [dst, 64]
+       stp     E_l, E_h, [dstend, -64]
+       stp     A_l, A_h, [dstend, -48]
+       stp     B_l, B_h, [dstend, -32]
+       stp     C_l, C_h, [dstend, -16]
        ret
-ENDPROC(__arch_memcpy)
+
+       .p2align 4
+
+       /* Large backwards copy for overlapping copies.
+          Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+       ldp     D_l, D_h, [srcend, -16]
+       and     tmp1, dstend, 15
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       ldp     A_l, A_h, [srcend, -16]
+       stp     D_l, D_h, [dstend, -16]
+       ldp     B_l, B_h, [srcend, -32]
+       ldp     C_l, C_h, [srcend, -48]
+       ldp     D_l, D_h, [srcend, -64]!
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [srcend, -16]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [srcend, -48]
+       stp     D_l, D_h, [dstend, -64]!
+       ldp     D_l, D_h, [srcend, -64]!
+       subs    count, count, 64
+       b.hi    L(loop64_backwards)
+
+       /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+       ldp     G_l, G_h, [src, 48]
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [src, 32]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [src, 16]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [src]
+       stp     D_l, D_h, [dstend, -64]
+       stp     G_l, G_h, [dstin, 48]
+       stp     A_l, A_h, [dstin, 32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin]
+       ret
+SYM_FUNC_END(__pi_memcpy)
+
+SYM_FUNC_ALIAS(__arch_memcpy, __pi_memcpy)
+SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+
+SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)
+
+SYM_FUNC_ALIAS(__arch_memmove, __pi_memmove)
+SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
diff --git a/arch/arm/lib64/memset.S b/arch/arm/lib64/memset.S
index ff201750f1..f059203983 100644
--- a/arch/arm/lib64/memset.S
+++ b/arch/arm/lib64/memset.S
@@ -1,10 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
-/* SPDX-FileCopyrightText: 2013 Linaro */
-
 /*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
  * This code is based on glibc cortex strings work originally authored by Linaro
- * and re-licensed under GPLv2 for the Linux kernel. The original code can
  * be found @
  *
  * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
@@ -13,6 +12,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Fill in the buffer with character c (alignment handled by the hardware)
@@ -42,8 +42,7 @@ dst           .req    x8
 tmp3w          .req    w9
 tmp3           .req    x9
 
-       .weak memset
-ENTRY(__arch_memset)
+SYM_FUNC_START(__pi_memset)
        mov     dst, dstin      /* Preserve return value.  */
        and     A_lw, val, #255
        orr     A_lw, A_lw, A_lw, lsl #8
@@ -115,6 +114,7 @@ ENTRY(__arch_memset)
        * Critical loop. Start at a new cache line boundary. Assuming
        * 64 bytes per line, this ensures the entire loop is in one line.
        */
+       .p2align        L1_CACHE_SHIFT
 .Lnot_short:
        sub     dst, dst, #16/* Pre-bias.  */
        sub     count, count, #64
@@ -201,4 +201,8 @@ ENTRY(__arch_memset)
        ands    count, count, zva_bits_x
        b.ne    .Ltail_maybe_long
        ret
-ENDPROC(__arch_memset)
+SYM_FUNC_END(__pi_memset)
+
+SYM_FUNC_ALIAS(__arch_memset, __pi_memset)
+
+SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
diff --git a/arch/arm/lib64/string.c b/arch/arm/lib64/string.c
index 938790e1a9..c7954d6efe 100644
--- a/arch/arm/lib64/string.c
+++ b/arch/arm/lib64/string.c
@@ -6,6 +6,7 @@
 
 void *__arch_memset(void *dst, int c, __kernel_size_t size);
 void *__arch_memcpy(void * dest, const void *src, size_t count);
+void *__arch_memmove(void * dest, const void *src, size_t count);
 
 static __prereloc void *_memset(void *dst, int c, __kernel_size_t size)
 {
@@ -38,3 +39,19 @@ void __weak *memcpy(void * dest, const void *src, size_t count)
 
 void *__memcpy(void * dest, const void *src, size_t count)
        __alias(_memcpy);
+
+static void *_memmove(void * dest, const void *src, size_t count)
+{
+       if (likely(get_cr() & CR_M))
+               return __arch_memmove(dest, src, count);
+
+       return __default_memmove(dest, src, count);
+}
+
+void __weak *memmove(void * dest, const void *src, size_t count)
+{
+       return _memmove(dest, src, count);
+}
+
+void *__memmove(void * dest, const void *src, size_t count)
+       __alias(_memmove);
diff --git a/include/string.h b/include/string.h
index cbe6eddf7f..986ccd83dd 100644
--- a/include/string.h
+++ b/include/string.h
@@ -17,6 +17,8 @@ void *__nokasan_default_memset(void *, int, __kernel_size_t);
 void *__default_memcpy(void * dest,const void *src,size_t count);
 void *__nokasan_default_memcpy(void * dest,const void *src,size_t count);
 
+void *__default_memmove(void * dest,const void *src,size_t count);
+
 char *parse_assignment(char *str);
 
 int strverscmp(const char *a, const char *b);
diff --git a/lib/string.c b/lib/string.c
index 98dd3cffdd..50c2016c2b 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -701,7 +701,6 @@ void *memmove(void * dest, const void *src, size_t count)
 void *__memmove(void * dest, const void *src, size_t count)
        __alias(__default_memmove);
 #endif
-EXPORT_SYMBOL(memmove);
 
 #ifndef __HAVE_ARCH_MEMCMP
 /**

-- 
2.39.5

