Hi Sascha,

On 24-09-26, Sascha Hauer wrote:
> Until now there has been no assembler optimized version of memmove() for
> ARM. Add this from Linux-6.10 for both ARM32 and ARM64. This also updates
> memcpy() for ARM64 from Linux-6.10.

out of curiosity, did you make any performance measurements?
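
If not, something like the sketch below could give rough numbers. This is
untested and only my guess at how one would measure it: the buffer size,
iteration count and the use of barebox's get_time_ns() are my assumptions,
not anything from the patch.

	#include <common.h>
	#include <clock.h>
	#include <malloc.h>
	#include <string.h>

	/*
	 * Untested sketch: sizes, iteration count and get_time_ns()
	 * usage are assumptions for illustration only.
	 */
	static void bench_memmove(size_t len, int iters)
	{
		char *buf = malloc(len + 1);
		u64 start;
		int i;

		if (!buf)
			return;

		start = get_time_ns();
		for (i = 0; i < iters; i++)
			/* dst > src and overlapping: exercises the backward copy */
			memmove(buf + 1, buf, len);
		printf("memmove %zu bytes, %d iterations: %llu ns\n",
		       len, iters, get_time_ns() - start);

		free(buf);
	}

Comparing the numbers with and without CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS
would show the gain directly.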

Regards,
  Marco

> Reviewed-by: Ahmad Fatoum <a.fat...@pengutronix.de>
> Signed-off-by: Sascha Hauer <s.ha...@pengutronix.de>
> ---
>  arch/arm/include/asm/cache.h   |   8 ++
>  arch/arm/include/asm/string.h  |   4 +-
>  arch/arm/lib32/Makefile        |   1 +
>  arch/arm/lib32/memmove.S       | 206 +++++++++++++++++++++++++++++++
>  arch/arm/lib64/copy_template.S | 180 ---------------------------
>  arch/arm/lib64/memcpy.S        | 274 ++++++++++++++++++++++++++++++++++-------
>  arch/arm/lib64/memset.S        |  18 +--
>  arch/arm/lib64/string.c        |  17 +++
>  include/string.h               |   2 +
>  lib/string.c                   |   1 -
>  10 files changed, 478 insertions(+), 233 deletions(-)
> 
> diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
> index 261c30129a..dd022c1f23 100644
> --- a/arch/arm/include/asm/cache.h
> +++ b/arch/arm/include/asm/cache.h
> @@ -3,6 +3,13 @@
>  #ifndef __ASM_CACHE_H
>  #define __ASM_CACHE_H
>  
> +#ifdef CONFIG_CPU_64
> +#define L1_CACHE_SHIFT               (6)
> +#define L1_CACHE_BYTES               (1 << L1_CACHE_SHIFT)
> +#endif
> +
> +#ifndef __ASSEMBLY__
> +
>  void v8_invalidate_icache_all(void);
>  void v8_flush_dcache_all(void);
>  void v8_invalidate_dcache_all(void);
> @@ -25,5 +32,6 @@ void arm_early_mmu_cache_invalidate(void);
>  void sync_caches_for_execution(void);
>  
>  #include <asm-generic/cache.h>
> +#endif
>  
>  #endif
> diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
> index 2322b846b2..f79392e53d 100644
> --- a/arch/arm/include/asm/string.h
> +++ b/arch/arm/include/asm/string.h
> @@ -9,10 +9,12 @@
>  extern void *memcpy(void *, const void *, __kernel_size_t);
>  #define __HAVE_ARCH_MEMSET
>  extern void *memset(void *, int, __kernel_size_t);
> -
> +#define __HAVE_ARCH_MEMMOVE
> +extern void *memmove(void *, const void *, __kernel_size_t);
>  #endif
>  
>  extern void *__memcpy(void *, const void *, __kernel_size_t);
>  extern void *__memset(void *, int, __kernel_size_t);
> +extern void *__memmove(void *, const void *, __kernel_size_t);
>  
>  #endif
> diff --git a/arch/arm/lib32/Makefile b/arch/arm/lib32/Makefile
> index 511a029062..a139a80fb8 100644
> --- a/arch/arm/lib32/Makefile
> +++ b/arch/arm/lib32/Makefile
> @@ -21,6 +21,7 @@ obj-y       += lshrdi3.o
>  obj-y        += runtime-offset.o
>  pbl-y        += runtime-offset.o
>  obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS)  += memcpy.o
> +obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS)  += memmove.o
>  obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS)  += memset.o
>  obj-$(CONFIG_ARM_UNWIND) += unwind.o
>  obj-$(CONFIG_MODULES) += module.o
> diff --git a/arch/arm/lib32/memmove.S b/arch/arm/lib32/memmove.S
> new file mode 100644
> index 0000000000..6410554039
> --- /dev/null
> +++ b/arch/arm/lib32/memmove.S
> @@ -0,0 +1,206 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + *  linux/arch/arm/lib/memmove.S
> + *
> + *  Author:  Nicolas Pitre
> + *  Created: Sep 28, 2005
> + *  Copyright:       (C) MontaVista Software Inc.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +#include <asm/unwind.h>
> +
> +             .text
> +
> +/*
> + * Prototype: void *memmove(void *dest, const void *src, size_t n);
> + *
> + * Note:
> + *
> + * If the memory regions don't overlap, we simply branch to memcpy which is
> + * normally a bit faster. Otherwise the copy is done going downwards.  This
> + * is a transposition of the code from copy_template.S but with the copy
> + * occurring in the opposite direction.
> + */
> +
> +ENTRY(__memmove)
> +WEAK(memmove)
> +     UNWIND( .fnstart                        )
> +
> +             subs    ip, r0, r1
> +             cmphi   r2, ip
> +             bls     __memcpy
> +     UNWIND( .fnend                          )
> +
> +     UNWIND( .fnstart                        )
> +     UNWIND( .save   {r0, r4, fpreg, lr}     )
> +             stmfd   sp!, {r0, r4, UNWIND(fpreg,) lr}
> +     UNWIND( .setfp  fpreg, sp               )
> +     UNWIND( mov     fpreg, sp               )
> +             add     r1, r1, r2
> +             add     r0, r0, r2
> +             subs    r2, r2, #4
> +             blt     8f
> +             ands    ip, r0, #3
> +     PLD(    pld     [r1, #-4]               )
> +             bne     9f
> +             ands    ip, r1, #3
> +             bne     10f
> +
> +1:           subs    r2, r2, #(28)
> +             stmfd   sp!, {r5, r6, r8, r9}
> +             blt     5f
> +
> +     CALGN(  ands    ip, r0, #31             )
> +     CALGN(  sbcsne  r4, ip, r2              )  @ C is always set here
> +     CALGN(  bcs     2f                      )
> +     CALGN(  adr     r4, 6f                  )
> +     CALGN(  subs    r2, r2, ip              )  @ C is set here
> +     CALGN(  rsb     ip, ip, #32             )
> +     CALGN(  add     pc, r4, ip              )
> +
> +     PLD(    pld     [r1, #-4]               )
> +2:   PLD(    subs    r2, r2, #96             )
> +     PLD(    pld     [r1, #-32]              )
> +     PLD(    blt     4f                      )
> +     PLD(    pld     [r1, #-64]              )
> +     PLD(    pld     [r1, #-96]              )
> +
> +3:   PLD(    pld     [r1, #-128]             )
> +4:           ldmdb   r1!, {r3, r4, r5, r6, r8, r9, ip, lr}
> +             subs    r2, r2, #32
> +             stmdb   r0!, {r3, r4, r5, r6, r8, r9, ip, lr}
> +             bge     3b
> +     PLD(    cmn     r2, #96                 )
> +     PLD(    bge     4b                      )
> +
> +5:           ands    ip, r2, #28
> +             rsb     ip, ip, #32
> +             addne   pc, pc, ip              @ C is always clear here
> +             b       7f
> +6:           W(nop)
> +             W(ldr)  r3, [r1, #-4]!
> +             W(ldr)  r4, [r1, #-4]!
> +             W(ldr)  r5, [r1, #-4]!
> +             W(ldr)  r6, [r1, #-4]!
> +             W(ldr)  r8, [r1, #-4]!
> +             W(ldr)  r9, [r1, #-4]!
> +             W(ldr)  lr, [r1, #-4]!
> +
> +             add     pc, pc, ip
> +             nop
> +             W(nop)
> +             W(str)  r3, [r0, #-4]!
> +             W(str)  r4, [r0, #-4]!
> +             W(str)  r5, [r0, #-4]!
> +             W(str)  r6, [r0, #-4]!
> +             W(str)  r8, [r0, #-4]!
> +             W(str)  r9, [r0, #-4]!
> +             W(str)  lr, [r0, #-4]!
> +
> +     CALGN(  bcs     2b                      )
> +
> +7:           ldmfd   sp!, {r5, r6, r8, r9}
> +
> +8:           movs    r2, r2, lsl #31
> +             ldrbne  r3, [r1, #-1]!
> +             ldrbcs  r4, [r1, #-1]!
> +             ldrbcs  ip, [r1, #-1]
> +             strbne  r3, [r0, #-1]!
> +             strbcs  r4, [r0, #-1]!
> +             strbcs  ip, [r0, #-1]
> +             ldmfd   sp!, {r0, r4, UNWIND(fpreg,) pc}
> +
> +9:           cmp     ip, #2
> +             ldrbgt  r3, [r1, #-1]!
> +             ldrbge  r4, [r1, #-1]!
> +             ldrb    lr, [r1, #-1]!
> +             strbgt  r3, [r0, #-1]!
> +             strbge  r4, [r0, #-1]!
> +             subs    r2, r2, ip
> +             strb    lr, [r0, #-1]!
> +             blt     8b
> +             ands    ip, r1, #3
> +             beq     1b
> +
> +10:          bic     r1, r1, #3
> +             cmp     ip, #2
> +             ldr     r3, [r1, #0]
> +             beq     17f
> +             blt     18f
> +
> +
> +             .macro  backward_copy_shift push pull
> +
> +             subs    r2, r2, #28
> +             blt     14f
> +
> +     CALGN(  ands    ip, r0, #31             )
> +     CALGN(  sbcsne  r4, ip, r2              )  @ C is always set here
> +     CALGN(  subcc   r2, r2, ip              )
> +     CALGN(  bcc     15f                     )
> +
> +11:          stmfd   sp!, {r5, r6, r8 - r10}
> +
> +     PLD(    pld     [r1, #-4]               )
> +     PLD(    subs    r2, r2, #96             )
> +     PLD(    pld     [r1, #-32]              )
> +     PLD(    blt     13f                     )
> +     PLD(    pld     [r1, #-64]              )
> +     PLD(    pld     [r1, #-96]              )
> +
> +12:  PLD(    pld     [r1, #-128]             )
> +13:          ldmdb   r1!, {r8, r9, r10, ip}
> +             mov     lr, r3, lspush #\push
> +             subs    r2, r2, #32
> +             ldmdb   r1!, {r3, r4, r5, r6}
> +             orr     lr, lr, ip, lspull #\pull
> +             mov     ip, ip, lspush #\push
> +             orr     ip, ip, r10, lspull #\pull
> +             mov     r10, r10, lspush #\push
> +             orr     r10, r10, r9, lspull #\pull
> +             mov     r9, r9, lspush #\push
> +             orr     r9, r9, r8, lspull #\pull
> +             mov     r8, r8, lspush #\push
> +             orr     r8, r8, r6, lspull #\pull
> +             mov     r6, r6, lspush #\push
> +             orr     r6, r6, r5, lspull #\pull
> +             mov     r5, r5, lspush #\push
> +             orr     r5, r5, r4, lspull #\pull
> +             mov     r4, r4, lspush #\push
> +             orr     r4, r4, r3, lspull #\pull
> +             stmdb   r0!, {r4 - r6, r8 - r10, ip, lr}
> +             bge     12b
> +     PLD(    cmn     r2, #96                 )
> +     PLD(    bge     13b                     )
> +
> +             ldmfd   sp!, {r5, r6, r8 - r10}
> +
> +14:          ands    ip, r2, #28
> +             beq     16f
> +
> +15:          mov     lr, r3, lspush #\push
> +             ldr     r3, [r1, #-4]!
> +             subs    ip, ip, #4
> +             orr     lr, lr, r3, lspull #\pull
> +             str     lr, [r0, #-4]!
> +             bgt     15b
> +     CALGN(  cmp     r2, #0                  )
> +     CALGN(  bge     11b                     )
> +
> +16:          add     r1, r1, #(\pull / 8)
> +             b       8b
> +
> +             .endm
> +
> +
> +             backward_copy_shift     push=8  pull=24
> +
> +17:          backward_copy_shift     push=16 pull=16
> +
> +18:          backward_copy_shift     push=24 pull=8
> +
> +     UNWIND( .fnend                          )
> +ENDPROC(memmove)
> +ENDPROC(__memmove)
> diff --git a/arch/arm/lib64/copy_template.S b/arch/arm/lib64/copy_template.S
> deleted file mode 100644
> index 8e4ff059d1..0000000000
> --- a/arch/arm/lib64/copy_template.S
> +++ /dev/null
> @@ -1,180 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
> -/*
> - * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> - * be found @
> - *
> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> - * files/head:/src/aarch64/
> - */
> -
> -/*
> - * Copy a buffer from src to dest (alignment handled by the hardware)
> - *
> - * Parameters:
> - *   x0 - dest
> - *   x1 - src
> - *   x2 - n
> - * Returns:
> - *   x0 - dest
> - */
> -dstin        .req    x0
> -src  .req    x1
> -count        .req    x2
> -tmp1 .req    x3
> -tmp1w        .req    w3
> -tmp2 .req    x4
> -tmp2w        .req    w4
> -dst  .req    x6
> -
> -A_l  .req    x7
> -A_h  .req    x8
> -B_l  .req    x9
> -B_h  .req    x10
> -C_l  .req    x11
> -C_h  .req    x12
> -D_l  .req    x13
> -D_h  .req    x14
> -
> -     mov     dst, dstin
> -     cmp     count, #16
> -     /*When memory length is less than 16, the accessed are not aligned.*/
> -     b.lo    .Ltiny15
> -
> -     neg     tmp2, src
> -     ands    tmp2, tmp2, #15/* Bytes to reach alignment. */
> -     b.eq    .LSrcAligned
> -     sub     count, count, tmp2
> -     /*
> -     * Copy the leading memory data from src to dst in an increasing
> -     * address order.By this way,the risk of overwritting the source
> -     * memory data is eliminated when the distance between src and
> -     * dst is less than 16. The memory accesses here are alignment.
> -     */
> -     tbz     tmp2, #0, 1f
> -     ldrb1   tmp1w, src, #1
> -     strb1   tmp1w, dst, #1
> -1:
> -     tbz     tmp2, #1, 2f
> -     ldrh1   tmp1w, src, #2
> -     strh1   tmp1w, dst, #2
> -2:
> -     tbz     tmp2, #2, 3f
> -     ldr1    tmp1w, src, #4
> -     str1    tmp1w, dst, #4
> -3:
> -     tbz     tmp2, #3, .LSrcAligned
> -     ldr1    tmp1, src, #8
> -     str1    tmp1, dst, #8
> -
> -.LSrcAligned:
> -     cmp     count, #64
> -     b.ge    .Lcpy_over64
> -     /*
> -     * Deal with small copies quickly by dropping straight into the
> -     * exit block.
> -     */
> -.Ltail63:
> -     /*
> -     * Copy up to 48 bytes of data. At this point we only need the
> -     * bottom 6 bits of count to be accurate.
> -     */
> -     ands    tmp1, count, #0x30
> -     b.eq    .Ltiny15
> -     cmp     tmp1w, #0x20
> -     b.eq    1f
> -     b.lt    2f
> -     ldp1    A_l, A_h, src, #16
> -     stp1    A_l, A_h, dst, #16
> -1:
> -     ldp1    A_l, A_h, src, #16
> -     stp1    A_l, A_h, dst, #16
> -2:
> -     ldp1    A_l, A_h, src, #16
> -     stp1    A_l, A_h, dst, #16
> -.Ltiny15:
> -     /*
> -     * Prefer to break one ldp/stp into several load/store to access
> -     * memory in an increasing address order,rather than to load/store 16
> -     * bytes from (src-16) to (dst-16) and to backward the src to aligned
> -     * address,which way is used in original cortex memcpy. If keeping
> -     * the original memcpy process here, memmove need to satisfy the
> -     * precondition that src address is at least 16 bytes bigger than dst
> -     * address,otherwise some source data will be overwritten when memove
> -     * call memcpy directly. To make memmove simpler and decouple the
> -     * memcpy's dependency on memmove, withdrew the original process.
> -     */
> -     tbz     count, #3, 1f
> -     ldr1    tmp1, src, #8
> -     str1    tmp1, dst, #8
> -1:
> -     tbz     count, #2, 2f
> -     ldr1    tmp1w, src, #4
> -     str1    tmp1w, dst, #4
> -2:
> -     tbz     count, #1, 3f
> -     ldrh1   tmp1w, src, #2
> -     strh1   tmp1w, dst, #2
> -3:
> -     tbz     count, #0, .Lexitfunc
> -     ldrb1   tmp1w, src, #1
> -     strb1   tmp1w, dst, #1
> -
> -     b       .Lexitfunc
> -
> -.Lcpy_over64:
> -     subs    count, count, #128
> -     b.ge    .Lcpy_body_large
> -     /*
> -     * Less than 128 bytes to copy, so handle 64 here and then jump
> -     * to the tail.
> -     */
> -     ldp1    A_l, A_h, src, #16
> -     stp1    A_l, A_h, dst, #16
> -     ldp1    B_l, B_h, src, #16
> -     ldp1    C_l, C_h, src, #16
> -     stp1    B_l, B_h, dst, #16
> -     stp1    C_l, C_h, dst, #16
> -     ldp1    D_l, D_h, src, #16
> -     stp1    D_l, D_h, dst, #16
> -
> -     tst     count, #0x3f
> -     b.ne    .Ltail63
> -     b       .Lexitfunc
> -
> -     /*
> -     * Critical loop.  Start at a new cache line boundary.  Assuming
> -     * 64 bytes per line this ensures the entire loop is in one line.
> -     */
> -.Lcpy_body_large:
> -     /* pre-get 64 bytes data. */
> -     ldp1    A_l, A_h, src, #16
> -     ldp1    B_l, B_h, src, #16
> -     ldp1    C_l, C_h, src, #16
> -     ldp1    D_l, D_h, src, #16
> -1:
> -     /*
> -     * interlace the load of next 64 bytes data block with store of the last
> -     * loaded 64 bytes data.
> -     */
> -     stp1    A_l, A_h, dst, #16
> -     ldp1    A_l, A_h, src, #16
> -     stp1    B_l, B_h, dst, #16
> -     ldp1    B_l, B_h, src, #16
> -     stp1    C_l, C_h, dst, #16
> -     ldp1    C_l, C_h, src, #16
> -     stp1    D_l, D_h, dst, #16
> -     ldp1    D_l, D_h, src, #16
> -     subs    count, count, #64
> -     b.ge    1b
> -     stp1    A_l, A_h, dst, #16
> -     stp1    B_l, B_h, dst, #16
> -     stp1    C_l, C_h, dst, #16
> -     stp1    D_l, D_h, dst, #16
> -
> -     tst     count, #0x3f
> -     b.ne    .Ltail63
> -.Lexitfunc:
> diff --git a/arch/arm/lib64/memcpy.S b/arch/arm/lib64/memcpy.S
> index 92845b25a6..98b453d3fd 100644
> --- a/arch/arm/lib64/memcpy.S
> +++ b/arch/arm/lib64/memcpy.S
> @@ -1,63 +1,249 @@
>  /* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
>  /*
> - * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> - * be found @
> + * Copyright (c) 2012-2021, Arm Limited.
>   *
> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> - * files/head:/src/aarch64/
> + * Adapted from the original at:
> + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
>   */
>  
>  #include <linux/linkage.h>
>  #include <asm/assembler.h>
>  
> -/*
> - * Copy a buffer from src to dest (alignment handled by the hardware)
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, unaligned accesses.
>   *
> - * Parameters:
> - *   x0 - dest
> - *   x1 - src
> - *   x2 - n
> - * Returns:
> - *   x0 - dest
>   */
> -     .macro ldrb1 ptr, regB, val
> -     ldrb  \ptr, [\regB], \val
> -     .endm
>  
> -     .macro strb1 ptr, regB, val
> -     strb \ptr, [\regB], \val
> -     .endm
> +#define L(label) .L ## label
> +
> +#define dstin        x0
> +#define src  x1
> +#define count        x2
> +#define dst  x3
> +#define srcend       x4
> +#define dstend       x5
> +#define A_l  x6
> +#define A_lw w6
> +#define A_h  x7
> +#define B_l  x8
> +#define B_lw w8
> +#define B_h  x9
> +#define C_l  x10
> +#define C_lw w10
> +#define C_h  x11
> +#define D_l  x12
> +#define D_h  x13
> +#define E_l  x14
> +#define E_h  x15
> +#define F_l  x16
> +#define F_h  x17
> +#define G_l  count
> +#define G_h  dst
> +#define H_l  src
> +#define H_h  srcend
> +#define tmp1 x14
> +
> +/* This implementation handles overlaps and supports both memcpy and memmove
> +   from a single entry point.  It uses unaligned accesses and branchless
> +   sequences to keep the code small, simple and improve performance.
> +
> +   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> +   copies of up to 128 bytes, and large copies.  The overhead of the overlap
> +   check is negligible since it is only required for large copies.
> +
> +   Large copies use a software pipelined loop processing 64 bytes per iteration.
> +   The destination pointer is 16-byte aligned to minimize unaligned accesses.
> +   The loop tail is handled by always copying 64 bytes from the end.
> +*/
> +
> +SYM_FUNC_START(__pi_memcpy)
> +     add     srcend, src, count
> +     add     dstend, dstin, count
> +     cmp     count, 128
> +     b.hi    L(copy_long)
> +     cmp     count, 32
> +     b.hi    L(copy32_128)
> +
> +     /* Small copies: 0..32 bytes.  */
> +     cmp     count, 16
> +     b.lo    L(copy16)
> +     ldp     A_l, A_h, [src]
> +     ldp     D_l, D_h, [srcend, -16]
> +     stp     A_l, A_h, [dstin]
> +     stp     D_l, D_h, [dstend, -16]
> +     ret
> +
> +     /* Copy 8-15 bytes.  */
> +L(copy16):
> +     tbz     count, 3, L(copy8)
> +     ldr     A_l, [src]
> +     ldr     A_h, [srcend, -8]
> +     str     A_l, [dstin]
> +     str     A_h, [dstend, -8]
> +     ret
> +
> +     .p2align 3
> +     /* Copy 4-7 bytes.  */
> +L(copy8):
> +     tbz     count, 2, L(copy4)
> +     ldr     A_lw, [src]
> +     ldr     B_lw, [srcend, -4]
> +     str     A_lw, [dstin]
> +     str     B_lw, [dstend, -4]
> +     ret
>  
> -     .macro ldrh1 ptr, regB, val
> -     ldrh  \ptr, [\regB], \val
> -     .endm
> +     /* Copy 0..3 bytes using a branchless sequence.  */
> +L(copy4):
> +     cbz     count, L(copy0)
> +     lsr     tmp1, count, 1
> +     ldrb    A_lw, [src]
> +     ldrb    C_lw, [srcend, -1]
> +     ldrb    B_lw, [src, tmp1]
> +     strb    A_lw, [dstin]
> +     strb    B_lw, [dstin, tmp1]
> +     strb    C_lw, [dstend, -1]
> +L(copy0):
> +     ret
>  
> -     .macro strh1 ptr, regB, val
> -     strh \ptr, [\regB], \val
> -     .endm
> +     .p2align 4
> +     /* Medium copies: 33..128 bytes.  */
> +L(copy32_128):
> +     ldp     A_l, A_h, [src]
> +     ldp     B_l, B_h, [src, 16]
> +     ldp     C_l, C_h, [srcend, -32]
> +     ldp     D_l, D_h, [srcend, -16]
> +     cmp     count, 64
> +     b.hi    L(copy128)
> +     stp     A_l, A_h, [dstin]
> +     stp     B_l, B_h, [dstin, 16]
> +     stp     C_l, C_h, [dstend, -32]
> +     stp     D_l, D_h, [dstend, -16]
> +     ret
>  
> -     .macro ldr1 ptr, regB, val
> -     ldr \ptr, [\regB], \val
> -     .endm
> +     .p2align 4
> +     /* Copy 65..128 bytes.  */
> +L(copy128):
> +     ldp     E_l, E_h, [src, 32]
> +     ldp     F_l, F_h, [src, 48]
> +     cmp     count, 96
> +     b.ls    L(copy96)
> +     ldp     G_l, G_h, [srcend, -64]
> +     ldp     H_l, H_h, [srcend, -48]
> +     stp     G_l, G_h, [dstend, -64]
> +     stp     H_l, H_h, [dstend, -48]
> +L(copy96):
> +     stp     A_l, A_h, [dstin]
> +     stp     B_l, B_h, [dstin, 16]
> +     stp     E_l, E_h, [dstin, 32]
> +     stp     F_l, F_h, [dstin, 48]
> +     stp     C_l, C_h, [dstend, -32]
> +     stp     D_l, D_h, [dstend, -16]
> +     ret
>  
> -     .macro str1 ptr, regB, val
> -     str \ptr, [\regB], \val
> -     .endm
> +     .p2align 4
> +     /* Copy more than 128 bytes.  */
> +L(copy_long):
> +     /* Use backwards copy if there is an overlap.  */
> +     sub     tmp1, dstin, src
> +     cbz     tmp1, L(copy0)
> +     cmp     tmp1, count
> +     b.lo    L(copy_long_backwards)
>  
> -     .macro ldp1 ptr, regB, regC, val
> -     ldp \ptr, \regB, [\regC], \val
> -     .endm
> +     /* Copy 16 bytes and then align dst to 16-byte alignment.  */
>  
> -     .macro stp1 ptr, regB, regC, val
> -     stp \ptr, \regB, [\regC], \val
> -     .endm
> +     ldp     D_l, D_h, [src]
> +     and     tmp1, dstin, 15
> +     bic     dst, dstin, 15
> +     sub     src, src, tmp1
> +     add     count, count, tmp1      /* Count is now 16 too large.  */
> +     ldp     A_l, A_h, [src, 16]
> +     stp     D_l, D_h, [dstin]
> +     ldp     B_l, B_h, [src, 32]
> +     ldp     C_l, C_h, [src, 48]
> +     ldp     D_l, D_h, [src, 64]!
> +     subs    count, count, 128 + 16  /* Test and readjust count.  */
> +     b.ls    L(copy64_from_end)
>  
> -     .weak __arch_memcpy
> -ENTRY(__arch_memcpy)
> -#include "copy_template.S"
> +L(loop64):
> +     stp     A_l, A_h, [dst, 16]
> +     ldp     A_l, A_h, [src, 16]
> +     stp     B_l, B_h, [dst, 32]
> +     ldp     B_l, B_h, [src, 32]
> +     stp     C_l, C_h, [dst, 48]
> +     ldp     C_l, C_h, [src, 48]
> +     stp     D_l, D_h, [dst, 64]!
> +     ldp     D_l, D_h, [src, 64]!
> +     subs    count, count, 64
> +     b.hi    L(loop64)
> +
> +     /* Write the last iteration and copy 64 bytes from the end.  */
> +L(copy64_from_end):
> +     ldp     E_l, E_h, [srcend, -64]
> +     stp     A_l, A_h, [dst, 16]
> +     ldp     A_l, A_h, [srcend, -48]
> +     stp     B_l, B_h, [dst, 32]
> +     ldp     B_l, B_h, [srcend, -32]
> +     stp     C_l, C_h, [dst, 48]
> +     ldp     C_l, C_h, [srcend, -16]
> +     stp     D_l, D_h, [dst, 64]
> +     stp     E_l, E_h, [dstend, -64]
> +     stp     A_l, A_h, [dstend, -48]
> +     stp     B_l, B_h, [dstend, -32]
> +     stp     C_l, C_h, [dstend, -16]
>       ret
> -ENDPROC(__arch_memcpy)
> +
> +     .p2align 4
> +
> +     /* Large backwards copy for overlapping copies.
> +        Copy 16 bytes and then align dst to 16-byte alignment.  */
> +L(copy_long_backwards):
> +     ldp     D_l, D_h, [srcend, -16]
> +     and     tmp1, dstend, 15
> +     sub     srcend, srcend, tmp1
> +     sub     count, count, tmp1
> +     ldp     A_l, A_h, [srcend, -16]
> +     stp     D_l, D_h, [dstend, -16]
> +     ldp     B_l, B_h, [srcend, -32]
> +     ldp     C_l, C_h, [srcend, -48]
> +     ldp     D_l, D_h, [srcend, -64]!
> +     sub     dstend, dstend, tmp1
> +     subs    count, count, 128
> +     b.ls    L(copy64_from_start)
> +
> +L(loop64_backwards):
> +     stp     A_l, A_h, [dstend, -16]
> +     ldp     A_l, A_h, [srcend, -16]
> +     stp     B_l, B_h, [dstend, -32]
> +     ldp     B_l, B_h, [srcend, -32]
> +     stp     C_l, C_h, [dstend, -48]
> +     ldp     C_l, C_h, [srcend, -48]
> +     stp     D_l, D_h, [dstend, -64]!
> +     ldp     D_l, D_h, [srcend, -64]!
> +     subs    count, count, 64
> +     b.hi    L(loop64_backwards)
> +
> +     /* Write the last iteration and copy 64 bytes from the start.  */
> +L(copy64_from_start):
> +     ldp     G_l, G_h, [src, 48]
> +     stp     A_l, A_h, [dstend, -16]
> +     ldp     A_l, A_h, [src, 32]
> +     stp     B_l, B_h, [dstend, -32]
> +     ldp     B_l, B_h, [src, 16]
> +     stp     C_l, C_h, [dstend, -48]
> +     ldp     C_l, C_h, [src]
> +     stp     D_l, D_h, [dstend, -64]
> +     stp     G_l, G_h, [dstin, 48]
> +     stp     A_l, A_h, [dstin, 32]
> +     stp     B_l, B_h, [dstin, 16]
> +     stp     C_l, C_h, [dstin]
> +     ret
> +SYM_FUNC_END(__pi_memcpy)
> +
> +SYM_FUNC_ALIAS(__arch_memcpy, __pi_memcpy)
> +SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
> +
> +SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)
> +
> +SYM_FUNC_ALIAS(__arch_memmove, __pi_memmove)
> +SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
> diff --git a/arch/arm/lib64/memset.S b/arch/arm/lib64/memset.S
> index ff201750f1..f059203983 100644
> --- a/arch/arm/lib64/memset.S
> +++ b/arch/arm/lib64/memset.S
> @@ -1,10 +1,9 @@
>  /* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
>  /*
> + * Copyright (C) 2013 ARM Ltd.
> + * Copyright (C) 2013 Linaro.
> + *
>   * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
>   * be found @
>   *
>   * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> @@ -13,6 +12,7 @@
>  
>  #include <linux/linkage.h>
>  #include <asm/assembler.h>
> +#include <asm/cache.h>
>  
>  /*
>   * Fill in the buffer with character c (alignment handled by the hardware)
> @@ -42,8 +42,7 @@ dst         .req    x8
>  tmp3w                .req    w9
>  tmp3         .req    x9
>  
> -     .weak memset
> -ENTRY(__arch_memset)
> +SYM_FUNC_START(__pi_memset)
>       mov     dst, dstin      /* Preserve return value.  */
>       and     A_lw, val, #255
>       orr     A_lw, A_lw, A_lw, lsl #8
> @@ -115,6 +114,7 @@ ENTRY(__arch_memset)
>       * Critical loop. Start at a new cache line boundary. Assuming
>       * 64 bytes per line, this ensures the entire loop is in one line.
>       */
> +     .p2align        L1_CACHE_SHIFT
>  .Lnot_short:
>       sub     dst, dst, #16/* Pre-bias.  */
>       sub     count, count, #64
> @@ -201,4 +201,8 @@ ENTRY(__arch_memset)
>       ands    count, count, zva_bits_x
>       b.ne    .Ltail_maybe_long
>       ret
> -ENDPROC(__arch_memset)
> +SYM_FUNC_END(__pi_memset)
> +
> +SYM_FUNC_ALIAS(__arch_memset, __pi_memset)
> +
> +SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
> diff --git a/arch/arm/lib64/string.c b/arch/arm/lib64/string.c
> index 938790e1a9..c7954d6efe 100644
> --- a/arch/arm/lib64/string.c
> +++ b/arch/arm/lib64/string.c
> @@ -6,6 +6,7 @@
>  
>  void *__arch_memset(void *dst, int c, __kernel_size_t size);
>  void *__arch_memcpy(void * dest, const void *src, size_t count);
> +void *__arch_memmove(void * dest, const void *src, size_t count);
>  
>  static __prereloc void *_memset(void *dst, int c, __kernel_size_t size)
>  {
> @@ -38,3 +39,19 @@ void __weak *memcpy(void * dest, const void *src, size_t count)
>  
>  void *__memcpy(void * dest, const void *src, size_t count)
>       __alias(_memcpy);
> +
> +static void *_memmove(void * dest, const void *src, size_t count)
> +{
> +     if (likely(get_cr() & CR_M))
> +             return __arch_memmove(dest, src, count);
> +
> +     return __default_memmove(dest, src, count);
> +}
> +
> +void __weak *memmove(void * dest, const void *src, size_t count)
> +{
> +     return _memmove(dest, src, count);
> +}
> +
> +void *__memmove(void * dest, const void *src, size_t count)
> +     __alias(_memmove);
> diff --git a/include/string.h b/include/string.h
> index cbe6eddf7f..986ccd83dd 100644
> --- a/include/string.h
> +++ b/include/string.h
> @@ -17,6 +17,8 @@ void *__nokasan_default_memset(void *, int, __kernel_size_t);
>  void *__default_memcpy(void * dest,const void *src,size_t count);
>  void *__nokasan_default_memcpy(void * dest,const void *src,size_t count);
>  
> +void *__default_memmove(void * dest,const void *src,size_t count);
> +
>  char *parse_assignment(char *str);
>  
>  int strverscmp(const char *a, const char *b);
> diff --git a/lib/string.c b/lib/string.c
> index 98dd3cffdd..50c2016c2b 100644
> --- a/lib/string.c
> +++ b/lib/string.c
> @@ -701,7 +701,6 @@ void *memmove(void * dest, const void *src, size_t count)
>  void *__memmove(void * dest, const void *src, size_t count)
>       __alias(__default_memmove);
>  #endif
> -EXPORT_SYMBOL(memmove);
>  
>  #ifndef __HAVE_ARCH_MEMCMP
>  /**
> 
> -- 
> 2.39.5
> 
> 
> 
