(nuttx) branch master updated: arm: memcpy: add NEON paths for aligned copies

xiaoxiang Fri, 08 May 2026 18:47:02 -0700

This is an automated email from the ASF dual-hosted git repository.

xiaoxiang781216 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx.git



The following commit(s) were added to refs/heads/master by this push:
     new 131f7f75ed3 arm: memcpy: add NEON paths for aligned copies
131f7f75ed3 is described below

commit 131f7f75ed3815277ad87ac4eb05ff61b88c438c
Author: yaojiaqi <[email protected]>
AuthorDate: Wed May 6 15:51:13 2026 +0800

    arm: memcpy: add NEON paths for aligned copies
    
    Add dedicated NEON implementations for mutually aligned medium and long
    memcpy copies when building with __ARM_NEON__. These paths use NEON
    multi-register loads and stores while preserving the existing VFP
    implementation for non-NEON VFP configurations.
    
    NEON builds also define USE_VFP, so select the NEON implementation
    explicitly before falling back to VFP. Apply the same aligned-copy
    optimization to the armv7-a, armv7-r, and armv8-r implementations.
    
    Signed-off-by: yaojiaqi <[email protected]>
---
 libs/libc/machine/arm/armv7-a/arch_memcpy.S | 58 +++++++++++++++++++++++++++--
 libs/libc/machine/arm/armv7-r/arch_memcpy.S | 58 +++++++++++++++++++++++++++--
 libs/libc/machine/arm/armv8-r/arch_memcpy.S | 58 +++++++++++++++++++++++++++--
 3 files changed, 165 insertions(+), 9 deletions(-)

diff --git a/libs/libc/machine/arm/armv7-a/arch_memcpy.S b/libs/libc/machine/arm/armv7-a/arch_memcpy.S
index ea41f403cc3..e96b408e64b 100644
--- a/libs/libc/machine/arm/armv7-a/arch_memcpy.S
+++ b/libs/libc/machine/arm/armv7-a/arch_memcpy.S
@@ -260,7 +260,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned
 
-#ifdef USE_VFP
+#if defined(USE_VFP) && !defined(USE_NEON)
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
@@ -290,7 +290,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
        bge     .Lcpy_body_long
 
 .Lcpy_body_medium:                     /* Count in tmp2.  */
-#ifdef USE_VFP
+#ifdef USE_NEON
+       /* Medium aligned copy: move 64 bytes per iteration through d0-d7 using NEON
+          multi-register transfers with destination alignment hints; src and dst post-increment.  */
+1:
+       vld1.8  {d0-d3}, [src]!         /* First 32 bytes of this 64-byte chunk.  */
+       vld1.8  {d4-d7}, [src]!         /* Second 32 bytes of this 64-byte chunk.  */
+       pld     [src, #(prefetch_lines * 64)]   /* Prefetch hint relative to the already-advanced src.  */
+       subs    tmp2, tmp2, #64         /* Sets the flags consumed by bge below.  */
+       vst1.8  {d0-d3}, [ALIGN(dst, 64)]!      /* ALIGN is defined outside this hunk; presumably adds a 64-bit alignment qualifier - confirm.  */
+       vst1.8  {d4-d7}, [ALIGN(dst, 64)]!
+       bge     1b                      /* Loop while the subtraction did not go negative.  */
+       tst     tmp2, #0x3f             /* Nonzero low six bits mean a sub-64-byte tail remains.  */
+       beq     .Ldone
+
+.Ltail63aligned:                       /* Count in tmp2.  */
+       /* Use NEON 8-byte vld1/vst1 for the tail: copy the remaining whole 8-byte units.  */
+       and     tmp1, tmp2, #0x38       /* tmp1 = tail bytes in whole 8-byte units (0..56).  */
+       rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)      /* Code offset that skips the pairs not needed.  */
+       add     pc, pc, tmp1            /* Computed jump into the ladder; assumes each vld1/vst1 pair is 8 bytes of code.  */
+       vld1.8  {d0}, [src]!    /* 14 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 12 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 10 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 8 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 6 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 4 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 2 words to go.  */
+       vst1.8  {d0}, [dst]!
+#elif defined(USE_VFP)
 1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
@@ -411,7 +444,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
 
        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
-#ifdef USE_VFP
+#ifdef USE_NEON
+       /* Long aligned copy: use NEON multi-register transfers with prefetching.  The first
+          source lines are hinted up front, then 64 bytes are moved per iteration through d0-d7.  */
+       pld     [src, #0]
+       pld     [src, #64]
+       pld     [src, #128]
+       pld     [src, #192]
+       pld     [src, #256]             /* Five 64-byte lines hinted before the loop.  */
+1:
+       vld1.8  {d0-d3}, [src]!         /* First 32 bytes of this 64-byte chunk.  */
+       vld1.8  {d4-d7}, [src]!         /* Second 32 bytes of this 64-byte chunk.  */
+       pld     [src, #(prefetch_lines * 64)]   /* NOTE(review): src has already advanced by 64 here, so with prefetch_lines == 5 (defined elsewhere - confirm) offset 320 from the entry src is never hinted.  */
+       subs    tmp2, tmp2, #64         /* Sets the flags consumed by bge below.  */
+       vst1.8  {d0-d3}, [ALIGN(dst, 64)]!      /* ALIGN is defined outside this hunk; presumably adds a 64-bit alignment qualifier - confirm.  */
+       vst1.8  {d4-d7}, [ALIGN(dst, 64)]!
+       bge     1b                      /* Loop while the subtraction did not go negative.  */
+       tst     tmp2, #0x3f             /* Nonzero low six bits mean a sub-64-byte tail remains.  */
+       beq     .Ldone
+       b       .Ltail63aligned         /* Finish the tail with the 8-byte ladder at .Ltail63aligned.  */
+#elif defined(USE_VFP)
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
diff --git a/libs/libc/machine/arm/armv7-r/arch_memcpy.S b/libs/libc/machine/arm/armv7-r/arch_memcpy.S
index 731d1dfd882..1ce1b03216f 100644
--- a/libs/libc/machine/arm/armv7-r/arch_memcpy.S
+++ b/libs/libc/machine/arm/armv7-r/arch_memcpy.S
@@ -258,7 +258,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned
 
-#ifdef USE_VFP
+#if defined(USE_VFP) && !defined(USE_NEON)
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
@@ -288,7 +288,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
        bge     .Lcpy_body_long
 
 .Lcpy_body_medium:                     /* Count in tmp2.  */
-#ifdef USE_VFP
+#ifdef USE_NEON
+       /* Medium aligned copy: move 64 bytes per iteration through d0-d7 using NEON
+          multi-register transfers with destination alignment hints; src and dst post-increment.  */
+1:
+       vld1.8  {d0-d3}, [src]!         /* First 32 bytes of this 64-byte chunk.  */
+       vld1.8  {d4-d7}, [src]!         /* Second 32 bytes of this 64-byte chunk.  */
+       pld     [src, #(prefetch_lines * 64)]   /* Prefetch hint relative to the already-advanced src.  */
+       subs    tmp2, tmp2, #64         /* Sets the flags consumed by bge below.  */
+       vst1.8  {d0-d3}, [ALIGN(dst, 64)]!      /* ALIGN is defined outside this hunk; presumably adds a 64-bit alignment qualifier - confirm.  */
+       vst1.8  {d4-d7}, [ALIGN(dst, 64)]!
+       bge     1b                      /* Loop while the subtraction did not go negative.  */
+       tst     tmp2, #0x3f             /* Nonzero low six bits mean a sub-64-byte tail remains.  */
+       beq     .Ldone
+
+.Ltail63aligned:                       /* Count in tmp2.  */
+       /* Use NEON 8-byte vld1/vst1 for the tail: copy the remaining whole 8-byte units.  */
+       and     tmp1, tmp2, #0x38       /* tmp1 = tail bytes in whole 8-byte units (0..56).  */
+       rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)      /* Code offset that skips the pairs not needed.  */
+       add     pc, pc, tmp1            /* Computed jump into the ladder; assumes each vld1/vst1 pair is 8 bytes of code.  */
+       vld1.8  {d0}, [src]!    /* 14 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 12 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 10 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 8 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 6 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 4 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 2 words to go.  */
+       vst1.8  {d0}, [dst]!
+#elif defined(USE_VFP)
 1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
@@ -409,7 +442,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
 
        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
-#ifdef USE_VFP
+#ifdef USE_NEON
+       /* Long aligned copy: use NEON multi-register transfers with prefetching.  The first
+          source lines are hinted up front, then 64 bytes are moved per iteration through d0-d7.  */
+       pld     [src, #0]
+       pld     [src, #64]
+       pld     [src, #128]
+       pld     [src, #192]
+       pld     [src, #256]             /* Five 64-byte lines hinted before the loop.  */
+1:
+       vld1.8  {d0-d3}, [src]!         /* First 32 bytes of this 64-byte chunk.  */
+       vld1.8  {d4-d7}, [src]!         /* Second 32 bytes of this 64-byte chunk.  */
+       pld     [src, #(prefetch_lines * 64)]   /* NOTE(review): src has already advanced by 64 here, so with prefetch_lines == 5 (defined elsewhere - confirm) offset 320 from the entry src is never hinted.  */
+       subs    tmp2, tmp2, #64         /* Sets the flags consumed by bge below.  */
+       vst1.8  {d0-d3}, [ALIGN(dst, 64)]!      /* ALIGN is defined outside this hunk; presumably adds a 64-bit alignment qualifier - confirm.  */
+       vst1.8  {d4-d7}, [ALIGN(dst, 64)]!
+       bge     1b                      /* Loop while the subtraction did not go negative.  */
+       tst     tmp2, #0x3f             /* Nonzero low six bits mean a sub-64-byte tail remains.  */
+       beq     .Ldone
+       b       .Ltail63aligned         /* Finish the tail with the 8-byte ladder at .Ltail63aligned.  */
+#elif defined(USE_VFP)
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
diff --git a/libs/libc/machine/arm/armv8-r/arch_memcpy.S b/libs/libc/machine/arm/armv8-r/arch_memcpy.S
index ed62204be1e..9a1238ec4de 100644
--- a/libs/libc/machine/arm/armv8-r/arch_memcpy.S
+++ b/libs/libc/machine/arm/armv8-r/arch_memcpy.S
@@ -258,7 +258,7 @@ def_fn memcpy p2align=6
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned
 
-#ifdef USE_VFP
+#if defined(USE_VFP) && !defined(USE_NEON)
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
@@ -288,7 +288,40 @@ def_fn memcpy p2align=6
        bge     .Lcpy_body_long
 
 .Lcpy_body_medium:                     /* Count in tmp2.  */
-#ifdef USE_VFP
+#ifdef USE_NEON
+       /* Medium aligned copy: move 64 bytes per iteration through d0-d7 using NEON
+          multi-register transfers with destination alignment hints; src and dst post-increment.  */
+1:
+       vld1.8  {d0-d3}, [src]!         /* First 32 bytes of this 64-byte chunk.  */
+       vld1.8  {d4-d7}, [src]!         /* Second 32 bytes of this 64-byte chunk.  */
+       pld     [src, #(prefetch_lines * 64)]   /* Prefetch hint relative to the already-advanced src.  */
+       subs    tmp2, tmp2, #64         /* Sets the flags consumed by bge below.  */
+       vst1.8  {d0-d3}, [ALIGN(dst, 64)]!      /* ALIGN is defined outside this hunk; presumably adds a 64-bit alignment qualifier - confirm.  */
+       vst1.8  {d4-d7}, [ALIGN(dst, 64)]!
+       bge     1b                      /* Loop while the subtraction did not go negative.  */
+       tst     tmp2, #0x3f             /* Nonzero low six bits mean a sub-64-byte tail remains.  */
+       beq     .Ldone
+
+.Ltail63aligned:                       /* Count in tmp2.  */
+       /* Use NEON 8-byte vld1/vst1 for the tail: copy the remaining whole 8-byte units.  */
+       and     tmp1, tmp2, #0x38       /* tmp1 = tail bytes in whole 8-byte units (0..56).  */
+       rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)      /* Code offset that skips the pairs not needed.  */
+       add     pc, pc, tmp1            /* Computed jump into the ladder; assumes each vld1/vst1 pair is 8 bytes of code.  */
+       vld1.8  {d0}, [src]!    /* 14 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 12 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 10 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 8 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 6 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 4 words to go.  */
+       vst1.8  {d0}, [dst]!
+       vld1.8  {d0}, [src]!    /* 2 words to go.  */
+       vst1.8  {d0}, [dst]!
+#elif defined(USE_VFP)
 1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
@@ -409,7 +442,26 @@ def_fn memcpy p2align=6
 
        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
-#ifdef USE_VFP
+#ifdef USE_NEON
+       /* Long aligned copy: use NEON multi-register transfers with prefetching.  The first
+          source lines are hinted up front, then 64 bytes are moved per iteration through d0-d7.  */
+       pld     [src, #0]
+       pld     [src, #64]
+       pld     [src, #128]
+       pld     [src, #192]
+       pld     [src, #256]             /* Five 64-byte lines hinted before the loop.  */
+1:
+       vld1.8  {d0-d3}, [src]!         /* First 32 bytes of this 64-byte chunk.  */
+       vld1.8  {d4-d7}, [src]!         /* Second 32 bytes of this 64-byte chunk.  */
+       pld     [src, #(prefetch_lines * 64)]   /* NOTE(review): src has already advanced by 64 here, so with prefetch_lines == 5 (defined elsewhere - confirm) offset 320 from the entry src is never hinted.  */
+       subs    tmp2, tmp2, #64         /* Sets the flags consumed by bge below.  */
+       vst1.8  {d0-d3}, [ALIGN(dst, 64)]!      /* ALIGN is defined outside this hunk; presumably adds a 64-bit alignment qualifier - confirm.  */
+       vst1.8  {d4-d7}, [ALIGN(dst, 64)]!
+       bge     1b                      /* Loop while the subtraction did not go negative.  */
+       tst     tmp2, #0x3f             /* Nonzero low six bits mean a sub-64-byte tail remains.  */
+       beq     .Ldone
+       b       .Ltail63aligned         /* Finish the tail with the 8-byte ladder at .Ltail63aligned.  */
+#elif defined(USE_VFP)
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */

(nuttx) branch master updated: arm: memcpy: add NEON paths for aligned copies

Reply via email to