This is an automated email from the ASF dual-hosted git repository.
xiaoxiang781216 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx.git
The following commit(s) were added to refs/heads/master by this push:
new 131f7f75ed3 arm: memcpy: add NEON paths for aligned copies
131f7f75ed3 is described below
commit 131f7f75ed3815277ad87ac4eb05ff61b88c438c
Author: yaojiaqi <[email protected]>
AuthorDate: Wed May 6 15:51:13 2026 +0800
arm: memcpy: add NEON paths for aligned copies
Add dedicated NEON implementations of the medium and long memcpy paths
for mutually aligned buffers when building with __ARM_NEON__. These paths
use NEON multi-register loads and stores, while the existing VFP
implementation is preserved for VFP configurations without NEON.
NEON builds also define USE_VFP, so the preprocessor conditionals now test
USE_NEON first and fall back to the VFP code only when USE_NEON is not
defined. The same aligned-copy optimization is applied to the armv7-a,
armv7-r, and armv8-r implementations.
Signed-off-by: yaojiaqi <[email protected]>
---
libs/libc/machine/arm/armv7-a/arch_memcpy.S | 58 +++++++++++++++++++++++++++--
libs/libc/machine/arm/armv7-r/arch_memcpy.S | 58 +++++++++++++++++++++++++++--
libs/libc/machine/arm/armv8-r/arch_memcpy.S | 58 +++++++++++++++++++++++++++--
3 files changed, 165 insertions(+), 9 deletions(-)
diff --git a/libs/libc/machine/arm/armv7-a/arch_memcpy.S b/libs/libc/machine/arm/armv7-a/arch_memcpy.S
index ea41f403cc3..e96b408e64b 100644
--- a/libs/libc/machine/arm/armv7-a/arch_memcpy.S
+++ b/libs/libc/machine/arm/armv7-a/arch_memcpy.S
@@ -260,7 +260,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
cmp tmp1, tmp2
bne .Lcpy_notaligned
-#ifdef USE_VFP
+#if defined(USE_VFP) && !defined(USE_NEON)
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */
@@ -290,7 +290,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */
-#ifdef USE_VFP
+#ifdef USE_NEON
+ /* Use NEON multi-register transfers with destination alignment
+ hints for aligned copies. */
+1:
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ pld [src, #(prefetch_lines * 64)]
+ subs tmp2, tmp2, #64
+ vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Use NEON 8-byte vld1/vst1 for the tail. */
+ and tmp1, tmp2, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+#elif defined(USE_VFP)
1:
vldr d0, [src, #0]
subs tmp2, tmp2, #64
@@ -411,7 +444,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
-#ifdef USE_VFP
+#ifdef USE_NEON
+ /* Use NEON multi-register transfers with prefetching for long
+ copies. */
+ pld [src, #0]
+ pld [src, #64]
+ pld [src, #128]
+ pld [src, #192]
+ pld [src, #256]
+1:
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ pld [src, #(prefetch_lines * 64)]
+ subs tmp2, tmp2, #64
+ vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+ b .Ltail63aligned
+#elif defined(USE_VFP)
/* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */
diff --git a/libs/libc/machine/arm/armv7-r/arch_memcpy.S b/libs/libc/machine/arm/armv7-r/arch_memcpy.S
index 731d1dfd882..1ce1b03216f 100644
--- a/libs/libc/machine/arm/armv7-r/arch_memcpy.S
+++ b/libs/libc/machine/arm/armv7-r/arch_memcpy.S
@@ -258,7 +258,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
cmp tmp1, tmp2
bne .Lcpy_notaligned
-#ifdef USE_VFP
+#if defined(USE_VFP) && !defined(USE_NEON)
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */
@@ -288,7 +288,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */
-#ifdef USE_VFP
+#ifdef USE_NEON
+ /* Use NEON multi-register transfers with destination alignment
+ hints for aligned copies. */
+1:
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ pld [src, #(prefetch_lines * 64)]
+ subs tmp2, tmp2, #64
+ vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Use NEON 8-byte vld1/vst1 for the tail. */
+ and tmp1, tmp2, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+#elif defined(USE_VFP)
1:
vldr d0, [src, #0]
subs tmp2, tmp2, #64
@@ -409,7 +442,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
-#ifdef USE_VFP
+#ifdef USE_NEON
+ /* Use NEON multi-register transfers with prefetching for long
+ copies. */
+ pld [src, #0]
+ pld [src, #64]
+ pld [src, #128]
+ pld [src, #192]
+ pld [src, #256]
+1:
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ pld [src, #(prefetch_lines * 64)]
+ subs tmp2, tmp2, #64
+ vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+ b .Ltail63aligned
+#elif defined(USE_VFP)
/* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */
diff --git a/libs/libc/machine/arm/armv8-r/arch_memcpy.S b/libs/libc/machine/arm/armv8-r/arch_memcpy.S
index ed62204be1e..9a1238ec4de 100644
--- a/libs/libc/machine/arm/armv8-r/arch_memcpy.S
+++ b/libs/libc/machine/arm/armv8-r/arch_memcpy.S
@@ -258,7 +258,7 @@ def_fn memcpy p2align=6
cmp tmp1, tmp2
bne .Lcpy_notaligned
-#ifdef USE_VFP
+#if defined(USE_VFP) && !defined(USE_NEON)
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */
@@ -288,7 +288,40 @@ def_fn memcpy p2align=6
bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */
-#ifdef USE_VFP
+#ifdef USE_NEON
+ /* Use NEON multi-register transfers with destination alignment
+ hints for aligned copies. */
+1:
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ pld [src, #(prefetch_lines * 64)]
+ subs tmp2, tmp2, #64
+ vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Use NEON 8-byte vld1/vst1 for the tail. */
+ and tmp1, tmp2, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+#elif defined(USE_VFP)
1:
vldr d0, [src, #0]
subs tmp2, tmp2, #64
@@ -409,7 +442,26 @@ def_fn memcpy p2align=6
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
-#ifdef USE_VFP
+#ifdef USE_NEON
+ /* Use NEON multi-register transfers with prefetching for long
+ copies. */
+ pld [src, #0]
+ pld [src, #64]
+ pld [src, #128]
+ pld [src, #192]
+ pld [src, #256]
+1:
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ pld [src, #(prefetch_lines * 64)]
+ subs tmp2, tmp2, #64
+ vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+ b .Ltail63aligned
+#elif defined(USE_VFP)
/* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */