Module Name: src
Committed By: skrll
Date: Sun Feb 4 21:52:17 UTC 2018
Modified Files:
src/common/lib/libc/arch/aarch64/string: memcmp.S memcpy.S
Added Files:
src/common/lib/libc/arch/aarch64/string: bcopy.S memmove.S
Log Message:
Working / new versions from Ryo Shimizu
To generate a diff of this commit:
cvs rdiff -u -r0 -r1.1 src/common/lib/libc/arch/aarch64/string/bcopy.S \
src/common/lib/libc/arch/aarch64/string/memmove.S
cvs rdiff -u -r1.1 -r1.2 src/common/lib/libc/arch/aarch64/string/memcmp.S \
src/common/lib/libc/arch/aarch64/string/memcpy.S
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/common/lib/libc/arch/aarch64/string/memcmp.S
diff -u src/common/lib/libc/arch/aarch64/string/memcmp.S:1.1 src/common/lib/libc/arch/aarch64/string/memcmp.S:1.2
--- src/common/lib/libc/arch/aarch64/string/memcmp.S:1.1 Sun Aug 10 05:47:35 2014
+++ src/common/lib/libc/arch/aarch64/string/memcmp.S Sun Feb 4 21:52:16 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: memcmp.S,v 1.1 2014/08/10 05:47:35 matt Exp $ */
+/* $NetBSD: memcmp.S,v 1.2 2018/02/04 21:52:16 skrll Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -31,7 +31,7 @@
#include <machine/asm.h>
-RCSID("$NetBSD: memcmp.S,v 1.1 2014/08/10 05:47:35 matt Exp $")
+RCSID("$NetBSD: memcmp.S,v 1.2 2018/02/04 21:52:16 skrll Exp $")
ENTRY(memcmp)
mov x9, x0
@@ -42,14 +42,14 @@ ENTRY(memcmp)
cmp x2, #6
b.eq .Lmemcmp_6bytes
#endif
- cmp x2, #7
+ cmp x2, #8
b.ls .Lmemcmp_lessthan8
ands x3, x9, #7
b.eq .Lmemcmp_dword_loop
/*
- * The two addresses have identical alignment but are not yet dword aligned.
+ * The src1 address is not dword aligned.
*/
add x2, x2, x3 /* add unalignment to length */
sub x2, x2, #8 /* now subtract a dword */
@@ -68,14 +68,7 @@ ENTRY(memcmp)
lsr x6, x6, x3 /* discard leading bytes from data2 */
#endif
subs x0, x4, x6 /* compare data */
-#ifdef __AARCH64EL__
b.ne .Lmemcmp_last_compare /* difference. find it */
-#else
- b.eq .Lmemcmp_dword_loop /* no difference. go to loop */
- rev x4, x4 /* byte swap data1 */
- rev x6, x6 /* byte swap data2 */
- b .Lmemcmp_last_compare /* go find the difference. */
-#endif
.Lmemcmp_dword_loop:
subs x2, x2, #8
@@ -84,10 +77,6 @@ ENTRY(memcmp)
ldr x6, [x10], #8
subs x0, x4, x6
b.eq .Lmemcmp_dword_loop /* no difference. go to loop */
-#ifdef __AARCH64EB__
- rev x4, x4 /* byte swap data1 */
- rev x6, x6 /* byte swap data2 */
-#endif
b .Lmemcmp_last_compare /* go find the difference. */
.Lmemcmp_finish_dword:
@@ -96,6 +85,8 @@ ENTRY(memcmp)
*/
tst x2, #7
b.eq .Lmemcmp_ret
+ mov x4, xzr
+ mov x6, xzr
/*
*
*/
@@ -120,16 +111,18 @@ ENTRY(memcmp)
#endif
.Lmemcmp_finish_hword:
-#ifdef __AARCH64EB__
- rev x4, x4 /* byte swap data1 */
- rev x6, x6 /* byte swap data1 */
-#endif
- tbz x2, #0, .Lmemcmp_last_compare
+ tbz x2, #0, .Lmemcmp_last_compare0
+
ldrb w5, [x9]
ldrb w7, [x10]
+#ifdef __AARCH64EB__
+ orr x4, x4, x5, lsl #8
+ orr x6, x6, x7, lsl #8
+#else
orr x4, x4, x5, lsl #48
orr x6, x6, x7, lsl #48
- b .Lmemcmp_last_compare /* go find the difference. */
+#endif
+ b .Lmemcmp_last_compare0 /* go find the difference. */
/*
* D
@@ -167,7 +160,7 @@ ENTRY(memcmp)
#endif /* _KERNEL */
/*
- * We have loaded the final bytes in x4 and x6 in LE format. Now we have
+ * We have loaded the final bytes in x4 and x6 in host-endian. Now we have
* to figure what the difference is (if any). First we subtract. Any bytes
* that are the same will be 0. So to find the first non-zero byte we byterev
* and then use clz to find that byte.
@@ -175,13 +168,25 @@ ENTRY(memcmp)
* data dwords left to remove the equal part. Then we shift right to discard
* the trailing bytes. Then we subtract and return.
*/
+.Lmemcmp_last_compare0:
subs x0, x4, x6
b.eq .Lmemcmp_ret
.Lmemcmp_last_compare:
- rev x1, x0 /* byte reverse */
+#if __AARCH64EB__
+ clz x1, x0 /* find first non-zero byte */
+ rev x0, x0
+#else
+ rev x1, x0
clz x1, x1 /* find first non-zero byte */
- bfi x1, xzr, #0, #3 /* make it byte aligned */
- lsr x0, x0, x1 /* shift to LSB */
- sxtb w0, w0 /* sign extend */
+#endif
+ bfi x1, xzr, #0, #3 /* make it byte aligned */
+ lsr x1, x0, x1 /* shift to LSB */
+#if __AARCH64EL__
+ rev x4, x4 /* byte reverse */
+ rev x6, x6 /* byte reverse */
+#endif
+ subs x0, x4, x6
+ csetm x0, cc /* set mask bits as sign */
+ bfm x0, x1, #0, #7 /* extend with sign bit */
ret
END(memcmp)
Index: src/common/lib/libc/arch/aarch64/string/memcpy.S
diff -u src/common/lib/libc/arch/aarch64/string/memcpy.S:1.1 src/common/lib/libc/arch/aarch64/string/memcpy.S:1.2
--- src/common/lib/libc/arch/aarch64/string/memcpy.S:1.1 Sun Aug 10 05:47:35 2014
+++ src/common/lib/libc/arch/aarch64/string/memcpy.S Sun Feb 4 21:52:16 2018
@@ -1,126 +1,4 @@
-/* $NetBSD: memcpy.S,v 1.1 2014/08/10 05:47:35 matt Exp $ */
+/* $NetBSD: memcpy.S,v 1.2 2018/02/04 21:52:16 skrll Exp $ */
-/*-
- * Copyright (c) 2014 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Matt Thomas of 3am Software Foundry.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <machine/asm.h>
-
-RCSID("$NetBSD: memcpy.S,v 1.1 2014/08/10 05:47:35 matt Exp $")
-
-/* LINTSTUB: void *memcpy(void * restrict, const void * restrict, size_t); */
-
-ENTRY(memcpy)
- mov x10, x0
- mov x11, x1
- cbz x2, .Lmemcpy_ret
-
- cmp x2, #7
- b.ls .Lmemcpy_last_dword
-
- ands x3, x10, #7
- b.eq .Lmemcpy_dword_aligned
-
-/*
- * The dst address doesn't have dword alignment. The src address may or may
- * not have the same alignment. Make dst dword aligned. Hope src will be
- * dword aligned but if it isn't, take advantage of unaligned access.
- */
- add x2, x2, x3 /* add unalignment to length */
- sub x2, x2, #8 /* now subtract a dword */
-
- tbz x10, #0, .Lmemcpy_hword_aligned
- ldrb w4, [x11], #1
- strb w4, [x10], #1
-.Lmemcpy_hword_aligned:
- tbz x10, #1, .Lmemcpy_word_aligned
- ldrh w4, [x11], #2
- strh w4, [x10], #2
-.Lmemcpy_word_aligned:
- tbz x10, #2, .Lmemcpy_dword_aligned
- ldr w4, [x11], #4
- str w4, [x10], #4
-.Lmemcpy_dword_aligned:
- /*
- * destination is now dword aligned.
- */
- subs x2, x2, #32
- b.mi .Lmemcpy_last_oword
-
-.Lmemcpy_oword_loop:
- ldp x4, x5, [x11], #16
- ldp x6, x7, [x11], #16
- stp x4, x5, [x10], #16
- stp x6, x7, [x10], #16
- cbz x2, .Lmemcpy_ret
- subs x2, x2, #32
- b.pl .Lmemcpy_oword_loop
-
-.Lmemcpy_last_oword:
- /*
- * We have 31 bytes or less to copy. First see if we can write a qword
- */
- tbz x2, #4, .Lmemcpy_last_qword
- ldp x4, x5, [x11], #16 /* read word */
- stp x4, x5, [x10], #16 /* write word */
-
-.Lmemcpy_last_qword:
- /*
- * We have 15 bytes or less to copy. First see if we can write a dword
- */
- tbz x2, #3, .Lmemcpy_last_dword
- ldr x4, [x11], #8 /* read word */
- str x4, [x10], #8 /* write word */
-
-.Lmemcpy_last_dword:
- /*
- * We have 7 bytes or less to copy. First see if we can write a word
- */
- tbz x2, #2, .Lmemcpy_last_word
- ldr w4, [x11], #4 /* read word */
- str w4, [x10], #4 /* write word */
-
-.Lmemcpy_last_word:
- /*
- * We have 3 bytes or less to copy. First see if we can write a hword
- */
- tbz x2, #1, .Lmemcpy_last_hword
- ldrh w4, [x11], #2
- strh w4, [x10], #2
-
-.Lmemcpy_last_hword:
- /*
- * We have 1 or none bytes to copy.
- */
- tbz x2, #0, .Lmemcpy_ret
- ldrb w4, [x11]
- strb w4, [x10]
-
-.Lmemcpy_ret:
- ret
-END(memcpy)
+#define MEMCOPY
+#include "bcopy.S"
Added files:
Index: src/common/lib/libc/arch/aarch64/string/bcopy.S
diff -u /dev/null src/common/lib/libc/arch/aarch64/string/bcopy.S:1.1
--- /dev/null Sun Feb 4 21:52:17 2018
+++ src/common/lib/libc/arch/aarch64/string/bcopy.S Sun Feb 4 21:52:16 2018
@@ -0,0 +1,990 @@
+/* $NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $ */
+
+/*
+ * Copyright (c) 2018 Ryo Shimizu <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+
+#if defined(LIBC_SCCS)
+RCSID("$NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $")
+#endif
+
+#if defined(MEMCOPY)
+
+/*
+ * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
+ *
+ * The restrict-qualified regions cannot overlap, so only the forward copy
+ * paths are assembled (NO_OVERLAP).
+ */
+#define FUNCTION memcpy
+#define NO_OVERLAP
+#define SRC0 x1
+#define DST0 x0
+#define LEN x2
+
+#elif defined(MEMMOVE)
+
+/*
+ * void *memmove(void *dst, const void *src, size_t len);
+ *
+ * The regions may overlap: NO_OVERLAP is left undefined so that the
+ * backward copy paths are assembled and taken when src < dst.
+ */
+#define FUNCTION memmove
+#undef NO_OVERLAP
+#define SRC0 x1
+#define DST0 x0
+#define LEN x2
+
+#else /* !MEMCOPY && !MEMMOVE */
+
+/*
+ * void bcopy(const void *src, void *dst, size_t len);
+ *
+ * bcopy(3) must copy correctly even when the two regions overlap, so it
+ * needs the backward copy paths exactly like memmove; NO_OVERLAP must not
+ * be defined here.  Note that src and dst are swapped relative to memmove.
+ */
+#define FUNCTION bcopy
+#undef NO_OVERLAP
+#define SRC0 x0
+#define DST0 x1
+#define LEN x2
+
+#endif /* MEMCOPY/MEMMOVE/BCOPY */
+
+/* caller-saved temporary registers (x3-x14); they may be clobbered freely
+ * because the caller does not expect them to be preserved. */
+#define TMP_X x3
+#define TMP_Xw w3
+#define TMP_D x4
+#define TMP_S x5
+#define DST x6
+#define SRC x7
+#define DATA0 x8
+#define DATA0w w8
+#define DATA1 x9
+#define DATA1w w9
+#define DATA2 x10
+#define SRC_ALIGNBIT x11 /* (SRC & 7) * 8 */
+#define DST_ALIGNBIT x12 /* (DST & 7) * 8 */
+#define SRC_DST_ALIGNBIT x13 /* = SRC_ALIGNBIT - DST_ALIGNBIT */
+#define DST_SRC_ALIGNBIT x14 /* = -SRC_DST_ALIGNBIT */
+
+#define STP_ALIGN 16 /* align before stp/ldp. 8 or 16 */
+#define SMALLSIZE 32 /* lengths below this take the small-copy paths */
+
+ .text
+ .align 5
+
+#ifndef NO_OVERLAP
+#ifndef STRICT_ALIGNMENT
+backward_ignore_align:
+ prfm PLDL1KEEP, [SRC0]
+ add SRC0, SRC0, LEN
+ add DST, DST0, LEN
+ cmp LEN, #SMALLSIZE
+ bcs copy_backward
+copy_backward_small:
+ cmp LEN, #8
+ bcs 9f
+
+ /* 0 <= len < 8 */
+ /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0, #-4]!
+ str TMP_Xw, [DST, #-4]!
+1:
+ /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0, #-2]!
+ strh TMP_Xw, [DST, #-2]!
+1:
+ /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0, #-1]!
+ strb TMP_Xw, [DST, #-1]!
+1:
+ ret
+9:
+
+ cmp LEN, #16
+ bcs 9f
+
+ /* 8 <= len < 16 */
+ /* *--(uint64_t *)dst = *--(uint64_t *)src; */
+ ldr TMP_X, [SRC0, #-8]!
+ str TMP_X, [DST, #-8]!
+ /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0, #-4]!
+ str TMP_Xw, [DST, #-4]!
+1:
+ /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0, #-2]!
+ strh TMP_Xw, [DST, #-2]!
+1:
+ /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0, #-1]!
+ strb TMP_Xw, [DST, #-1]!
+1:
+ ret
+9:
+
+ /* 16 <= len < 32 */
+ ldp DATA0, DATA1, [SRC0, #-16]!
+ stp DATA0, DATA1, [DST, #-16]!
+ /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
+ tbz LEN, #3, 1f
+ ldr TMP_X, [SRC0, #-8]!
+ str TMP_X, [DST, #-8]!
+1:
+ /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0, #-4]!
+ str TMP_Xw, [DST, #-4]!
+1:
+ /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0, #-2]!
+ strh TMP_Xw, [DST, #-2]!
+1:
+ /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0, #-1]!
+ strb TMP_Xw, [DST, #-1]!
+1:
+ ret
+#endif /* !STRICT_ALIGNMENT */
+
+ .align 4
+copy_backward:
+ /*
+ * Descending copy of LEN bytes.  SRC0 and DST point one past the last
+ * byte still to be copied; every access pre-decrements them.
+ * DST is not aligned at this point.
+ */
+#ifndef STRICT_ALIGNMENT
+ cmp LEN, #512 /* pre-alignment can be overhead when small */
+ bcc 9f
+#endif
+ /* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
+ tbz DST, #0, 1f
+ ldrb TMP_Xw, [SRC0, #-1]!
+ strb TMP_Xw, [DST, #-1]!
+ sub LEN, LEN, #1
+1:
+ /* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
+ tbz DST, #1, 1f
+ ldrh TMP_Xw, [SRC0, #-2]!
+ strh TMP_Xw, [DST, #-2]!
+ sub LEN, LEN, #2
+1:
+ /* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
+ tbz DST, #2, 1f
+ ldr TMP_Xw, [SRC0, #-4]!
+ str TMP_Xw, [DST, #-4]!
+ sub LEN, LEN, #4
+1:
+#if (STP_ALIGN > 8)
+ /* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
+ tbz DST, #3, 1f
+ ldr TMP_X, [SRC0, #-8]!
+ str TMP_X, [DST, #-8]!
+ sub LEN, LEN, #8
+1:
+#endif /* (STP_ALIGN > 8) */
+9:
+
+ cmp LEN, #1024
+ bhs backward_copy1k
+backward_less1k:
+ /*
+ * copy 16*n bytes (n < 64) by jumping into the unrolled block below.
+ * Each ldp/stp pair is 8 bytes of code and moves 16 bytes of data,
+ * hence the entry point is (8f - bytes/2).
+ */
+ and TMP_D, LEN, #(1023-15) /* len &= 1023; len &= ~15; */
+ adr TMP_X, 8f
+ sub LEN, LEN, TMP_D
+ sub TMP_X, TMP_X, TMP_D, lsr #1 /* jump to (8f - len/2) */
+ br TMP_X
+backward_copy1k: /* copy 16*64 bytes */
+ sub LEN, LEN, #1024
+ .rept (1024 / 16)
+ ldp DATA0, DATA1, [SRC0, #-16]! /* *--dst = *--src; */
+ stp DATA0, DATA1, [DST, #-16]!
+ .endr
+8:
+ cbz LEN, done
+ cmp LEN, #1024
+ bhs backward_copy1k
+ cmp LEN, #16
+ bhs backward_less1k
+
+ /*
+ * 1 <= LEN < 16 from here on, so the 16-byte step below is never
+ * taken today; it is kept correct (load from src, STORE to dst) in
+ * case the dispatch above changes.
+ */
+ /* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
+ tbz LEN, #4, 1f
+ ldp DATA0, DATA1, [SRC0, #-16]!
+ stp DATA0, DATA1, [DST, #-16]!
+1:
+ /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
+ tbz LEN, #3, 1f
+ ldr TMP_X, [SRC0, #-8]!
+ str TMP_X, [DST, #-8]!
+1:
+ /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0, #-4]!
+ str TMP_Xw, [DST, #-4]!
+1:
+ /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0, #-2]!
+ strh TMP_Xw, [DST, #-2]!
+1:
+ /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0, #-1]!
+ strb TMP_Xw, [DST, #-1]!
+1:
+ ret
+#endif /* !NO_OVERLAP */
+
+
+#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
+ .align 5
+backward_copy:
+ /*
+ * Strict-alignment backward copy entry (taken when src < dst).
+ * Turn SRC0 and DST into end pointers (one past the last byte); all
+ * copies below pre-decrement them.  DST0 is left untouched.
+ */
+ prfm PLDL1KEEP, [SRC0]
+ add DST, DST0, LEN
+ add SRC0, SRC0, LEN
+ cmp LEN, #SMALLSIZE
+ bcs strict_backward
+
+ cmp LEN, #10
+ bcs 9f
+backward_tiny:
+ /*
+ * copy 1-9 bytes (LEN is nonzero and below 10 here).  Each ldrb/strb
+ * pair is 8 bytes of code, so enter the table LEN pairs before 8f.
+ */
+ adr TMP_X, 8f
+ sub TMP_X, TMP_X, LEN, lsl #3 /* jump to (8f - len*8) */
+ br TMP_X
+ .rept 10
+ ldrb TMP_Xw, [SRC0, #-1]!
+ strb TMP_Xw, [DST, #-1]!
+ .endr
+8:
+ ret
+9:
+ /*
+ * length is small(<32), and src or dst may be unaligned.
+ * Compare the two END pointers: SRC0 has already been advanced by
+ * LEN, so it must be checked against DST (dst + len), not DST0.
+ */
+ eor TMP_X, SRC0, DST
+ ands TMP_X, TMP_X, #7
+ bne notaligned_backward_small
+
+samealign_backward_small:
+ /* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
+ tbz DST, #0, 1f
+ ldrb TMP_Xw, [SRC0, #-1]!
+ strb TMP_Xw, [DST, #-1]!
+ sub LEN, LEN, #1
+1:
+ /* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
+ tbz DST, #1, 1f
+ ldrh TMP_Xw, [SRC0, #-2]!
+ strh TMP_Xw, [DST, #-2]!
+ sub LEN, LEN, #2
+1:
+ /* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
+ tbz DST, #2, 1f
+ ldr TMP_Xw, [SRC0, #-4]!
+ str TMP_Xw, [DST, #-4]!
+ sub LEN, LEN, #4
+1:
+ /* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
+ tbz LEN, #4, 1f
+ ldp DATA0, DATA1, [SRC0, #-16]!
+ stp DATA0, DATA1, [DST, #-16]!
+1:
+ /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
+ tbz LEN, #3, 1f
+ ldr TMP_X, [SRC0, #-8]!
+ str TMP_X, [DST, #-8]!
+1:
+ /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0, #-4]!
+ str TMP_Xw, [DST, #-4]!
+1:
+ /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0, #-2]!
+ strh TMP_Xw, [DST, #-2]!
+1:
+ /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0, #-1]!
+ strb TMP_Xw, [DST, #-1]!
+1:
+ ret
+
+notaligned_backward_small:
+ /* length is small, and src or dst may be unaligned:
+ * copy one byte at a time, descending, until SRC0 is back at tmp_s.
+ * SRC0 and DST are end pointers here; each access pre-decrements. */
+ sub TMP_S, SRC0, LEN /* tmp_s = src - len */
+1: /* do { */
+ ldrb TMP_Xw, [SRC0, #-1]!
+ strb TMP_Xw, [DST, #-1]! /* *--(char *)dst = *--(char *)src; */
+ cmp TMP_S, SRC0 /* } while (tmp_s < src); */
+ blo 1b
+ ret
+
+strict_backward:
+ /* src or dst may be unaligned
+ *
+ * SRC_ALIGNBIT/DST_ALIGNBIT are the byte offsets of SRC0/DST within a
+ * dword, times 8.  When they are equal control goes to copy_backward;
+ * otherwise SRC and DST are rounded down to dword boundaries, the
+ * DST_ALIGNBIT/8 bytes starting at the rounded-down DST are stored
+ * from shifted source data, and LEN is reduced by that many bytes.
+ */
+ and SRC_ALIGNBIT, SRC0, #7
+ and DST_ALIGNBIT, DST, #7
+ lsl SRC_ALIGNBIT, SRC_ALIGNBIT, #3
+ lsl DST_ALIGNBIT, DST_ALIGNBIT, #3
+ sub SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
+ cbz SRC_DST_ALIGNBIT, copy_backward /* same alignment? */
+
+ and SRC, SRC0, #~7
+ and DST, DST, #~7
+ neg DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+ tbz SRC_DST_ALIGNBIT, #63, 5f /* if(SRC_DST_ALIGNBIT < 0) { */
+
+ cmp SRC, SRC0 /* don't access out of range */
+ beq 1f
+ ldr DATA1, [SRC]
+1:
+ ldr DATA0, [SRC, #-8]!
+
+ lsl DATA1, DATA1, DST_SRC_ALIGNBIT /* data1 = */
+ lsr TMP_X, DATA0, SRC_DST_ALIGNBIT /* (data1<<dst_src_alignbit)| */
+ orr DATA1, DATA1, TMP_X /* (data0>>src_dst_alignbit); */
+
+ b 9f /* } */
+5: /* else { */
+ ldr DATA0, [SRC] /* data0 = *src; */
+ lsr DATA1, DATA0, SRC_DST_ALIGNBIT /* data1=data0>>src_dst_abit;*/
+9: /* } */
+
+ cbz DST_ALIGNBIT, 9f /* if (dst_alignbit != 0) { */
+ mov TMP_D, DST /* tmp_d = dst; */
+
+ tbz DST_ALIGNBIT, #(2+3), 1f /* if (dst_alignbit & (4<<3)) { */
+ str DATA1w, [TMP_D], #4 /* *(uint32_t *)tmp_d++ = data1; */
+ lsr DATA1, DATA1, #32 /* data1 >>= 32; */
+1: /* } */
+ tbz DST_ALIGNBIT, #(1+3), 1f /* if (dst_alignbit & (2<<3)) { */
+ strh DATA1w, [TMP_D], #2 /* *(uint16_t *)tmp_d++ = data1; */
+ lsr DATA1, DATA1, #16 /* data1 >>= 16; */
+1: /* } */
+ tbz DST_ALIGNBIT, #(0+3), 1f /* if (dst_alignbit & (1<<3)) { */
+ strb DATA1w, [TMP_D] /* *(uint8_t *)tmp_d = data1; */
+1: /* } */
+
+ sub LEN, LEN, DST_ALIGNBIT, lsr #3 /* len -=(dst_alignbit>>3); */
+9: /* } */
+#else /* BYTE_ORDER */
+ tbz SRC_DST_ALIGNBIT, #63, 5f /* if(SRC_DST_ALIGNBIT < 0) { */
+
+ cmp SRC, SRC0 /* don't access out of range */
+ beq 1f
+ ldr DATA1, [SRC]
+1:
+ ldr DATA0, [SRC, #-8]!
+
+ lsr DATA1, DATA1, DST_SRC_ALIGNBIT /* data1 = */
+ lsl TMP_X, DATA0, SRC_DST_ALIGNBIT /* (data1>>dst_src_alignbit)| */
+ orr DATA1, DATA1, TMP_X /* (data0<<src_dst_alignbit); */
+
+ b 9f /* } */
+5: /* else { */
+ ldr DATA0, [SRC] /* data0 = *src; */
+ lsr DATA1, DATA0, DST_SRC_ALIGNBIT /* data1=data0>>dst_src_abit;
+ * NOTE(review): this comment used to say "<<", and the
+ * little-endian branch shifts by SRC_DST_ALIGNBIT instead;
+ * confirm the intended shift on big-endian. */
+9: /* } */
+
+ cbz DST_ALIGNBIT, 9f /* if (dst_alignbit != 0) { */
+ mov TMP_D, DST /* tmp_d = dst; */
+
+ tbz DST_ALIGNBIT, #(2+3), 1f /* if (dst_alignbit & (4<<3)) { */
+ lsr TMP_X, DATA1, #32 /* x = data1 >> 32; */
+ str TMP_Xw, [TMP_D], #4 /* *(uint32_t *)tmp_d++ = x; */
+1: /* } */
+ tbz DST_ALIGNBIT, #(1+3), 1f /* if (dst_alignbit & (2<<3)) { */
+ lsr TMP_X, DATA1, #16 /* x = data1 >> 16; */
+ strh TMP_Xw, [TMP_D], #2 /* *(uint16_t *)tmp_d++ = x; */
+1: /* } */
+ tbz DST_ALIGNBIT, #(0+3), 1f /* if (dst_alignbit & (1<<3)) { */
+ lsr TMP_X, DATA1, #8 /* x = data1 >> 8; */
+ strb TMP_Xw, [TMP_D], #1 /* *(uint8_t *)tmp_d++ = x; */
+1: /* } */
+
+ sub LEN, LEN, DST_ALIGNBIT, lsr #3 /* len -=(dst_alignbit>>3); */
+9: /* } */
+#endif /* BYTE_ORDER */
+
+
+backward_shifting_copy_loop:
+ ldp DATA2, DATA1, [SRC, #-16]!
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
+ lsl DATA0, DATA0, DST_SRC_ALIGNBIT
+ lsr TMP_X, DATA1, SRC_DST_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+ /* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
+ lsl DATA1, DATA1, DST_SRC_ALIGNBIT
+ lsr TMP_X, DATA2, SRC_DST_ALIGNBIT
+ orr DATA1, DATA1, TMP_X
+#else /* BYTE_ORDER */
+ /* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
+ lsr DATA0, DATA0, DST_SRC_ALIGNBIT
+ lsl TMP_X, DATA1, SRC_DST_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+ /* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
+ lsr DATA1, DATA1, DST_SRC_ALIGNBIT
+ lsl TMP_X, DATA2, SRC_DST_ALIGNBIT
+ orr DATA1, DATA1, TMP_X
+#endif /* BYTE_ORDER */
+ stp DATA1, DATA0, [DST, #-16]!
+ mov DATA0, DATA2
+ sub LEN, LEN, #16
+ cmp LEN, #16
+ bhs backward_shifting_copy_loop
+
+
+ /* write 8 bytes */
+ tbz LEN, #3, 9f
+
+ ldr DATA1, [SRC, #-8]!
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
+ lsl DATA0, DATA0, DST_SRC_ALIGNBIT
+ lsr TMP_X, DATA1, SRC_DST_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+#else /* BYTE_ORDER */
+ /* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
+ lsr DATA0, DATA0, DST_SRC_ALIGNBIT
+ lsl TMP_X, DATA1, SRC_DST_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+#endif /* BYTE_ORDER */
+ str DATA0, [DST, #-8]!
+ mov DATA0, DATA1
+ sub LEN, LEN, #8
+9:
+
+ cbz LEN, backward_shifting_copy_done
+
+ /* copy last 1-7 bytes */
+ and TMP_X, SRC_DST_ALIGNBIT, #63
+ cmp LEN, TMP_X, lsr #3
+ bls 1f
+ ldr DATA1, [SRC, #-8]! /* don't access out of range */
+1:
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
+ lsl DATA0, DATA0, DST_SRC_ALIGNBIT
+ lsr TMP_X, DATA1, SRC_DST_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+#else /* BYTE_ORDER */
+ /* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
+ lsr DATA0, DATA0, DST_SRC_ALIGNBIT
+ lsl TMP_X, DATA1, SRC_DST_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+#endif /* BYTE_ORDER */
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+ tbz LEN, #2, 1f
+ ror DATA0, DATA0, #32
+ str DATA0w, [DST, #-4]!
+1:
+ tbz LEN, #1, 1f
+ ror DATA0, DATA0, #48
+ strh DATA0w, [DST, #-2]!
+1:
+ tbz LEN, #0, 1f
+ ror DATA0, DATA0, #56
+ strb DATA0w, [DST, #-1]!
+1:
+#else /* BYTE_ORDER */
+ tbz LEN, #2, 1f
+ str DATA0w, [DST, #-4]!
+ lsr DATA0, DATA0, #32
+1:
+ tbz LEN, #1, 1f
+ strh DATA0w, [DST, #-2]!
+ lsr DATA0, DATA0, #16
+1:
+ tbz LEN, #0, 1f
+ strb DATA0w, [DST, #-1]!
+1:
+#endif /* BYTE_ORDER */
+backward_shifting_copy_done:
+ ret
+#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */
+
+
+ .align 5
+ENTRY(FUNCTION)
+#ifdef STRICT_ALIGNMENT
+ cbz LEN, done
+#ifndef NO_OVERLAP
+ cmp SRC0, DST0
+ beq done
+ bcc backward_copy
+#endif /* NO_OVERLAP */
+ mov DST, DST0
+ cmp LEN, #SMALLSIZE
+ bcs strict_forward
+
+ cmp LEN, #10
+ bcs 9f
+forward_tiny:
+ /* copy 1-9 bytes (LEN is nonzero and below 10 here); each ldrb/strb
+ * pair is 8 bytes of code, so enter the table LEN pairs before 8f. */
+ adr TMP_X, 8f
+ sub TMP_X, TMP_X, LEN, lsl #3 /* jump to (8f - len*8) */
+ br TMP_X
+ .rept 10
+ ldrb TMP_Xw, [SRC0], #1
+ strb TMP_Xw, [DST], #1
+ .endr
+8:
+ ret
+9:
+ /* length is small(<32), and src or dst may be unaligned */
+ eor TMP_X, SRC0, DST0
+ ands TMP_X, TMP_X, #7
+ bne notaligned_forward_small
+samealign_forward_small:
+ /* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
+ tbz DST, #0, 1f
+ ldrb TMP_Xw, [SRC0], #1
+ strb TMP_Xw, [DST], #1
+ sub LEN, LEN, #1
+1:
+ /* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
+ tbz DST, #1, 1f
+ ldrh TMP_Xw, [SRC0], #2
+ strh TMP_Xw, [DST], #2
+ sub LEN, LEN, #2
+1:
+ /* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
+ tbz DST, #2, 1f
+ ldr TMP_Xw, [SRC0], #4
+ str TMP_Xw, [DST], #4
+ sub LEN, LEN, #4
+1:
+ /* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
+ tbz LEN, #4, 1f
+ ldp DATA0, DATA1, [SRC0], #16
+ stp DATA0, DATA1, [DST], #16
+1:
+ /* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
+ tbz LEN, #3, 1f
+ ldr TMP_X, [SRC0], #8
+ str TMP_X, [DST], #8
+1:
+ /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0], #4
+ str TMP_Xw, [DST], #4
+1:
+ /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0], #2
+ strh TMP_Xw, [DST], #2
+1:
+ /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0], #1
+ strb TMP_Xw, [DST], #1
+1:
+ ret
+
+notaligned_forward_small:
+ /* src and dst are not aligned... */
+ prfm PLDL1KEEP, [SRC0]
+ prfm PLDL1KEEP, [SRC0, #8]
+ prfm PLDL1KEEP, [SRC0, #16]
+ add TMP_S, SRC0, LEN /* tmp_s = src + len */
+1: /* do { */
+ ldrb TMP_Xw, [SRC0], #1
+ strb TMP_Xw, [DST], #1 /* *(char *)dst++ = *(char *)src++ */
+ cmp SRC0, TMP_S /* while (src < tmp_s); */
+ blo 1b
+ ret
+
+strict_forward:
+ /* src or dst may be unaligned */
+ and SRC_ALIGNBIT, SRC0, #7
+ and DST_ALIGNBIT, DST0, #7
+ lsl SRC_ALIGNBIT, SRC_ALIGNBIT, #3
+ lsl DST_ALIGNBIT, DST_ALIGNBIT, #3
+ sub SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
+ cbz SRC_DST_ALIGNBIT, copy_forward /* same alignment? */
+
+ and SRC, SRC0, #~7
+ and DST, DST0, #~7
+ neg DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+ tbz DST_SRC_ALIGNBIT, #63, 5f /* if(DST_SRC_ALIGNBIT < 0) { */
+ ldp DATA1, DATA0, [SRC], #16
+ neg TMP_X, SRC_ALIGNBIT
+ lsr DATA1, DATA1, SRC_ALIGNBIT /* data1 = */
+ lsl TMP_X, DATA0, TMP_X /* (data1 >> src_alignbit) | */
+ orr DATA1, DATA1, TMP_X /* (data0 << -src_alignbit); */
+ b 9f
+5:
+ ldr DATA0, [SRC], #8
+ lsr DATA1, DATA0, SRC_ALIGNBIT
+9:
+
+ cbz DST_ALIGNBIT, 5f
+ mov TMP_D, DST0
+ /* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
+ tbz TMP_D, #0, 1f
+ strb DATA1w, [TMP_D], #1
+ lsr DATA1, DATA1, #8
+1:
+ /* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
+ tbz TMP_D, #1, 1f
+ strh DATA1w, [TMP_D], #2
+ lsr DATA1, DATA1, #16
+1:
+ /* if (tmp-d & 4) { *(uint32_t *)tmp_d++ = data1; } */
+ tbz TMP_D, #2, 1f
+ str DATA1w, [TMP_D], #4
+1:
+ add DST, DST, #8
+ b 9f
+5:
+ str DATA1, [DST], #8
+9:
+ sub LEN, LEN, #8
+ add LEN, LEN, DST_ALIGNBIT, lsr #3
+#else /* BYTE_ORDER */
+ tbz DST_SRC_ALIGNBIT, #63, 5f /* if(DST_SRC_ALIGNBIT < 0) { */
+ ldp DATA1, DATA0, [SRC], #16
+ neg TMP_X, SRC_ALIGNBIT
+ lsl DATA1, DATA1, SRC_ALIGNBIT /* data1 = */
+ lsr TMP_X, DATA0, TMP_X /* (data1 << src_alignbit) | */
+ orr DATA1, DATA1, TMP_X /* (data0 >> -src_alignbit); */
+ b 9f
+5:
+ ldr DATA0, [SRC], #8
+ lsl DATA1, DATA0, SRC_ALIGNBIT
+9:
+
+ cbz DST_ALIGNBIT, 5f
+ mov TMP_D, DST0
+ /* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
+ tbz TMP_D, #0, 1f
+ lsr TMP_X, DATA1, #56
+ strb TMP_Xw, [TMP_D], #1
+1:
+ /* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
+ tbz TMP_D, #1, 1f
+ lsr TMP_X, DATA1, #48
+ strh TMP_Xw, [TMP_D], #2
+1:
+ /* if (tmp-d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
+ tbz TMP_D, #2, 1f
+ lsr TMP_X, DATA1, #32
+ str TMP_Xw, [TMP_D], #4
+1:
+ add DST, DST, #8
+ b 9f
+5:
+ str DATA1, [DST], #8
+9:
+ sub LEN, LEN, #8
+ add LEN, LEN, DST_ALIGNBIT, lsr #3
+#endif /* BYTE_ORDER */
+
+shifting_copy_loop:
+ ldp DATA1, DATA2, [SRC], #16
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
+ lsr DATA0, DATA0, SRC_DST_ALIGNBIT
+ lsl TMP_X, DATA1, DST_SRC_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+ /* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
+ lsr DATA1, DATA1, SRC_DST_ALIGNBIT
+ lsl TMP_X, DATA2, DST_SRC_ALIGNBIT
+ orr DATA1, DATA1, TMP_X
+#else /* BYTE_ORDER */
+ /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
+ lsl DATA0, DATA0, SRC_DST_ALIGNBIT
+ lsr TMP_X, DATA1, DST_SRC_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+ /* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
+ lsl DATA1, DATA1, SRC_DST_ALIGNBIT
+ lsr TMP_X, DATA2, DST_SRC_ALIGNBIT
+ orr DATA1, DATA1, TMP_X
+#endif /* BYTE_ORDER */
+ stp DATA0, DATA1, [DST], #16
+ mov DATA0, DATA2
+ sub LEN, LEN, #16
+ cmp LEN, #16
+ bhs shifting_copy_loop
+
+
+ /* write 8 bytes */
+ tbz LEN, #3, 9f
+ ldr DATA1, [SRC], #8
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
+ lsr DATA0, DATA0, SRC_DST_ALIGNBIT
+ lsl TMP_X, DATA1, DST_SRC_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+#else /* BYTE_ORDER */
+ /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
+ lsl DATA0, DATA0, SRC_DST_ALIGNBIT
+ lsr TMP_X, DATA1, DST_SRC_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+#endif /* BYTE_ORDER */
+ str DATA0, [DST], #8
+ mov DATA0, DATA1
+ sub LEN, LEN, #8
+9:
+
+ cbz LEN, shifting_copy_done
+
+ /* copy last 1-7 bytes */
+ and TMP_X, DST_SRC_ALIGNBIT, #63
+ cmp LEN, TMP_X, lsr #3
+ bls 1f
+ ldr DATA1, [SRC], #8 /* don't access out of range */
+1:
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
+ lsr DATA0, DATA0, SRC_DST_ALIGNBIT
+ lsl TMP_X, DATA1, DST_SRC_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+#else /* BYTE_ORDER */
+ /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
+ lsl DATA0, DATA0, SRC_DST_ALIGNBIT
+ lsr TMP_X, DATA1, DST_SRC_ALIGNBIT
+ orr DATA0, DATA0, TMP_X
+#endif /* BYTE_ORDER */
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* if (len & 4) { *(uint32_t *)dst++ = data0; } */
+ tbz LEN, #2, 1f
+ str DATA0w, [DST], #4
+ lsr DATA0, DATA0, #32
+1:
+ /* if (len & 2) { *(uint16_t *)dst++ = data0; } */
+ tbz LEN, #1, 1f
+ strh DATA0w, [DST], #2
+ lsr DATA0, DATA0, #16
+1:
+ /* if (len & 1) { *(uint8_t *)dst++ = data0; } */
+ tbz LEN, #0, 1f
+ strb DATA0w, [DST], #1
+1:
+#else /* BYTE_ORDER */
+ /* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
+ tbz LEN, #2, 1f
+ lsr TMP_X, DATA0, #32
+ str TMP_Xw, [DST], #4
+1:
+ /* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
+ tbz LEN, #1, 1f
+ lsr TMP_X, DATA0, #16
+ strh TMP_Xw, [DST], #2
+1:
+ /* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
+ tbz LEN, #0, 1f
+ lsr TMP_X, DATA0, #8
+ strb TMP_Xw, [DST], #1
+1:
+#endif /* BYTE_ORDER */
+shifting_copy_done:
+ ret
+
+#else /* STRICT_ALIGNMENT */
+#ifndef NO_OVERLAP
+ cbz LEN, done
+ cmp SRC0, DST0
+ beq done
+ bcc backward_ignore_align
+#endif /* NO_OVERLAP */
+
+ prfm PLDL1KEEP, [SRC0]
+ cmp LEN, #SMALLSIZE
+ bcs copy_forward
+ mov DST, DST0
+
+copy_forward_small:
+ cmp LEN, #8
+ bcs 9f
+
+ /* 0 <= len < 8 */
+ /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0], #4
+ str TMP_Xw, [DST], #4
+1:
+ /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0], #2
+ strh TMP_Xw, [DST], #2
+1:
+ /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0], #1
+ strb TMP_Xw, [DST], #1
+1:
+ ret
+9:
+
+ prfm PLDL1KEEP, [SRC0, #8]
+ cmp LEN, #16
+ bcs 9f
+
+ /* 8 <= len < 16 */
+ /* *(uint64_t *)dst++ = *(uint64_t *)src++; */
+ ldr TMP_X, [SRC0], #8
+ str TMP_X, [DST], #8
+ /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0], #4
+ str TMP_Xw, [DST], #4
+1:
+ /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0], #2
+ strh TMP_Xw, [DST], #2
+1:
+ /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0], #1
+ strb TMP_Xw, [DST], #1
+1:
+ ret
+9:
+
+ /* 16 <= len < 32 */
+ prfm PLDL1KEEP, [SRC0, 16]
+ prfm PLDL1KEEP, [SRC0, 24]
+ ldp DATA0, DATA1, [SRC0], #16
+ stp DATA0, DATA1, [DST], #16
+ /* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
+ tbz LEN, #3, 1f
+ ldr TMP_X, [SRC0], #8
+ str TMP_X, [DST], #8
+1:
+ /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0], #4
+ str TMP_Xw, [DST], #4
+1:
+ /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0], #2
+ strh TMP_Xw, [DST], #2
+1:
+ /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0], #1
+ strb TMP_Xw, [DST], #1
+1:
+ ret
+#endif /* !STRICT_ALIGNMENT */
+
+ .align 4
+copy_forward:
+ /* DST is not aligned at this point */
+ mov DST, DST0
+#ifndef STRICT_ALIGNMENT
+ cmp LEN, #512 /* pre-alignment can be overhead when small */
+ bcc 9f
+#endif /* STRICT_ALIGNMENT */
+ /* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
+ tbz DST, #0, 1f
+ ldrb TMP_Xw, [SRC0], #1
+ strb TMP_Xw, [DST], #1
+ sub LEN, LEN, #1
+1:
+ /* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
+ tbz DST, #1, 1f
+ ldrh TMP_Xw, [SRC0], #2
+ strh TMP_Xw, [DST], #2
+ sub LEN, LEN, #2
+1:
+ /* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
+ tbz DST, #2, 1f
+ ldr TMP_Xw, [SRC0], #4
+ str TMP_Xw, [DST], #4
+ sub LEN, LEN, #4
+1:
+#if (STP_ALIGN > 8)
+ /* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
+ tbz DST, #3, 1f
+ ldr TMP_X, [SRC0], #8
+ str TMP_X, [DST], #8
+ sub LEN, LEN, #8
+1:
+#endif /* (STP_ALIGN > 8) */
+9:
+
+ cmp LEN, #1024
+ bhs forward_copy1k
+forward_less1k:
+ /* copy 16*n bytes */
+ and TMP_D, LEN, #(1023-15) /* len &= 1023; len &= ~15; */
+ adr TMP_X, 8f
+ sub LEN, LEN, TMP_D
+ sub TMP_X, TMP_X, TMP_D, lsr #1 /* jump to (8f - len/2) */
+ br TMP_X
+forward_copy1k: /* copy 16*64 bytes */
+ sub LEN, LEN, #1024
+ .rept (1024 / 16)
+ ldp DATA0, DATA1, [SRC0], #16 /* *dst++ = *src++; */
+ stp DATA0, DATA1, [DST], #16
+ .endr
+8:
+ cbz LEN, done
+ cmp LEN, #1024
+ bhs forward_copy1k
+ cmp LEN, #16
+ bhs forward_less1k
+
+ /* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
+ tbz LEN, #4, 1f
+ ldp DATA0, DATA1, [SRC0], #16
+ stp DATA0, DATA1, [DST], #16
+1:
+ /* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
+ tbz LEN, #3, 1f
+ ldr TMP_X, [SRC0], #8
+ str TMP_X, [DST], #8
+1:
+ /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
+ tbz LEN, #2, 1f
+ ldr TMP_Xw, [SRC0], #4
+ str TMP_Xw, [DST], #4
+1:
+ /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
+ tbz LEN, #1, 1f
+ ldrh TMP_Xw, [SRC0], #2
+ strh TMP_Xw, [DST], #2
+1:
+ /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
+ tbz LEN, #0, 1f
+ ldrb TMP_Xw, [SRC0], #1
+ strb TMP_Xw, [DST], #1
+1:
+done:
+ ret
+END(FUNCTION)
Index: src/common/lib/libc/arch/aarch64/string/memmove.S
diff -u /dev/null src/common/lib/libc/arch/aarch64/string/memmove.S:1.1
--- /dev/null Sun Feb 4 21:52:17 2018
+++ src/common/lib/libc/arch/aarch64/string/memmove.S Sun Feb 4 21:52:16 2018
@@ -0,0 +1,4 @@
+/* $NetBSD: memmove.S,v 1.1 2018/02/04 21:52:16 skrll Exp $ */
+
+#define MEMMOVE
+#include "bcopy.S"