From: Waldemar Kozaczuk <jwkozac...@gmail.com>
Committer: Nadav Har'El <n...@scylladb.com>
Branch: master

aarch64: fix memmove bug

This patch replaces arch/aarch64/string.S with up-to-date copies of
memcpy.S, memset.S and memmove.S from the newlib library in order to
fix the memmove bug.

It seems that the original string.S was a merge of memset.S, memcpy.S
and memmove.S from newlib circa the beginning of 2015. Based on a
comparison with the newlib tree (at least of memcpy.S and memset.S), it
most likely corresponds to newlib commit
a8907bda23e23c6deaf53e6375dc31b20d238b1d. The memcpy and memset
routines were copied pretty much as-is, but memmove was replaced with a
memcpy/memcpy_backwards-like implementation, and that is where the bug
appears to be.
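
(For reference, and not part of this patch: the contract any memmove has
to honor, which a memcpy/memcpy_backwards split can easily violate if the
direction test or the tail handling is off, is roughly the following C
sketch.)

    #include <stddef.h>

    /* Reference semantics only: if the destination starts inside the
     * source range, a forward copy clobbers source bytes before they
     * are read, so the copy has to run from the end backwards. */
    static void *memmove_ref(void *dest, const void *src, size_t n)
    {
        unsigned char *d = dest;
        const unsigned char *s = src;
        if (d < s || d >= s + n) {
            for (size_t i = 0; i < n; i++)   /* forward copy is safe */
                d[i] = s[i];
        } else {
            for (size_t i = n; i-- > 0; )    /* overlap: copy backwards */
                d[i] = s[i];
        }
        return dest;
    }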

Rather than identifying and fixing the bug in place, we replace
string.S with the newest copies of memcpy.S, memset.S and memmove.S
from newlib as of commit 53b7116705754192c66f2580d638644414b60c9d.
Besides fixing the bug, the extra benefit is that these newer versions
have been heavily optimized for speed.
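
(Again for reference only: the new memmove.S decides with one unsigned
comparison whether it can tail-call memcpy. A rough C model of that
predicate, using an illustrative name rather than a real symbol:)

    #include <stddef.h>

    /* (size_t)(dst - src) >= n means a forward copy cannot overwrite
     * source bytes that have not been read yet (this also covers
     * dst < src).  Copies of up to 96 bytes are sent to memcpy as well,
     * because the accompanying memcpy.S loads all of the data before
     * storing any of it, so it tolerates arbitrary overlap there. */
    static int can_tail_call_memcpy(const char *dst, const char *src,
                                    size_t n)
    {
        return n <= 96 || (size_t)(dst - src) >= n;
    }

Only the remaining case, a large copy whose destination lies inside the
source range, takes the backwards loop that moves 64 bytes per iteration
from a 16-byte-aligned end.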

Fixes #1090

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
Message-Id: <20200719152743.37371-1-jwkozac...@gmail.com>

---
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -833,7 +833,6 @@ endif # aarch64
 objects += arch/$(arch)/arch-trace.o
 objects += arch/$(arch)/arch-setup.o
 objects += arch/$(arch)/signal.o
-objects += arch/$(arch)/string.o
 objects += arch/$(arch)/arch-cpu.o
 objects += arch/$(arch)/backtrace.o
 objects += arch/$(arch)/smp.o
@@ -861,11 +860,15 @@ objects += arch/$(arch)/arm-clock.o
 objects += arch/$(arch)/gic.o
 objects += arch/$(arch)/arch-dtb.o
 objects += arch/$(arch)/hypercall.o
+objects += arch/$(arch)/memset.o
+objects += arch/$(arch)/memcpy.o
+objects += arch/$(arch)/memmove.o
 objects += $(libfdt)
 endif
 
 ifeq ($(arch),x64)
 objects += arch/x64/dmi.o
+objects += arch/x64/string.o
 objects += arch/x64/string-ssse3.o
 objects += arch/x64/arch-trace.o
 objects += arch/x64/ioapic.o
diff --git a/arch/aarch64/memcpy.S b/arch/aarch64/memcpy.S
--- a/arch/aarch64/memcpy.S
+++ b/arch/aarch64/memcpy.S
@@ -0,0 +1,230 @@
+/* Copyright (c) 2012-2013, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
+/* See memcpy-stub.c  */
+#else
+
+#define dstin  x0
+#define src    x1
+#define count  x2
+#define dst    x3
+#define srcend x4
+#define dstend x5
+#define A_l    x6
+#define A_lw   w6
+#define A_h    x7
+#define A_hw   w7
+#define B_l    x8
+#define B_lw   w8
+#define B_h    x9
+#define C_l    x10
+#define C_h    x11
+#define D_l    x12
+#define D_h    x13
+#define E_l    src
+#define E_h    count
+#define F_l    srcend
+#define F_h    dst
+#define tmp1   x9
+
+#define L(l) .L ## l
+
+       .macro def_fn f p2align=0
+       .text
+       .p2align \p2align
+       .global \f
+       .type \f, %function
+\f:
+       .endm
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled. Large copies
+   of more than 96 bytes align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   Small and medium copies read all data before writing, allowing any
+   kind of overlap, and memmove tailcalls memcpy for these cases as
+   well as non-overlapping copies.
+*/
+
+def_fn memcpy p2align=6
+       prfm    PLDL1KEEP, [src]
+       add     srcend, src, count
+       add     dstend, dstin, count
+       cmp     count, 16
+       b.ls    L(copy16)
+       cmp     count, 96
+       b.hi    L(copy_long)
+
+       /* Medium copies: 17..96 bytes.  */
+       sub     tmp1, count, 1
+       ldp     A_l, A_h, [src]
+       tbnz    tmp1, 6, L(copy96)
+       ldp     D_l, D_h, [srcend, -16]
+       tbz     tmp1, 5, 1f
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+1:
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
+       /* Small copies: 0..16 bytes.  */
+L(copy16):
+       cmp     count, 8
+       b.lo    1f
+       ldr     A_l, [src]
+       ldr     A_h, [srcend, -8]
+       str     A_l, [dstin]
+       str     A_h, [dstend, -8]
+       ret
+       .p2align 4
+1:
+       tbz     count, 2, 1f
+       ldr     A_lw, [src]
+       ldr     A_hw, [srcend, -4]
+       str     A_lw, [dstin]
+       str     A_hw, [dstend, -4]
+       ret
+
+       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+       cbz     count, 2f
+       lsr     tmp1, count, 1
+       ldrb    A_lw, [src]
+       ldrb    A_hw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    A_hw, [dstend, -1]
+2:     ret
+
+       .p2align 4
+       /* Copy 64..96 bytes.  Copy 64 bytes from the start and
+          32 bytes from the end.  */
+L(copy96):
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [src, 32]
+       ldp     D_l, D_h, [src, 48]
+       ldp     E_l, E_h, [srcend, -32]
+       ldp     F_l, F_h, [srcend, -16]
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin, 32]
+       stp     D_l, D_h, [dstin, 48]
+       stp     E_l, E_h, [dstend, -32]
+       stp     F_l, F_h, [dstend, -16]
+       ret
+
+       /* Align DST to 16 byte alignment so that we don't cross cache line
+          boundaries on both loads and stores.  There are at least 96 bytes
+          to copy, so copy 16 bytes unaligned and then align.  The loop
+          copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+       .p2align 4
+L(copy_long):
+       and     tmp1, dstin, 15
+       bic     dst, dstin, 15
+       ldp     D_l, D_h, [src]
+       sub     src, src, tmp1
+       add     count, count, tmp1      /* Count is now 16 too large.  */
+       ldp     A_l, A_h, [src, 16]
+       stp     D_l, D_h, [dstin]
+       ldp     B_l, B_h, [src, 32]
+       ldp     C_l, C_h, [src, 48]
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 128 + 16  /* Test and readjust count.  */
+       b.ls    2f
+1:
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [src, 16]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [src, 32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [src, 48]
+       stp     D_l, D_h, [dst, 64]!
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 64
+       b.hi    1b
+
+       /* Write the last full set of 64 bytes.  The remainder is at most 64
+          bytes, so it is safe to always copy 64 bytes from the end even if
+          there is just 1 byte left.  */
+2:
+       ldp     E_l, E_h, [srcend, -64]
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [srcend, -48]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [srcend, -16]
+       stp     D_l, D_h, [dst, 64]
+       stp     E_l, E_h, [dstend, -64]
+       stp     A_l, A_h, [dstend, -48]
+       stp     B_l, B_h, [dstend, -32]
+       stp     C_l, C_h, [dstend, -16]
+       ret
+
+       .size   memcpy, . - memcpy
+#endif
diff --git a/arch/aarch64/memmove.S b/arch/aarch64/memmove.S
--- a/arch/aarch64/memmove.S
+++ b/arch/aarch64/memmove.S
@@ -0,0 +1,155 @@
+/* Copyright (c) 2013, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ */
+
+#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
+/* See memmove-stub.c  */
+#else
+
+       .macro def_fn f p2align=0
+       .text
+       .p2align \p2align
+       .global \f
+       .type \f, %function
+\f:
+       .endm
+
+/* Parameters and result.  */
+#define dstin  x0
+#define src    x1
+#define count  x2
+#define srcend x3
+#define dstend x4
+#define tmp1   x5
+#define A_l    x6
+#define A_h    x7
+#define B_l    x8
+#define B_h    x9
+#define C_l    x10
+#define C_h    x11
+#define D_l    x12
+#define D_h    x13
+#define E_l    count
+#define E_h    tmp1
+
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+   Larger backwards copies are also handled by memcpy. The only remaining
+   case is forward large copies.  The destination is aligned, and an
+   unrolled loop processes 64 bytes per iteration.
+*/
+
+def_fn memmove, 6
+       sub     tmp1, dstin, src
+       cmp     count, 96
+       ccmp    tmp1, count, 2, hi
+       b.hs    memcpy
+
+       cbz     tmp1, 3f
+       add     dstend, dstin, count
+       add     srcend, src, count
+
+       /* Align dstend to 16 byte alignment so that we don't cross cache line
+          boundaries on both loads and stores.  There are at least 96 bytes
+          to copy, so copy 16 bytes unaligned and then align.  The loop
+          copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+       and     tmp1, dstend, 15
+       ldp     D_l, D_h, [srcend, -16]
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       ldp     A_l, A_h, [srcend, -16]
+       stp     D_l, D_h, [dstend, -16]
+       ldp     B_l, B_h, [srcend, -32]
+       ldp     C_l, C_h, [srcend, -48]
+       ldp     D_l, D_h, [srcend, -64]!
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    2f
+       nop
+1:
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [srcend, -16]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [srcend, -48]
+       stp     D_l, D_h, [dstend, -64]!
+       ldp     D_l, D_h, [srcend, -64]!
+       subs    count, count, 64
+       b.hi    1b
+
+       /* Write the last full set of 64 bytes.  The remainder is at most 64
+          bytes, so it is safe to always copy 64 bytes from the start even if
+          there is just 1 byte left.  */
+2:
+       ldp     E_l, E_h, [src, 48]
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [src, 32]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [src, 16]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [src]
+       stp     D_l, D_h, [dstend, -64]
+       stp     E_l, E_h, [dstin, 48]
+       stp     A_l, A_h, [dstin, 32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin]
+3:     ret
+
+       .size   memmove, . - memmove
+#endif
diff --git a/arch/aarch64/memset.S b/arch/aarch64/memset.S
--- a/arch/aarch64/memset.S
+++ b/arch/aarch64/memset.S
@@ -0,0 +1,240 @@
+/* Copyright (c) 2012-2013, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ *
+ */
+
+#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
+/* See memset-stub.c  */
+#else
+
+#define dstin  x0
+#define val    x1
+#define valw   w1
+#define count  x2
+#define dst    x3
+#define dstend x4
+#define tmp1   x5
+#define tmp1w  w5
+#define tmp2   x6
+#define tmp2w  w6
+#define zva_len x7
+#define zva_lenw w7
+
+#define L(l) .L ## l
+
+       .macro def_fn f p2align=0
+       .text
+       .p2align \p2align
+       .global \f
+       .type \f, %function
+\f:
+       .endm
+
+def_fn memset p2align=6
+
+       dup     v0.16B, valw
+       add     dstend, dstin, count
+
+       cmp     count, 96
+       b.hi    L(set_long)
+       cmp     count, 16
+       b.hs    L(set_medium)
+       mov     val, v0.D[0]
+
+       /* Set 0..15 bytes.  */
+       tbz     count, 3, 1f
+       str     val, [dstin]
+       str     val, [dstend, -8]
+       ret
+       nop
+1:     tbz     count, 2, 2f
+       str     valw, [dstin]
+       str     valw, [dstend, -4]
+       ret
+2:     cbz     count, 3f
+       strb    valw, [dstin]
+       tbz     count, 1, 3f
+       strh    valw, [dstend, -2]
+3:     ret
+
+       /* Set 17..96 bytes.  */
+L(set_medium):
+       str     q0, [dstin]
+       tbnz    count, 6, L(set96)
+       str     q0, [dstend, -16]
+       tbz     count, 5, 1f
+       str     q0, [dstin, 16]
+       str     q0, [dstend, -32]
+1:     ret
+
+       .p2align 4
+       /* Set 64..96 bytes.  Write 64 bytes from the start and
+          32 bytes from the end.  */
+L(set96):
+       str     q0, [dstin, 16]
+       stp     q0, q0, [dstin, 32]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+       .p2align 3
+       nop
+L(set_long):
+       and     valw, valw, 255
+       bic     dst, dstin, 15
+       str     q0, [dstin]
+       cmp     count, 256
+       ccmp    valw, 0, 0, cs
+       b.eq    L(try_zva)
+L(no_zva):
+       sub     count, dstend, dst      /* Count is 16 too large.  */
+       sub     dst, dst, 16            /* Dst is biased by -32.  */
+       sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
+1:     stp     q0, q0, [dst, 32]
+       stp     q0, q0, [dst, 64]!
+L(tail64):
+       subs    count, count, 64
+       b.hi    1b
+2:     stp     q0, q0, [dstend, -64]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+       .p2align 3
+L(try_zva):
+       mrs     tmp1, dczid_el0
+       tbnz    tmp1w, 4, L(no_zva)
+       and     tmp1w, tmp1w, 15
+       cmp     tmp1w, 4        /* ZVA size is 64 bytes.  */
+       b.ne     L(zva_128)
+
+       /* Write the first and last 64 byte aligned block using stp rather
+          than using DC ZVA.  This is faster on some cores.
+        */
+L(zva_64):
+       str     q0, [dst, 16]
+       stp     q0, q0, [dst, 32]
+       bic     dst, dst, 63
+       stp     q0, q0, [dst, 64]
+       stp     q0, q0, [dst, 96]
+       sub     count, dstend, dst      /* Count is now 128 too large.  */
+       sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
+       add     dst, dst, 128
+       nop
+1:     dc      zva, dst
+       add     dst, dst, 64
+       subs    count, count, 64
+       b.hi    1b
+       stp     q0, q0, [dst, 0]
+       stp     q0, q0, [dst, 32]
+       stp     q0, q0, [dstend, -64]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+       .p2align 3
+L(zva_128):
+       cmp     tmp1w, 5        /* ZVA size is 128 bytes.  */
+       b.ne    L(zva_other)
+
+       str     q0, [dst, 16]
+       stp     q0, q0, [dst, 32]
+       stp     q0, q0, [dst, 64]
+       stp     q0, q0, [dst, 96]
+       bic     dst, dst, 127
+       sub     count, dstend, dst      /* Count is now 128 too large.  */
+       sub     count, count, 128+128   /* Adjust count and bias for loop.  */
+       add     dst, dst, 128
+1:     dc      zva, dst
+       add     dst, dst, 128
+       subs    count, count, 128
+       b.hi    1b
+       stp     q0, q0, [dstend, -128]
+       stp     q0, q0, [dstend, -96]
+       stp     q0, q0, [dstend, -64]
+       stp     q0, q0, [dstend, -32]
+       ret
+
+L(zva_other):
+       mov     tmp2w, 4
+       lsl     zva_lenw, tmp2w, tmp1w
+       add     tmp1, zva_len, 64       /* Max alignment bytes written.  */
+       cmp     count, tmp1
+       blo     L(no_zva)
+
+       sub     tmp2, zva_len, 1
+       add     tmp1, dst, zva_len
+       add     dst, dst, 16
+       subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
+       bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
+       beq     2f
+1:     stp     q0, q0, [dst], 64
+       stp     q0, q0, [dst, -32]
+       subs    count, count, 64
+       b.hi    1b
+2:     mov     dst, tmp1
+       sub     count, dstend, tmp1     /* Remaining bytes to write.  */
+       subs    count, count, zva_len
+       b.lo    4f
+3:     dc      zva, dst
+       add     dst, dst, zva_len
+       subs    count, count, zva_len
+       b.hs    3b
+4:     add     count, count, zva_len
+       sub     dst, dst, 32            /* Bias dst for tail loop.  */
+       b       L(tail64)
+
+       .size   memset, . - memset
+#endif
diff --git a/arch/aarch64/string.S b/arch/aarch64/string.S
--- a/arch/aarch64/string.S
+++ b/arch/aarch64/string.S
@@ -1,545 +0,0 @@
-/* Copyright (c) 2012-2013, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- * Unaligned accesses
- *
- */
-
-/*
- * Modified 2015 by Claudio Fontana <claudio.font...@huawei.com> for OSv
- *
- * This file is a merge of memset.S memcpy.S and memmove.S
- * from newlib 2.2.0-1.
- *
- * Changes:
- * - adapted interface for memcpy_backwards
- * - some preprocessor related changes due to the merge of three files
- * - always #define MAYBE_VIRT 1 since we are running virtualized.
- */
-
-#define MAYBE_VIRT 1
-
-#define dstin x0
-#define src   x1
-#define count x2
-#define tmp1  x3
-#define tmp1w w3
-#define tmp2  x4
-#define tmp2w w4
-#define tmp3  x5
-#define tmp3w w5
-#define dst   x6
-
-#define A_l   x7
-#define A_h   x8
-#define B_l   x9
-#define B_h   x10
-#define C_l   x11
-#define C_h   x12
-#define D_l   x13
-#define D_h   x14
-
-       .macro def_fn f p2align=0
-       .text
-       .p2align \p2align
-       .global \f
-       .type \f, %function
-\f:
-       .endm
-
-def_fn memcpy p2align=6
-
-       mov     dst, dstin
-       cmp     count, #64
-       b.ge    .Lcpy_not_short
-       cmp     count, #15
-       b.le    .Ltail15tiny
-
-       /* Deal with small copies quickly by dropping straight into the
-        * exit block.  */
-.Ltail63:
-       /* Copy up to 48 bytes of data.  At this point we only need the
-        * bottom 6 bits of count to be accurate.  */
-       ands    tmp1, count, #0x30
-       b.eq    .Ltail15
-       add     dst, dst, tmp1
-       add     src, src, tmp1
-       cmp     tmp1w, #0x20
-       b.eq    1f
-       b.lt    2f
-       ldp     A_l, A_h, [src, #-48]
-       stp     A_l, A_h, [dst, #-48]
-1:
-       ldp     A_l, A_h, [src, #-32]
-       stp     A_l, A_h, [dst, #-32]
-2:
-       ldp     A_l, A_h, [src, #-16]
-       stp     A_l, A_h, [dst, #-16]
-
-.Ltail15:
-       ands    count, count, #15
-       beq     1f
-       add     src, src, count
-       ldp     A_l, A_h, [src, #-16]
-       add     dst, dst, count
-       stp     A_l, A_h, [dst, #-16]
-1:
-       ret
-
-.Ltail15tiny:
-       /* Copy up to 15 bytes of data.  Does not assume additional data
-          being copied.  */
-       tbz     count, #3, 1f
-       ldr     tmp1, [src], #8
-       str     tmp1, [dst], #8
-1:
-       tbz     count, #2, 1f
-       ldr     tmp1w, [src], #4
-       str     tmp1w, [dst], #4
-1:
-       tbz     count, #1, 1f
-       ldrh    tmp1w, [src], #2
-       strh    tmp1w, [dst], #2
-1:
-       tbz     count, #0, 1f
-       ldrb    tmp1w, [src]
-       strb    tmp1w, [dst]
-1:
-       ret
-
-.Lcpy_not_short:
-       /* We don't much care about the alignment of DST, but we want SRC
-        * to be 128-bit (16 byte) aligned so that we don't cross cache line
-        * boundaries on both loads and stores.  */
-       neg     tmp2, src
-       ands    tmp2, tmp2, #15         /* Bytes to reach alignment.  */
-       b.eq    2f
-       sub     count, count, tmp2
-       /* Copy more data than needed; it's faster than jumping
-        * around copying sub-Quadword quantities.  We know that
-        * it can't overrun.  */
-       ldp     A_l, A_h, [src]
-       add     src, src, tmp2
-       stp     A_l, A_h, [dst]
-       add     dst, dst, tmp2
-       /* There may be less than 63 bytes to go now.  */
-       cmp     count, #63
-       b.le    .Ltail63
-2:
-       subs    count, count, #128
-       b.ge    .Lcpy_body_large
-       /* Less than 128 bytes to copy, so handle 64 here and then jump
-        * to the tail.  */
-       ldp     A_l, A_h, [src]
-       ldp     B_l, B_h, [src, #16]
-       ldp     C_l, C_h, [src, #32]
-       ldp     D_l, D_h, [src, #48]
-       stp     A_l, A_h, [dst]
-       stp     B_l, B_h, [dst, #16]
-       stp     C_l, C_h, [dst, #32]
-       stp     D_l, D_h, [dst, #48]
-       tst     count, #0x3f
-       add     src, src, #64
-       add     dst, dst, #64
-       b.ne    .Ltail63
-       ret
-
-       /* Critical loop.  Start at a new cache line boundary.  Assuming
-        * 64 bytes per line this ensures the entire loop is in one line.  */
-       .p2align 6
-.Lcpy_body_large:
-       /* There are at least 128 bytes to copy.  */
-       ldp     A_l, A_h, [src, #0]
-       sub     dst, dst, #16           /* Pre-bias.  */
-       ldp     B_l, B_h, [src, #16]
-       ldp     C_l, C_h, [src, #32]
-       ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias.  */
-1:
-       stp     A_l, A_h, [dst, #16]
-       ldp     A_l, A_h, [src, #16]
-       stp     B_l, B_h, [dst, #32]
-       ldp     B_l, B_h, [src, #32]
-       stp     C_l, C_h, [dst, #48]
-       ldp     C_l, C_h, [src, #48]
-       stp     D_l, D_h, [dst, #64]!
-       ldp     D_l, D_h, [src, #64]!
-       subs    count, count, #64
-       b.ge    1b
-       stp     A_l, A_h, [dst, #16]
-       stp     B_l, B_h, [dst, #32]
-       stp     C_l, C_h, [dst, #48]
-       stp     D_l, D_h, [dst, #64]
-       add     src, src, #16
-       add     dst, dst, #64 + 16
-       tst     count, #0x3f
-       b.ne    .Ltail63
-       ret
-       .size   memcpy, .-memcpy
-
-#undef dst
-#undef tmp3w
-
-#define dstin x0
-#define val w1
-#define count x2
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define zva_len_x x5
-#define zva_len w5
-#define zva_bits_x x6
-
-#define A_l x7
-#define A_lw w7
-
-#define dst x8
-#define tmp3w w9
-
-def_fn memset p2align=6
-
-       mov     dst, dstin              /* Preserve return value.  */
-       ands    A_lw, val, #255
-#ifndef DONT_USE_DC
-       b.eq    .Lzero_mem
-#endif
-       orr     A_lw, A_lw, A_lw, lsl #8
-       orr     A_lw, A_lw, A_lw, lsl #16
-       orr     A_l, A_l, A_l, lsl #32
-.Ltail_maybe_long:
-       cmp     count, #64
-       b.ge    .Lnot_short
-.Ltail_maybe_tiny:
-       cmp     count, #15
-       b.le    .Ltail15_memsettiny
-.Ltail63_memset:
-       ands    tmp1, count, #0x30
-       b.eq    .Ltail15_memset
-       add     dst, dst, tmp1
-       cmp     tmp1w, #0x20
-       b.eq    1f
-       b.lt    2f
-       stp     A_l, A_l, [dst, #-48]
-1:
-       stp     A_l, A_l, [dst, #-32]
-2:
-       stp     A_l, A_l, [dst, #-16]
-
-.Ltail15_memset:
-       and     count, count, #15
-       add     dst, dst, count
-       stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store. */
-       ret
-
-.Ltail15_memsettiny:
-       /* Set up to 15 bytes.  Does not assume earlier memory
-          being set.  */
-       tbz     count, #3, 1f
-       str     A_l, [dst], #8
-1:
-       tbz     count, #2, 1f
-       str     A_lw, [dst], #4
-1:
-       tbz     count, #1, 1f
-       strh    A_lw, [dst], #2
-1:
-       tbz     count, #0, 1f
-       strb    A_lw, [dst]
-1:
-       ret
-
-       /* Critical loop.  Start at a new cache line boundary.  Assuming
-        * 64 bytes per line, this ensures the entire loop is in one line.  */
-       .p2align 6
-.Lnot_short:
-       neg     tmp2, dst
-       ands    tmp2, tmp2, #15
-       b.eq    2f
-       /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
-        * more than that to set, so we simply store 16 bytes and advance by
-        * the amount required to reach alignment.  */
-       sub     count, count, tmp2
-       stp     A_l, A_l, [dst]
-       add     dst, dst, tmp2
-       /* There may be less than 63 bytes to go now.  */
-       cmp     count, #63
-       b.le    .Ltail63_memset
-2:
-       sub     dst, dst, #16           /* Pre-bias.  */
-       sub     count, count, #64
-1:
-       stp     A_l, A_l, [dst, #16]
-       stp     A_l, A_l, [dst, #32]
-       stp     A_l, A_l, [dst, #48]
-       stp     A_l, A_l, [dst, #64]!
-       subs    count, count, #64
-       b.ge    1b
-       tst     count, #0x3f
-       add     dst, dst, #16
-       b.ne    .Ltail63_memset
-       ret
-
-#ifndef DONT_USE_DC
-       /* For zeroing memory, check to see if we can use the ZVA feature to
-        * zero entire 'cache' lines.  */
-.Lzero_mem:
-       mov     A_l, #0
-       cmp     count, #63
-       b.le    .Ltail_maybe_tiny
-       neg     tmp2, dst
-       ands    tmp2, tmp2, #15
-       b.eq    1f
-       sub     count, count, tmp2
-       stp     A_l, A_l, [dst]
-       add     dst, dst, tmp2
-       cmp     count, #63
-       b.le    .Ltail63_memset
-1:
-       /* For zeroing small amounts of memory, it's not worth setting up
-        * the line-clear code.  */
-       cmp     count, #128
-       b.lt    .Lnot_short
-#ifdef MAYBE_VIRT
-       /* For efficiency when virtualized, we cache the ZVA capability.  */
-       adrp    tmp2, .Lcache_clear
-       ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
-       tbnz    zva_len, #31, .Lnot_short
-       cbnz    zva_len, .Lzero_by_line
-       mrs     tmp1, dczid_el0
-       tbz     tmp1, #4, 1f
-       /* ZVA not available.  Remember this for next time.  */
-       mov     zva_len, #~0
-       str     zva_len, [tmp2, #:lo12:.Lcache_clear]
-       b       .Lnot_short
-1:
-       mov     tmp3w, #4
-       and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
-       lsl     zva_len, tmp3w, zva_len
-       str     zva_len, [tmp2, #:lo12:.Lcache_clear]
-#else
-       mrs     tmp1, dczid_el0
-       tbnz    tmp1, #4, .Lnot_short
-       mov     tmp3w, #4
-       and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
-       lsl     zva_len, tmp3w, zva_len
-#endif
-
-.Lzero_by_line:
-       /* Compute how far we need to go to become suitably aligned.  We're
-        * already at quad-word alignment.  */
-       cmp     count, zva_len_x
-       b.lt    .Lnot_short             /* Not enough to reach alignment.  */
-       sub     zva_bits_x, zva_len_x, #1
-       neg     tmp2, dst
-       ands    tmp2, tmp2, zva_bits_x
-       b.eq    1f                      /* Already aligned.  */
-       /* Not aligned, check that there's enough to copy after alignment.  */
-       sub     tmp1, count, tmp2
-       cmp     tmp1, #64
-       ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
-       b.lt    .Lnot_short
-       /* We know that there's at least 64 bytes to zero and that it's safe
-        * to overrun by 64 bytes.  */
-       mov     count, tmp1
-2:
-       stp     A_l, A_l, [dst]
-       stp     A_l, A_l, [dst, #16]
-       stp     A_l, A_l, [dst, #32]
-       subs    tmp2, tmp2, #64
-       stp     A_l, A_l, [dst, #48]
-       add     dst, dst, #64
-       b.ge    2b
-       /* We've overrun a bit, so adjust dst downwards.  */
-       add     dst, dst, tmp2
-1:
-       sub     count, count, zva_len_x
-3:
-       dc      zva, dst
-       add     dst, dst, zva_len_x
-       subs    count, count, zva_len_x
-       b.ge    3b
-       ands    count, count, zva_bits_x
-       b.ne    .Ltail_maybe_long
-       ret
-       .size   memset, .-memset
-#ifdef MAYBE_VIRT
-       .bss
-       .p2align 2
-.Lcache_clear:
-       .space 4
-#endif
-#endif /* DONT_USE_DC */
-
-#undef dst
-#undef tmp3w
-/* Parameters and result.  */
-#define dstin  x0
-#define src    x1
-#define count  x2
-#define tmp1   x3
-#define tmp1w  w3
-#define tmp2   x4
-#define tmp2w  w4
-#define tmp3   x5
-#define tmp3w  w5
-#define dst    x6
-
-#define A_l    x7
-#define A_h    x8
-#define B_l    x9
-#define B_h    x10
-#define C_l    x11
-#define C_h    x12
-#define D_l    x13
-#define D_h    x14
-
-/* Upwards move with potential overlap.
- * Need to move from the tail backwards.  SRC and DST point one
- * byte beyond the remaining data to move.  */
-def_fn memcpy_backwards p2align=6
-       add     dst, dstin, count
-       add     src, src, count
-       cmp     count, #64
-       b.ge    .Lmov_not_short_up
-
-       /* Deal with small moves quickly by dropping straight into the
-        * exit block.  */
-.Ltail63up:
-       /* Move up to 48 bytes of data.  At this point we only need the
-        * bottom 6 bits of count to be accurate.  */
-       ands    tmp1, count, #0x30
-       b.eq    .Ltail15up
-       sub     dst, dst, tmp1
-       sub     src, src, tmp1
-       cmp     tmp1w, #0x20
-       b.eq    1f
-       b.lt    2f
-       ldp     A_l, A_h, [src, #32]
-       stp     A_l, A_h, [dst, #32]
-1:
-       ldp     A_l, A_h, [src, #16]
-       stp     A_l, A_h, [dst, #16]
-2:
-       ldp     A_l, A_h, [src]
-       stp     A_l, A_h, [dst]
-.Ltail15up:
-       /* Move up to 15 bytes of data.  Does not assume additional data
-        * being moved.  */
-       tbz     count, #3, 1f
-       ldr     tmp1, [src, #-8]!
-       str     tmp1, [dst, #-8]!
-1:
-       tbz     count, #2, 1f
-       ldr     tmp1w, [src, #-4]!
-       str     tmp1w, [dst, #-4]!
-1:
-       tbz     count, #1, 1f
-       ldrh    tmp1w, [src, #-2]!
-       strh    tmp1w, [dst, #-2]!
-1:
-       tbz     count, #0, 1f
-       ldrb    tmp1w, [src, #-1]
-       strb    tmp1w, [dst, #-1]
-1:
-       ret
-
-.Lmov_not_short_up:
-       /* We don't much care about the alignment of DST, but we want SRC
-        * to be 128-bit (16 byte) aligned so that we don't cross cache line
-        * boundaries on both loads and stores.  */
-       ands    tmp2, src, #15          /* Bytes to reach alignment.  */
-       b.eq    2f
-       sub     count, count, tmp2
-       /* Move enough data to reach alignment; unlike memcpy, we have to
-        * be aware of the overlap, which means we can't move data twice.  */
-       tbz     tmp2, #3, 1f
-       ldr     tmp1, [src, #-8]!
-       str     tmp1, [dst, #-8]!
-1:
-       tbz     tmp2, #2, 1f
-       ldr     tmp1w, [src, #-4]!
-       str     tmp1w, [dst, #-4]!
-1:
-       tbz     tmp2, #1, 1f
-       ldrh    tmp1w, [src, #-2]!
-       strh    tmp1w, [dst, #-2]!
-1:
-       tbz     tmp2, #0, 1f
-       ldrb    tmp1w, [src, #-1]!
-       strb    tmp1w, [dst, #-1]!
-1:
-
-       /* There may be less than 63 bytes to go now.  */
-       cmp     count, #63
-       b.le    .Ltail63up
-2:
-       subs    count, count, #128
-       b.ge    .Lmov_body_large_up
-       /* Less than 128 bytes to move, so handle 64 here and then jump
-        * to the tail.  */
-       ldp     A_l, A_h, [src, #-64]!
-       ldp     B_l, B_h, [src, #16]
-       ldp     C_l, C_h, [src, #32]
-       ldp     D_l, D_h, [src, #48]
-       stp     A_l, A_h, [dst, #-64]!
-       stp     B_l, B_h, [dst, #16]
-       stp     C_l, C_h, [dst, #32]
-       stp     D_l, D_h, [dst, #48]
-       tst     count, #0x3f
-       b.ne    .Ltail63up
-       ret
-
-       /* Critical loop.  Start at a new Icache line boundary.  Assuming
-        * 64 bytes per line this ensures the entire loop is in one line.  */
-       .p2align 6
-.Lmov_body_large_up:
-       /* There are at least 128 bytes to move.  */
-       ldp     A_l, A_h, [src, #-16]
-       ldp     B_l, B_h, [src, #-32]
-       ldp     C_l, C_h, [src, #-48]
-       ldp     D_l, D_h, [src, #-64]!
-1:
-       stp     A_l, A_h, [dst, #-16]
-       ldp     A_l, A_h, [src, #-16]
-       stp     B_l, B_h, [dst, #-32]
-       ldp     B_l, B_h, [src, #-32]
-       stp     C_l, C_h, [dst, #-48]
-       ldp     C_l, C_h, [src, #-48]
-       stp     D_l, D_h, [dst, #-64]!
-       ldp     D_l, D_h, [src, #-64]!
-       subs    count, count, #64
-       b.ge    1b
-       stp     A_l, A_h, [dst, #-16]
-       stp     B_l, B_h, [dst, #-32]
-       stp     C_l, C_h, [dst, #-48]
-       stp     D_l, D_h, [dst, #-64]!
-       tst     count, #0x3f
-       b.ne    .Ltail63up
-       ret
-       .size memcpy_backwards, . - memcpy_backwards
diff --git a/libc/string/memmove.c b/libc/string/memmove.c
--- a/libc/string/memmove.c
+++ b/libc/string/memmove.c
@@ -3,6 +3,7 @@
 #include <stdio.h>
 #include <osv/string.h>
 
+#ifndef AARCH64_PORT_STUB
 void *memmove(void *dest, const void *src, size_t n)
 {
        char *d = dest;
@@ -17,3 +18,4 @@ void *memmove(void *dest, const void *src, size_t n)
         return memcpy_backwards(d, s, n);
        }
 }
+#endif
diff --git a/tests/tst-memmove.cc b/tests/tst-memmove.cc
--- a/tests/tst-memmove.cc
+++ b/tests/tst-memmove.cc
@@ -59,6 +59,7 @@ static void memmove_test(int dest_offset, int src_offset, size_t n)
 
     memmove_model(buf1 + dest_offset, buf1 + src_offset, n);
     memmove(buf2 + dest_offset, buf2 + src_offset, n);
+
     assert(0 == memcmp(buf1, buf2, BUF_SIZE));
 
     free(buf2);
@@ -181,6 +182,10 @@ int main()
     memmove_test(4, 0, 13526);
     memmove_test(125, 0, 14572);
 
+    // Some explicit tests that failed on AArch64
+    memmove_test(10318, 10328, 127);
+    memmove_test(10318, 10328, 138);
+
     // Test random overlapping memmove scenarios
     int n;
     for (int i = 0; i < 1000; i++) {
