Hi,

Has anyone looked at the NetBSD XScale versions of copyin/copyout/kcopy?

This is from NetBSD's bcopyinout.S:
#if defined(__XSCALE__) || defined(_ARM_ARCH_6)
/*
 * armv6 and v7 have pld and strd so they can use the xscale
 * bcopyinout as well.
 */
#include "bcopyinout_xscale.S"
#else

Untested diff below, which I just scavenged from one of my dead branches,
in case someone has the time and motivation to run it through some
performance testing or whatever.

-Artturi


diff --git a/sys/arch/arm/arm/bcopyinout.S b/sys/arch/arm/arm/bcopyinout.S
index 9a7d11865c0..bc2e58d22b7 100644
--- a/sys/arch/arm/arm/bcopyinout.S
+++ b/sys/arch/arm/arm/bcopyinout.S
@@ -41,7 +41,7 @@
 #include <machine/asm.h>
 #include <arm/sysreg.h>
 
-#ifdef __XSCALE__
+#ifdef CPU_ARMv7
 #include "bcopyinout_xscale.S"
 #else
 
diff --git a/sys/arch/arm/arm/bcopyinout_xscale.S b/sys/arch/arm/arm/bcopyinout_xscale.S
new file mode 100644
index 00000000000..2e740eb96c2
--- /dev/null
+++ b/sys/arch/arm/arm/bcopyinout_xscale.S
@@ -0,0 +1,1139 @@
+/* $OpenBSD$ */
+/*     $NetBSD: bcopyinout_xscale.S,v 1.11 2013/12/01 02:54:33 joerg Exp $     */
+
+/*
+ * Copyright 2003 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Steve C. Woodford for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed for the NetBSD Project by
+ *      Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ *    or promote products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+       .text
+       .align  2
+
+/*
+ * copyin: copies bytes from user space to kernel space.
+ *
+ * r0 = user space address (source, read with ldrt/ldrbt)
+ * r1 = kernel space address (destination)
+ * r2 = length in bytes; r0 is 0 on return if no fault was taken
+ */
+ENTRY(copyin)
+       cmp     r2, #0x00
+#if /* XXX or <= 0 like below? */ 1
+       moveq   r0, #0                  /* Nothing to copy: return 0 */
+       moveq   pc, lr
+#else
+       movle   r0, #0x00
+       RETc(le)                        /* Bail early if length is <= 0 */
+#endif
+       push    {r10-r11, lr}
+
+       /* Get curcpu from TPIDRPRW. */
+       mrc     CP15_TPIDRPRW(r10)
+       ldr     r10, [r10, #CI_CURPCB]  /* r10 = pcb, kept for fault path */
+
+       mov     r3, #0x00               /* No r4-r9 on the stack yet */
+       adr     ip, .Lcopyin_fault
+       ldr     r11, [r10, #PCB_ONFAULT] /* r11 = previous onfault value */
+       str     ip, [r10, #PCB_ONFAULT] /* Install .Lcopyin_fault */
+       bl      .Lcopyin_guts
+       str     r11, [r10, #PCB_ONFAULT] /* Reinstate previous onfault */
+       mov     r0, #0x00               /* Success */
+       pop     {r10-r11, pc}
+/* NOTE(review): r0 is not written below; confirm the trap code sets it. */
+.Lcopyin_fault:
+       str     r11, [r10, #PCB_ONFAULT] /* Reinstate previous onfault */
+       cmp     r3, #0x00               /* r3 records what guts pushed */
+       popgt   {r4-r7}         /* r3 > 0 Restore r4-r7 */
+       poplt   {r4-r9}         /* r3 < 0 Restore r4-r9 */
+       pop     {r10-r11, pc}
+
+.Lcopyin_guts:
+       pld     [r0]
+       /* Word-align the destination buffer */
+       ands    ip, r1, #0x03           /* Already word aligned? */
+       beq     .Lcopyin_wordaligned    /* Yup */
+       rsb     ip, ip, #0x04           /* ip = bytes needed to align dst */
+       cmp     r2, ip                  /* Enough bytes left to align it? */
+       blt     .Lcopyin_l4_2           /* Nope. Just copy bytewise */
+       sub     r2, r2, ip
+       rsbs    ip, ip, #0x03           /* ip = byte copies to skip (0-2) */
+       addne   pc, pc, ip, lsl #3      /* Skip ip ldrbt/strb pairs */
+       nop                             /* Pad: pc reads as addne + 8 */
+       ldrbt   ip, [r0], #0x01
+       strb    ip, [r1], #0x01
+       ldrbt   ip, [r0], #0x01
+       strb    ip, [r1], #0x01
+       ldrbt   ip, [r0], #0x01
+       strb    ip, [r1], #0x01
+       cmp     r2, #0x00               /* All done? */
+       moveq   pc, lr
+
+       /* Destination buffer is now word aligned */
+.Lcopyin_wordaligned:
+       ands    ip, r0, #0x03           /* Is src also word-aligned? */
+       bne     .Lcopyin_bad_align      /* Nope. Things just got bad */
+       cmp     r2, #0x08               /* Less than 8 bytes remaining? */
+       blt     .Lcopyin_w_less_than8
+
+       /* Quad-align the destination buffer */
+       tst     r1, #0x07               /* Already quad aligned? */
+       ldrtne  ip, [r0], #0x04         /* No: load one word before push */
+       push    {r4-r9}         /* Free up some registers */
+       mov     r3, #-1                 /* Signal restore r4-r9 */
+       tst     r1, #0x07               /* XXX: bug work-around */
+       subne   r2, r2, #0x04
+       strne   ip, [r1], #0x04
+
+       /* Destination buffer quad aligned, source is word aligned */
+       subs    r2, r2, #0x80           /* Pre-subtract one 128-byte block */
+       blt     .Lcopyin_w_lessthan128
+
+       /* Copy 128 bytes at a time */
+.Lcopyin_w_loop128:
+       ldrt    r4, [r0], #0x04         /* LD:00-03 */
+       ldrt    r5, [r0], #0x04         /* LD:04-07 */
+       pld     [r0, #0x18]             /* Prefetch 0x20 */
+       ldrt    r6, [r0], #0x04         /* LD:08-0b */
+       ldrt    r7, [r0], #0x04         /* LD:0c-0f */
+       ldrt    r8, [r0], #0x04         /* LD:10-13 */
+       ldrt    r9, [r0], #0x04         /* LD:14-17 */
+       strd    r4, r5, [r1], #0x08     /* ST:00-07 */
+       ldrt    r4, [r0], #0x04         /* LD:18-1b */
+       ldrt    r5, [r0], #0x04         /* LD:1c-1f */
+       strd    r6, r7, [r1], #0x08     /* ST:08-0f */
+       ldrt    r6, [r0], #0x04         /* LD:20-23 */
+       ldrt    r7, [r0], #0x04         /* LD:24-27 */
+       pld     [r0, #0x18]             /* Prefetch 0x40 */
+       strd    r8, r9, [r1], #0x08     /* ST:10-17 */
+       ldrt    r8, [r0], #0x04         /* LD:28-2b */
+       ldrt    r9, [r0], #0x04         /* LD:2c-2f */
+       strd    r4, r5, [r1], #0x08     /* ST:18-1f */
+       ldrt    r4, [r0], #0x04         /* LD:30-33 */
+       ldrt    r5, [r0], #0x04         /* LD:34-37 */
+       strd    r6, r7, [r1], #0x08             /* ST:20-27 */
+       ldrt    r6, [r0], #0x04         /* LD:38-3b */
+       ldrt    r7, [r0], #0x04         /* LD:3c-3f */
+       strd    r8, r9, [r1], #0x08     /* ST:28-2f */
+       ldrt    r8, [r0], #0x04         /* LD:40-43 */
+       ldrt    r9, [r0], #0x04         /* LD:44-47 */
+       pld     [r0, #0x18]             /* Prefetch 0x60 */
+       strd    r4, r5, [r1], #0x08     /* ST:30-37 */
+       ldrt    r4, [r0], #0x04         /* LD:48-4b */
+       ldrt    r5, [r0], #0x04         /* LD:4c-4f */
+       strd    r6, r7, [r1], #0x08     /* ST:38-3f */
+       ldrt    r6, [r0], #0x04         /* LD:50-53 */
+       ldrt    r7, [r0], #0x04         /* LD:54-57 */
+       strd    r8, r9, [r1], #0x08     /* ST:40-47 */
+       ldrt    r8, [r0], #0x04         /* LD:58-5b */
+       ldrt    r9, [r0], #0x04         /* LD:5c-5f */
+       strd    r4, r5, [r1], #0x08     /* ST:48-4f */
+       ldrt    r4, [r0], #0x04         /* LD:60-63 */
+       ldrt    r5, [r0], #0x04         /* LD:64-67 */
+       pld     [r0, #0x18]             /* Prefetch 0x80 */
+       strd    r6, r7, [r1], #0x08     /* ST:50-57 */
+       ldrt    r6, [r0], #0x04         /* LD:68-6b */
+       ldrt    r7, [r0], #0x04         /* LD:6c-6f */
+       strd    r8, r9, [r1], #0x08     /* ST:58-5f */
+       ldrt    r8, [r0], #0x04         /* LD:70-73 */
+       ldrt    r9, [r0], #0x04         /* LD:74-77 */
+       strd    r4, r5, [r1], #0x08     /* ST:60-67 */
+       ldrt    r4, [r0], #0x04         /* LD:78-7b */
+       ldrt    r5, [r0], #0x04         /* LD:7c-7f */
+       strd    r6, r7, [r1], #0x08     /* ST:68-6f */
+       strd    r8, r9, [r1], #0x08     /* ST:70-77 */
+       subs    r2, r2, #0x80
+       strd    r4, r5, [r1], #0x08     /* ST:78-7f */
+       bge     .Lcopyin_w_loop128
+
+.Lcopyin_w_lessthan128:
+       adds    r2, r2, #0x80           /* Adjust for extra sub */
+       popeq   {r4-r9}
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x20           /* Pre-subtract one 32-byte block */
+       blt     .Lcopyin_w_lessthan32
+
+       /* Copy 32 bytes at a time */
+.Lcopyin_w_loop32:
+       ldrt    r4, [r0], #0x04
+       ldrt    r5, [r0], #0x04
+       pld     [r0, #0x18]
+       ldrt    r6, [r0], #0x04
+       ldrt    r7, [r0], #0x04
+       ldrt    r8, [r0], #0x04
+       ldrt    r9, [r0], #0x04
+       strd    r4, r5, [r1], #0x08
+       ldrt    r4, [r0], #0x04
+       ldrt    r5, [r0], #0x04
+       strd    r6, r7, [r1], #0x08
+       strd    r8, r9, [r1], #0x08
+       subs    r2, r2, #0x20
+       strd    r4, r5, [r1], #0x08
+       bge     .Lcopyin_w_loop32
+
+.Lcopyin_w_lessthan32:
+       adds    r2, r2, #0x20           /* Adjust for extra sub */
+       popeq   {r4-r9}
+       moveq   pc, lr                  /* Return now if done */
+
+       and     r4, r2, #0x18           /* r4 = bytes in whole 8-byte units */
+       rsb     r5, r4, #0x18           /* r5 = 8-byte units to skip, in bytes */
+       subs    r2, r2, r4              /* r2 = tail bytes; Z set if none */
+       add     pc, pc, r5, lsl #1      /* Each block below is 4 insns */
+       nop                             /* Pad: pc reads as add + 8 */
+
+       /* At least 24 bytes remaining */
+       ldrt    r4, [r0], #0x04
+       ldrt    r5, [r0], #0x04
+       nop                             /* Pad block to 4 insns */
+       strd    r4, r5, [r1], #0x08
+
+       /* At least 16 bytes remaining */
+       ldrt    r4, [r0], #0x04
+       ldrt    r5, [r0], #0x04
+       nop                             /* Pad block to 4 insns */
+       strd    r4, r5, [r1], #0x08
+
+       /* At least 8 bytes remaining */
+       ldrt    r4, [r0], #0x04
+       ldrt    r5, [r0], #0x04
+       nop                             /* Pad block to 4 insns */
+       strd    r4, r5, [r1], #0x08
+
+       /* Less than 8 bytes remaining */
+       pop     {r4-r9}
+       moveq   pc, lr                  /* Return now if done */
+       mov     r3, #0x00               /* r4-r9 are no longer stacked */
+
+.Lcopyin_w_less_than8:
+       subs    r2, r2, #0x04
+       ldrtge  ip, [r0], #0x04         /* 4 or more left: copy one word */
+       strge   ip, [r1], #0x04
+       moveq   pc, lr                  /* Return now if done */
+       addlt   r2, r2, #0x04           /* No word copied: undo the sub */
+       ldrbt   ip, [r0], #0x01
+       cmp     r2, #0x02               /* Flags pick 1, 2 or 3 byte tail */
+       ldrbtge r2, [r0], #0x01         /* r2 now holds data, not count */
+       strb    ip, [r1], #0x01
+       ldrbtgt ip, [r0]
+       strbge  r2, [r1], #0x01
+       strbgt  ip, [r1]
+       RET
+
+/*
+ * Destination (r1) is word aligned here but source (r0) is not, so both
+ * cannot be word aligned.  Read whole source words and merge neighbours
+ * with shifts.  NOTE(review): shifts assume little-endian - confirm.
+ */
+.Lcopyin_bad_align:
+       push    {r4-r7}                 /* Free up some registers */
+       mov     r3, #0x01               /* Signal restore r4-r7 */
+       bic     r0, r0, #0x03           /* Round src down to a word */
+       cmp     ip, #2                  /* ip = src offset in word (1-3) */
+       ldrt    ip, [r0], #0x04         /* Prime ip with the first word */
+       bgt     .Lcopyin_bad3
+       beq     .Lcopyin_bad2
+       b       .Lcopyin_bad1
+
+.Lcopyin_bad1_loop16:
+       mov     r4, ip, lsr #8          /* Keep the 3 unread bytes of ip */
+       ldrt    r5, [r0], #0x04
+       pld     [r0, #0x018]
+       ldrt    r6, [r0], #0x04
+       ldrt    r7, [r0], #0x04
+       ldrt    ip, [r0], #0x04
+       orr     r4, r4, r5, lsl #24     /* Top up from the next word */
+       mov     r5, r5, lsr #8
+       orr     r5, r5, r6, lsl #24
+       mov     r6, r6, lsr #8
+       orr     r6, r6, r7, lsl #24
+       mov     r7, r7, lsr #8
+       orr     r7, r7, ip, lsl #24
+       str     r4, [r1], #0x04
+       str     r5, [r1], #0x04
+       str     r6, [r1], #0x04
+       str     r7, [r1], #0x04
+.Lcopyin_bad1:
+       subs    r2, r2, #0x10
+       bge     .Lcopyin_bad1_loop16
+
+       adds    r2, r2, #0x10           /* Adjust for extra sub */
+       popeq   {r4-r7}
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x04
+       sublt   r0, r0, #0x03           /* Back to first unread byte */
+       blt     .Lcopyin_l4
+
+.Lcopyin_bad1_loop4:
+       mov     r4, ip, lsr #8
+       ldrt    ip, [r0], #0x04
+       subs    r2, r2, #0x04
+       orr     r4, r4, ip, lsl #24
+       str     r4, [r1], #0x04
+       bge     .Lcopyin_bad1_loop4
+       sub     r0, r0, #0x03           /* Back to first unread byte */
+       b       .Lcopyin_l4
+
+.Lcopyin_bad2_loop16:
+       mov     r4, ip, lsr #16         /* Keep the 2 unread bytes of ip */
+       ldrt    r5, [r0], #0x04
+       pld     [r0, #0x018]
+       ldrt    r6, [r0], #0x04
+       ldrt    r7, [r0], #0x04
+       ldrt    ip, [r0], #0x04
+       orr     r4, r4, r5, lsl #16     /* Top up from the next word */
+       mov     r5, r5, lsr #16
+       orr     r5, r5, r6, lsl #16
+       mov     r6, r6, lsr #16
+       orr     r6, r6, r7, lsl #16
+       mov     r7, r7, lsr #16
+       orr     r7, r7, ip, lsl #16
+       str     r4, [r1], #0x04
+       str     r5, [r1], #0x04
+       str     r6, [r1], #0x04
+       str     r7, [r1], #0x04
+.Lcopyin_bad2:
+       subs    r2, r2, #0x10
+       bge     .Lcopyin_bad2_loop16
+
+       adds    r2, r2, #0x10           /* Adjust for extra sub */
+       popeq   {r4-r7}
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x04
+       sublt   r0, r0, #0x02           /* Back to first unread byte */
+       blt     .Lcopyin_l4
+
+.Lcopyin_bad2_loop4:
+       mov     r4, ip, lsr #16
+       ldrt    ip, [r0], #0x04
+       subs    r2, r2, #0x04
+       orr     r4, r4, ip, lsl #16
+       str     r4, [r1], #0x04
+       bge     .Lcopyin_bad2_loop4
+       sub     r0, r0, #0x02           /* Back to first unread byte */
+       b       .Lcopyin_l4
+
+.Lcopyin_bad3_loop16:
+       mov     r4, ip, lsr #24         /* Keep the 1 unread byte of ip */
+       ldrt    r5, [r0], #0x04
+       pld     [r0, #0x018]
+       ldrt    r6, [r0], #0x04
+       ldrt    r7, [r0], #0x04
+       ldrt    ip, [r0], #0x04
+       orr     r4, r4, r5, lsl #8      /* Top up from the next word */
+       mov     r5, r5, lsr #24
+       orr     r5, r5, r6, lsl #8
+       mov     r6, r6, lsr #24
+       orr     r6, r6, r7, lsl #8
+       mov     r7, r7, lsr #24
+       orr     r7, r7, ip, lsl #8
+       str     r4, [r1], #0x04
+       str     r5, [r1], #0x04
+       str     r6, [r1], #0x04
+       str     r7, [r1], #0x04
+.Lcopyin_bad3:
+       subs    r2, r2, #0x10
+       bge     .Lcopyin_bad3_loop16
+
+       adds    r2, r2, #0x10           /* Adjust for extra sub */
+       popeq   {r4-r7}
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x04
+       sublt   r0, r0, #0x01           /* Back to first unread byte */
+       blt     .Lcopyin_l4
+
+.Lcopyin_bad3_loop4:
+       mov     r4, ip, lsr #24
+       ldrt    ip, [r0], #0x04
+       subs    r2, r2, #0x04
+       orr     r4, r4, ip, lsl #8
+       str     r4, [r1], #0x04
+       bge     .Lcopyin_bad3_loop4
+       sub     r0, r0, #0x01           /* Back to first unread byte */
+
+.Lcopyin_l4:
+       pop     {r4-r7}
+       mov     r3, #0x00               /* r4-r7 are no longer stacked */
+       adds    r2, r2, #0x04           /* Adjust for extra sub */
+       moveq   pc, lr
+.Lcopyin_l4_2:
+       rsbs    r2, r2, #0x03           /* r2 = byte copies to skip (0-2) */
+       addne   pc, pc, r2, lsl #3      /* Skip r2 ldrbt/strb pairs */
+       nop                             /* Pad: pc reads as addne + 8 */
+       ldrbt   ip, [r0], #0x01
+       strb    ip, [r1], #0x01
+       ldrbt   ip, [r0], #0x01
+       strb    ip, [r1], #0x01
+       ldrbt   ip, [r0]
+       strb    ip, [r1]
+       RET
+END(copyin)
+
+
+/*
+ * copyout: copies bytes from kernel space to user space.
+ *
+ * r0 = kernel space address (source)
+ * r1 = user space address (destination, written with strt/strbt)
+ * r2 = length in bytes; r0 is 0 on return if no fault was taken
+ */
+ENTRY(copyout)
+       cmp     r2, #0x00
+#if /* XXX or <= 0 like below? */ 1
+       moveq   r0, #0                  /* Nothing to copy: return 0 */
+       moveq   pc, lr
+#else
+       movle   r0, #0x00
+       RETc(le)                        /* Bail early if length is <= 0 */
+#endif
+
+       push    {r10-r11, lr}
+
+       /* Get curcpu from TPIDRPRW. */
+       mrc     CP15_TPIDRPRW(r10)
+       ldr     r10, [r10, #CI_CURPCB]  /* r10 = pcb, kept for fault path */
+
+       mov     r3, #0x00               /* No r4-r9 on the stack yet */
+       adr     ip, .Lcopyout_fault
+       ldr     r11, [r10, #PCB_ONFAULT] /* r11 = previous onfault value */
+       str     ip, [r10, #PCB_ONFAULT] /* Install .Lcopyout_fault */
+       bl      .Lcopyout_guts
+       str     r11, [r10, #PCB_ONFAULT] /* Reinstate previous onfault */
+       mov     r0, #0x00               /* Success */
+       pop     {r10-r11, pc}
+/* NOTE(review): r0 is not written below; confirm the trap code sets it. */
+.Lcopyout_fault:
+       str     r11, [r10, #PCB_ONFAULT] /* Reinstate previous onfault */
+       cmp     r3, #0x00               /* r3 records what guts pushed */
+       popgt   {r4-r7}         /* r3 > 0 Restore r4-r7 */
+       poplt   {r4-r9}         /* r3 < 0 Restore r4-r9 */
+       pop     {r10-r11, pc}
+
+.Lcopyout_guts:
+       pld     [r0]
+       /* Word-align the destination buffer */
+       ands    ip, r1, #0x03           /* Already word aligned? */
+       beq     .Lcopyout_wordaligned   /* Yup */
+       rsb     ip, ip, #0x04           /* ip = bytes needed to align dst */
+       cmp     r2, ip                  /* Enough bytes left to align it? */
+       blt     .Lcopyout_l4_2          /* Nope. Just copy bytewise */
+       sub     r2, r2, ip
+       rsbs    ip, ip, #0x03           /* ip = byte copies to skip (0-2) */
+       addne   pc, pc, ip, lsl #3      /* Skip ip ldrb/strbt pairs */
+       nop                             /* Pad: pc reads as addne + 8 */
+       ldrb    ip, [r0], #0x01
+       strbt   ip, [r1], #0x01
+       ldrb    ip, [r0], #0x01
+       strbt   ip, [r1], #0x01
+       ldrb    ip, [r0], #0x01
+       strbt   ip, [r1], #0x01
+       cmp     r2, #0x00               /* All done? */
+       moveq   pc, lr
+
+       /* Destination buffer is now word aligned */
+.Lcopyout_wordaligned:
+       ands    ip, r0, #0x03           /* Is src also word-aligned? */
+       bne     .Lcopyout_bad_align     /* Nope. Things just got bad */
+       cmp     r2, #0x08               /* Less than 8 bytes remaining? */
+       blt     .Lcopyout_w_less_than8
+
+       /* Quad-align the destination buffer */
+       tst     r1, #0x07               /* Already quad aligned? */
+       ldrne   ip, [r0], #0x04         /* No: load one word before push */
+       push    {r4-r9}         /* Free up some registers */
+       mov     r3, #-1                 /* Signal restore r4-r9 */
+       tst     r1, #0x07               /* XXX: bug work-around */
+       subne   r2, r2, #0x04
+       strtne  ip, [r1], #0x04
+
+       /* Destination buffer quad aligned, source is word aligned */
+       subs    r2, r2, #0x80           /* Pre-subtract one 128-byte block */
+       blt     .Lcopyout_w_lessthan128
+
+       /* Copy 128 bytes at a time */
+.Lcopyout_w_loop128:
+       ldr     r4, [r0], #0x04         /* LD:00-03 */
+       ldr     r5, [r0], #0x04         /* LD:04-07 */
+       pld     [r0, #0x18]             /* Prefetch 0x20 */
+       ldr     r6, [r0], #0x04         /* LD:08-0b */
+       ldr     r7, [r0], #0x04         /* LD:0c-0f */
+       ldr     r8, [r0], #0x04         /* LD:10-13 */
+       ldr     r9, [r0], #0x04         /* LD:14-17 */
+       strt    r4, [r1], #0x04         /* ST:00-03 */
+       strt    r5, [r1], #0x04         /* ST:04-07 */
+       ldr     r4, [r0], #0x04         /* LD:18-1b */
+       ldr     r5, [r0], #0x04         /* LD:1c-1f */
+       strt    r6, [r1], #0x04         /* ST:08-0b */
+       strt    r7, [r1], #0x04         /* ST:0c-0f */
+       ldr     r6, [r0], #0x04         /* LD:20-23 */
+       ldr     r7, [r0], #0x04         /* LD:24-27 */
+       pld     [r0, #0x18]             /* Prefetch 0x40 */
+       strt    r8, [r1], #0x04         /* ST:10-13 */
+       strt    r9, [r1], #0x04         /* ST:14-17 */
+       ldr     r8, [r0], #0x04         /* LD:28-2b */
+       ldr     r9, [r0], #0x04         /* LD:2c-2f */
+       strt    r4, [r1], #0x04         /* ST:18-1b */
+       strt    r5, [r1], #0x04         /* ST:1c-1f */
+       ldr     r4, [r0], #0x04         /* LD:30-33 */
+       ldr     r5, [r0], #0x04         /* LD:34-37 */
+       strt    r6, [r1], #0x04         /* ST:20-23 */
+       strt    r7, [r1], #0x04         /* ST:24-27 */
+       ldr     r6, [r0], #0x04         /* LD:38-3b */
+       ldr     r7, [r0], #0x04         /* LD:3c-3f */
+       strt    r8, [r1], #0x04         /* ST:28-2b */
+       strt    r9, [r1], #0x04         /* ST:2c-2f */
+       ldr     r8, [r0], #0x04         /* LD:40-43 */
+       ldr     r9, [r0], #0x04         /* LD:44-47 */
+       pld     [r0, #0x18]             /* Prefetch 0x60 */
+       strt    r4, [r1], #0x04         /* ST:30-33 */
+       strt    r5, [r1], #0x04         /* ST:34-37 */
+       ldr     r4, [r0], #0x04         /* LD:48-4b */
+       ldr     r5, [r0], #0x04         /* LD:4c-4f */
+       strt    r6, [r1], #0x04         /* ST:38-3b */
+       strt    r7, [r1], #0x04         /* ST:3c-3f */
+       ldr     r6, [r0], #0x04         /* LD:50-53 */
+       ldr     r7, [r0], #0x04         /* LD:54-57 */
+       strt    r8, [r1], #0x04         /* ST:40-43 */
+       strt    r9, [r1], #0x04         /* ST:44-47 */
+       ldr     r8, [r0], #0x04         /* LD:58-5b */
+       ldr     r9, [r0], #0x04         /* LD:5c-5f */
+       strt    r4, [r1], #0x04         /* ST:48-4b */
+       strt    r5, [r1], #0x04         /* ST:4c-4f */
+       ldr     r4, [r0], #0x04         /* LD:60-63 */
+       ldr     r5, [r0], #0x04         /* LD:64-67 */
+       pld     [r0, #0x18]             /* Prefetch 0x80 */
+       strt    r6, [r1], #0x04         /* ST:50-53 */
+       strt    r7, [r1], #0x04         /* ST:54-57 */
+       ldr     r6, [r0], #0x04         /* LD:68-6b */
+       ldr     r7, [r0], #0x04         /* LD:6c-6f */
+       strt    r8, [r1], #0x04         /* ST:58-5b */
+       strt    r9, [r1], #0x04         /* ST:5c-5f */
+       ldr     r8, [r0], #0x04         /* LD:70-73 */
+       ldr     r9, [r0], #0x04         /* LD:74-77 */
+       strt    r4, [r1], #0x04         /* ST:60-63 */
+       strt    r5, [r1], #0x04         /* ST:64-67 */
+       ldr     r4, [r0], #0x04         /* LD:78-7b */
+       ldr     r5, [r0], #0x04         /* LD:7c-7f */
+       strt    r6, [r1], #0x04         /* ST:68-6b */
+       strt    r7, [r1], #0x04         /* ST:6c-6f */
+       strt    r8, [r1], #0x04         /* ST:70-73 */
+       strt    r9, [r1], #0x04         /* ST:74-77 */
+       subs    r2, r2, #0x80
+       strt    r4, [r1], #0x04         /* ST:78-7b */
+       strt    r5, [r1], #0x04         /* ST:7c-7f */
+       bge     .Lcopyout_w_loop128
+
+.Lcopyout_w_lessthan128:
+       adds    r2, r2, #0x80           /* Adjust for extra sub */
+       popeq   {r4-r9}
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x20           /* Pre-subtract one 32-byte block */
+       blt     .Lcopyout_w_lessthan32
+
+       /* Copy 32 bytes at a time */
+.Lcopyout_w_loop32:
+       ldr     r4, [r0], #0x04
+       ldr     r5, [r0], #0x04
+       pld     [r0, #0x18]
+       ldr     r6, [r0], #0x04
+       ldr     r7, [r0], #0x04
+       ldr     r8, [r0], #0x04
+       ldr     r9, [r0], #0x04
+       strt    r4, [r1], #0x04
+       strt    r5, [r1], #0x04
+       ldr     r4, [r0], #0x04
+       ldr     r5, [r0], #0x04
+       strt    r6, [r1], #0x04
+       strt    r7, [r1], #0x04
+       strt    r8, [r1], #0x04
+       strt    r9, [r1], #0x04
+       subs    r2, r2, #0x20
+       strt    r4, [r1], #0x04
+       strt    r5, [r1], #0x04
+       bge     .Lcopyout_w_loop32
+
+.Lcopyout_w_lessthan32:
+       adds    r2, r2, #0x20           /* Adjust for extra sub */
+       popeq   {r4-r9}
+       moveq   pc, lr                  /* Return now if done */
+
+       and     r4, r2, #0x18           /* r4 = bytes in whole 8-byte units */
+       rsb     r5, r4, #0x18           /* r5 = 8-byte units to skip, in bytes */
+       subs    r2, r2, r4              /* r2 = tail bytes; Z set if none */
+       add     pc, pc, r5, lsl #1      /* Each block below is 4 insns */
+       nop                             /* Pad: pc reads as add + 8 */
+
+       /* At least 24 bytes remaining */
+       ldr     r4, [r0], #0x04
+       ldr     r5, [r0], #0x04
+       strt    r4, [r1], #0x04
+       strt    r5, [r1], #0x04
+
+       /* At least 16 bytes remaining */
+       ldr     r4, [r0], #0x04
+       ldr     r5, [r0], #0x04
+       strt    r4, [r1], #0x04
+       strt    r5, [r1], #0x04
+
+       /* At least 8 bytes remaining */
+       ldr     r4, [r0], #0x04
+       ldr     r5, [r0], #0x04
+       strt    r4, [r1], #0x04
+       strt    r5, [r1], #0x04
+
+       /* Less than 8 bytes remaining */
+       pop     {r4-r9}
+       moveq   pc, lr                  /* Return now if done */
+       mov     r3, #0x00               /* r4-r9 are no longer stacked */
+
+.Lcopyout_w_less_than8:
+       subs    r2, r2, #0x04
+       ldrge   ip, [r0], #0x04         /* 4 or more left: copy one word */
+       strtge  ip, [r1], #0x04
+       moveq   pc, lr                  /* Return now if done */
+       addlt   r2, r2, #0x04           /* No word copied: undo the sub */
+       ldrb    ip, [r0], #0x01
+       cmp     r2, #0x02               /* Flags pick 1, 2 or 3 byte tail */
+       ldrbge  r2, [r0], #0x01         /* r2 now holds data, not count */
+       strbt   ip, [r1], #0x01
+       ldrbgt  ip, [r0]
+       strbtge r2, [r1], #0x01
+       strbtgt ip, [r1]
+       RET
+
+/*
+ * Destination (r1) is word aligned here but source (r0) is not, so both
+ * cannot be word aligned.  Read whole source words and merge neighbours
+ * with shifts.  NOTE(review): shifts assume little-endian - confirm.
+ */
+.Lcopyout_bad_align:
+       push    {r4-r7}                 /* Free up some registers */
+       mov     r3, #0x01               /* Signal restore r4-r7 */
+       bic     r0, r0, #0x03           /* Round src down to a word */
+       cmp     ip, #2                  /* ip = src offset in word (1-3) */
+       ldr     ip, [r0], #0x04         /* Prime ip with the first word */
+       bgt     .Lcopyout_bad3
+       beq     .Lcopyout_bad2
+       b       .Lcopyout_bad1
+
+.Lcopyout_bad1_loop16:
+       mov     r4, ip, lsr #8          /* Keep the 3 unread bytes of ip */
+       ldr     r5, [r0], #0x04
+       pld     [r0, #0x018]
+       ldr     r6, [r0], #0x04
+       ldr     r7, [r0], #0x04
+       ldr     ip, [r0], #0x04
+       orr     r4, r4, r5, lsl #24     /* Top up from the next word */
+       mov     r5, r5, lsr #8
+       orr     r5, r5, r6, lsl #24
+       mov     r6, r6, lsr #8
+       orr     r6, r6, r7, lsl #24
+       mov     r7, r7, lsr #8
+       orr     r7, r7, ip, lsl #24
+       strt    r4, [r1], #0x04
+       strt    r5, [r1], #0x04
+       strt    r6, [r1], #0x04
+       strt    r7, [r1], #0x04
+.Lcopyout_bad1:
+       subs    r2, r2, #0x10
+       bge     .Lcopyout_bad1_loop16
+
+       adds    r2, r2, #0x10           /* Adjust for extra sub */
+       popeq   {r4-r7}
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x04
+       sublt   r0, r0, #0x03           /* Back to first unread byte */
+       blt     .Lcopyout_l4
+
+.Lcopyout_bad1_loop4:
+       mov     r4, ip, lsr #8
+       ldr     ip, [r0], #0x04
+       subs    r2, r2, #0x04
+       orr     r4, r4, ip, lsl #24
+       strt    r4, [r1], #0x04
+       bge     .Lcopyout_bad1_loop4
+       sub     r0, r0, #0x03           /* Back to first unread byte */
+       b       .Lcopyout_l4
+
+.Lcopyout_bad2_loop16:
+       mov     r4, ip, lsr #16         /* Keep the 2 unread bytes of ip */
+       ldr     r5, [r0], #0x04
+       pld     [r0, #0x018]
+       ldr     r6, [r0], #0x04
+       ldr     r7, [r0], #0x04
+       ldr     ip, [r0], #0x04
+       orr     r4, r4, r5, lsl #16     /* Top up from the next word */
+       mov     r5, r5, lsr #16
+       orr     r5, r5, r6, lsl #16
+       mov     r6, r6, lsr #16
+       orr     r6, r6, r7, lsl #16
+       mov     r7, r7, lsr #16
+       orr     r7, r7, ip, lsl #16
+       strt    r4, [r1], #0x04
+       strt    r5, [r1], #0x04
+       strt    r6, [r1], #0x04
+       strt    r7, [r1], #0x04
+.Lcopyout_bad2:
+       subs    r2, r2, #0x10
+       bge     .Lcopyout_bad2_loop16
+
+       adds    r2, r2, #0x10           /* Adjust for extra sub */
+       popeq   {r4-r7}
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x04
+       sublt   r0, r0, #0x02           /* Back to first unread byte */
+       blt     .Lcopyout_l4
+
+.Lcopyout_bad2_loop4:
+       mov     r4, ip, lsr #16
+       ldr     ip, [r0], #0x04
+       subs    r2, r2, #0x04
+       orr     r4, r4, ip, lsl #16
+       strt    r4, [r1], #0x04
+       bge     .Lcopyout_bad2_loop4
+       sub     r0, r0, #0x02           /* Back to first unread byte */
+       b       .Lcopyout_l4
+
+.Lcopyout_bad3_loop16:
+       mov     r4, ip, lsr #24         /* Keep the 1 unread byte of ip */
+       ldr     r5, [r0], #0x04
+       pld     [r0, #0x018]
+       ldr     r6, [r0], #0x04
+       ldr     r7, [r0], #0x04
+       ldr     ip, [r0], #0x04
+       orr     r4, r4, r5, lsl #8      /* Top up from the next word */
+       mov     r5, r5, lsr #24
+       orr     r5, r5, r6, lsl #8
+       mov     r6, r6, lsr #24
+       orr     r6, r6, r7, lsl #8
+       mov     r7, r7, lsr #24
+       orr     r7, r7, ip, lsl #8
+       strt    r4, [r1], #0x04
+       strt    r5, [r1], #0x04
+       strt    r6, [r1], #0x04
+       strt    r7, [r1], #0x04
+.Lcopyout_bad3:
+       subs    r2, r2, #0x10
+       bge     .Lcopyout_bad3_loop16
+
+       adds    r2, r2, #0x10           /* Adjust for extra sub */
+       popeq   {r4-r7}
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x04
+       sublt   r0, r0, #0x01           /* Back to first unread byte */
+       blt     .Lcopyout_l4
+
+.Lcopyout_bad3_loop4:
+       mov     r4, ip, lsr #24
+       ldr     ip, [r0], #0x04
+       subs    r2, r2, #0x04
+       orr     r4, r4, ip, lsl #8
+       strt    r4, [r1], #0x04
+       bge     .Lcopyout_bad3_loop4
+       sub     r0, r0, #0x01           /* Back to first unread byte */
+
+.Lcopyout_l4:
+       pop     {r4-r7}
+       mov     r3, #0x00               /* r4-r7 are no longer stacked */
+       adds    r2, r2, #0x04           /* Adjust for extra sub */
+       moveq   pc, lr
+.Lcopyout_l4_2:
+       rsbs    r2, r2, #0x03           /* r2 = byte copies to skip (0-2) */
+       addne   pc, pc, r2, lsl #3      /* Skip r2 ldrb/strbt pairs */
+       nop                             /* Pad: pc reads as addne + 8 */
+       ldrb    ip, [r0], #0x01
+       strbt   ip, [r1], #0x01
+       ldrb    ip, [r0], #0x01
+       strbt   ip, [r1], #0x01
+       ldrb    ip, [r0]
+       strbt   ip, [r1]
+       RET
+END(copyout)
+
+/*
+ * r0 = kernel space source address
+ * r1 = kernel space destination address
+ * r2 = length
+ *
+ * Copies bytes from kernel space to kernel space, aborting on page fault
+ */
+ENTRY(kcopy)
+       cmp     r2, #0x00
+#if /* XXX or <= 0 like below? */ 1
+       moveq   r0, #0
+       moveq   pc, lr
+#else
+       movle   r0, #0x00
+       RETc(le)                        /* Bail early if length is <= 0 */
+#endif
+
+       push    {r10-r11, lr}
+
+       /* Get curcpu from TPIDRPRW. */
+       mrc     CP15_TPIDRPRW(r10)
+       ldr     r10, [r10, #CI_CURPCB]
+
+       mov     r3, #0x00
+       adr     ip, .Lkcopy_fault
+       ldr     r11, [r10, #PCB_ONFAULT]
+       str     ip, [r10, #PCB_ONFAULT]
+       bl      .Lkcopy_guts
+       str     r11, [r10, #PCB_ONFAULT]
+       mov     r0, #0x00
+       pop     {r10-r11, pc}
+
+.Lkcopy_fault:
+       str     r11, [r10, #PCB_ONFAULT]
+       cmp     r3, #0x00
+       popgt   {r4-r7}         /* r3 > 0 Restore r4-r7 */
+       poplt   {r4-r9}         /* r3 < 0 Restore r4-r9 */
+       pop     {r10-r11, pc}
+
+.Lkcopy_guts:
+       pld     [r0]
+       /* Word-align the destination buffer */
+       ands    ip, r1, #0x03           /* Already word aligned? */
+       beq     .Lkcopy_wordaligned     /* Yup */
+       rsb     ip, ip, #0x04
+       cmp     r2, ip                  /* Enough bytes left to align it? */
+       blt     .Lkcopy_bad_endgame2    /* Nope. Just copy bytewise */
+       sub     r2, r2, ip
+       rsbs    ip, ip, #0x03
+       addne   pc, pc, ip, lsl #3
+       nop
+       ldrb    ip, [r0], #0x01
+       strb    ip, [r1], #0x01
+       ldrb    ip, [r0], #0x01
+       strb    ip, [r1], #0x01
+       ldrb    ip, [r0], #0x01
+       strb    ip, [r1], #0x01
+       cmp     r2, #0x00               /* All done? */
+       RETc(eq)
+
+       /* Destination buffer is now word aligned */
+.Lkcopy_wordaligned:
+       ands    ip, r0, #0x03           /* Is src also word-aligned? */
+       bne     .Lkcopy_bad_align       /* Nope. Things just got bad */
+       cmp     r2, #0x08               /* Less than 8 bytes remaining? */
+       blt     .Lkcopy_w_less_than8
+
+       /* Quad-align the destination buffer */
+       tst     r1, #0x07               /* Already quad aligned? */
+       ldrne   ip, [r0], #0x04
+       push    {r4-r9}         /* Free up some registers */
+       mov     r3, #-1                 /* Signal restore r4-r9 */
+       subne   r2, r2, #0x04
+       strne   ip, [r1], #0x04
+
+       /* Destination buffer quad aligned, source is word aligned */
+       subs    r2, r2, #0x80
+       blt     .Lkcopy_w_lessthan128
+
+       /* Copy 128 bytes at a time */
+.Lkcopy_w_loop128:
+       ldr     r4, [r0], #0x04         /* LD:00-03 */
+       ldr     r5, [r0], #0x04         /* LD:04-07 */
+       pld     [r0, #0x18]             /* Prefetch 0x20 */
+       ldr     r6, [r0], #0x04         /* LD:08-0b */
+       ldr     r7, [r0], #0x04         /* LD:0c-0f */
+       ldr     r8, [r0], #0x04         /* LD:10-13 */
+       ldr     r9, [r0], #0x04         /* LD:14-17 */
+       strd    r4, r5, [r1], #0x08     /* ST:00-07 */
+       ldr     r4, [r0], #0x04         /* LD:18-1b */
+       ldr     r5, [r0], #0x04         /* LD:1c-1f */
+       strd    r6, r7, [r1], #0x08     /* ST:08-0f */
+       ldr     r6, [r0], #0x04         /* LD:20-23 */
+       ldr     r7, [r0], #0x04         /* LD:24-27 */
+       pld     [r0, #0x18]             /* Prefetch 0x40 */
+       strd    r8, r9, [r1], #0x08     /* ST:10-17 */
+       ldr     r8, [r0], #0x04         /* LD:28-2b */
+       ldr     r9, [r0], #0x04         /* LD:2c-2f */
+       strd    r4, r5, [r1], #0x08     /* ST:18-1f */
+       ldr     r4, [r0], #0x04         /* LD:30-33 */
+       ldr     r5, [r0], #0x04         /* LD:34-37 */
+       strd    r6, r7, [r1], #0x08     /* ST:20-27 */
+       ldr     r6, [r0], #0x04         /* LD:38-3b */
+       ldr     r7, [r0], #0x04         /* LD:3c-3f */
+       strd    r8, r9, [r1], #0x08     /* ST:28-2f */
+       ldr     r8, [r0], #0x04         /* LD:40-43 */
+       ldr     r9, [r0], #0x04         /* LD:44-47 */
+       pld     [r0, #0x18]             /* Prefetch 0x60 */
+       strd    r4, r5, [r1], #0x08     /* ST:30-37 */
+       ldr     r4, [r0], #0x04         /* LD:48-4b */
+       ldr     r5, [r0], #0x04         /* LD:4c-4f */
+       strd    r6, r7, [r1], #0x08     /* ST:38-3f */
+       ldr     r6, [r0], #0x04         /* LD:50-53 */
+       ldr     r7, [r0], #0x04         /* LD:54-57 */
+       strd    r8, r9, [r1], #0x08     /* ST:40-47 */
+       ldr     r8, [r0], #0x04         /* LD:58-5b */
+       ldr     r9, [r0], #0x04         /* LD:5c-5f */
+       strd    r4, r5, [r1], #0x08     /* ST:48-4f */
+       ldr     r4, [r0], #0x04         /* LD:60-63 */
+       ldr     r5, [r0], #0x04         /* LD:64-67 */
+       pld     [r0, #0x18]             /* Prefetch 0x80 */
+       strd    r6, r7, [r1], #0x08     /* ST:50-57 */
+       ldr     r6, [r0], #0x04         /* LD:68-6b */
+       ldr     r7, [r0], #0x04         /* LD:6c-6f */
+       strd    r8, r9, [r1], #0x08     /* ST:58-5f */
+       ldr     r8, [r0], #0x04         /* LD:70-73 */
+       ldr     r9, [r0], #0x04         /* LD:74-77 */
+       strd    r4, r5, [r1], #0x08     /* ST:60-67 */
+       ldr     r4, [r0], #0x04         /* LD:78-7b */
+       ldr     r5, [r0], #0x04         /* LD:7c-7f */
+       strd    r6, r7, [r1], #0x08     /* ST:68-6f */
+       strd    r8, r9, [r1], #0x08     /* ST:70-77 */
+       subs    r2, r2, #0x80           /* Count this block; 128 more left? */
+       strd    r4, r5, [r1], #0x08     /* ST:78-7f */
+       bge     .Lkcopy_w_loop128       /* Yes (strd leaves the flags alone) */
+
+.Lkcopy_w_lessthan128:
+       adds    r2, r2, #0x80           /* Adjust for extra sub */
+       popeq   {r4-r9}                 /* Nothing left: restore registers */
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x20           /* Pre-bias count for the 32 loop */
+       blt     .Lkcopy_w_lessthan32    /* Fewer than 32 bytes to copy */
+
+       /* Copy 32 bytes at a time */
+.Lkcopy_w_loop32:
+       ldr     r4, [r0], #0x04         /* LD:00-03 */
+       ldr     r5, [r0], #0x04         /* LD:04-07 */
+       pld     [r0, #0x18]             /* Prefetch 0x20 */
+       ldr     r6, [r0], #0x04         /* LD:08-0b */
+       ldr     r7, [r0], #0x04         /* LD:0c-0f */
+       ldr     r8, [r0], #0x04         /* LD:10-13 */
+       ldr     r9, [r0], #0x04         /* LD:14-17 */
+       strd    r4, r5, [r1], #0x08     /* ST:00-07 */
+       ldr     r4, [r0], #0x04         /* LD:18-1b */
+       ldr     r5, [r0], #0x04         /* LD:1c-1f */
+       strd    r6, r7, [r1], #0x08     /* ST:08-0f */
+       strd    r8, r9, [r1], #0x08     /* ST:10-17 */
+       subs    r2, r2, #0x20           /* Count this block; 32 more left? */
+       strd    r4, r5, [r1], #0x08     /* ST:18-1f */
+       bge     .Lkcopy_w_loop32        /* Yes (strd leaves the flags alone) */
+
+.Lkcopy_w_lessthan32:
+       adds    r2, r2, #0x20           /* Adjust for extra sub */
+       popeq   {r4-r9}                 /* Nothing left: restore registers */
+       moveq   pc, lr                  /* Return now if done */
+
+       and     r4, r2, #0x18           /* r4 = bytes in whole 8-byte chunks */
+       rsb     r5, r4, #0x18           /* r5 = 0x18 - r4: chunk bytes to skip */
+       subs    r2, r2, r4              /* r2 = 0..7 left over; Z set if none */
+       add     pc, pc, r5, lsl #1      /* Jump r5*2 bytes in (16 per block) */
+       nop                             /* Filler: pc reads 8 past the add */
+
+       /* At least 24 bytes remaining */
+       ldr     r4, [r0], #0x04
+       ldr     r5, [r0], #0x04
+       nop                             /* Pad block to four instructions */
+       strd    r4, r5, [r1], #0x08
+
+       /* At least 16 bytes remaining */
+       ldr     r4, [r0], #0x04
+       ldr     r5, [r0], #0x04
+       nop                             /* Pad block to four instructions */
+       strd    r4, r5, [r1], #0x08
+
+       /* At least 8 bytes remaining */
+       ldr     r4, [r0], #0x04
+       ldr     r5, [r0], #0x04
+       nop                             /* Pad block to four instructions */
+       strd    r4, r5, [r1], #0x08
+
+       /* Less than 8 bytes remaining */
+       pop     {r4-r9}                 /* Flags from the subs are untouched */
+       moveq   pc, lr                  /* Return now if done */
+       mov     r3, #0x00               /* Presumably: no regs left to restore */
+
+.Lkcopy_w_less_than8:
+       subs    r2, r2, #0x04           /* At least one whole word left? */
+       ldrge   ip, [r0], #0x04         /* Yes: copy it */
+       strge   ip, [r1], #0x04
+       moveq   pc, lr                  /* Return now if done */
+       addlt   r2, r2, #0x04           /* No word copied: undo the sub */
+       ldrb    ip, [r0], #0x01         /* 1st tail byte (r2 = 1..3 expected) */
+       cmp     r2, #0x02               /* ge: 2 or more left, gt: 3 left */
+       ldrbge  r2, [r0], #0x01         /* 2nd byte; count in r2 now in flags */
+       strb    ip, [r1], #0x01
+       ldrbgt  ip, [r0]                /* 3rd byte */
+       strbge  r2, [r1], #0x01
+       strbgt  ip, [r1]
+       RET
+
+/*
+ * At this point, it has not been possible to word align both buffers.
+ * The destination buffer (r1) is word aligned, but the source buffer
+ * (r0) is not.
+ */
+.Lkcopy_bad_align:
+       push    {r4-r7}                 /* Free up some registers */
+       mov     r3, #0x01               /* Presumably: signal restore r4-r7 */
+       bic     r0, r0, #0x03           /* Round src down to a word boundary */
+       cmp     ip, #2                  /* ip = src & 3, set by the ands */
+       ldr     ip, [r0], #0x04         /* Prime ip with the first src word */
+       bgt     .Lkcopy_bad3            /* src & 3 == 3 */
+       beq     .Lkcopy_bad2            /* src & 3 == 2 */
+       b       .Lkcopy_bad1            /* src & 3 == 1 */
+
+.Lkcopy_bad1_loop16:
+       mov     r4, ip, lsr #8          /* Upper 3 bytes of ip -> bits 23:0 */
+       ldr     r5, [r0], #0x04
+       pld     [r0, #0x018]            /* Prefetch ahead of the loads */
+       ldr     r6, [r0], #0x04
+       ldr     r7, [r0], #0x04
+       ldr     ip, [r0], #0x04         /* ip = carry word for next round */
+       orr     r4, r4, r5, lsl #24     /* Low byte of next word on top */
+       mov     r5, r5, lsr #8
+       orr     r5, r5, r6, lsl #24
+       mov     r6, r6, lsr #8
+       orr     r6, r6, r7, lsl #24
+       mov     r7, r7, lsr #8
+       orr     r7, r7, ip, lsl #24
+       str     r4, [r1], #0x04         /* Store the four merged words */
+       str     r5, [r1], #0x04
+       str     r6, [r1], #0x04
+       str     r7, [r1], #0x04
+.Lkcopy_bad1:
+       subs    r2, r2, #0x10           /* Another 16 bytes to copy? */
+       bge     .Lkcopy_bad1_loop16
+
+       adds    r2, r2, #0x10           /* Adjust for extra sub */
+       popeq   {r4-r7}                 /* Nothing left: restore registers */
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x04           /* At least one whole word left? */
+       sublt   r0, r0, #0x03           /* No: r0 back over 3 unstored bytes */
+       blt     .Lkcopy_bad_endgame
+
+.Lkcopy_bad1_loop4:
+       mov     r4, ip, lsr #8          /* Upper 3 bytes of ip -> bits 23:0 */
+       ldr     ip, [r0], #0x04         /* Fetch the next source word */
+       subs    r2, r2, #0x04           /* Flags are consumed by bge below */
+       orr     r4, r4, ip, lsl #24     /* Low byte of new word on top */
+       str     r4, [r1], #0x04
+       bge     .Lkcopy_bad1_loop4
+       sub     r0, r0, #0x03           /* r0 back over 3 unstored bytes */
+       b       .Lkcopy_bad_endgame
+
+.Lkcopy_bad2_loop16:
+       mov     r4, ip, lsr #16         /* Upper half of ip -> bits 15:0 */
+       ldr     r5, [r0], #0x04
+       pld     [r0, #0x018]            /* Prefetch ahead of the loads */
+       ldr     r6, [r0], #0x04
+       ldr     r7, [r0], #0x04
+       ldr     ip, [r0], #0x04         /* ip = carry word for next round */
+       orr     r4, r4, r5, lsl #16     /* Low half of next word on top */
+       mov     r5, r5, lsr #16
+       orr     r5, r5, r6, lsl #16
+       mov     r6, r6, lsr #16
+       orr     r6, r6, r7, lsl #16
+       mov     r7, r7, lsr #16
+       orr     r7, r7, ip, lsl #16
+       str     r4, [r1], #0x04         /* Store the four merged words */
+       str     r5, [r1], #0x04
+       str     r6, [r1], #0x04
+       str     r7, [r1], #0x04
+.Lkcopy_bad2:
+       subs    r2, r2, #0x10           /* Another 16 bytes to copy? */
+       bge     .Lkcopy_bad2_loop16
+
+       adds    r2, r2, #0x10           /* Adjust for extra sub */
+       popeq   {r4-r7}                 /* Nothing left: restore registers */
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x04           /* At least one whole word left? */
+       sublt   r0, r0, #0x02           /* No: r0 back over 2 unstored bytes */
+       blt     .Lkcopy_bad_endgame
+
+.Lkcopy_bad2_loop4:
+       mov     r4, ip, lsr #16         /* Upper half of ip -> bits 15:0 */
+       ldr     ip, [r0], #0x04         /* Fetch the next source word */
+       subs    r2, r2, #0x04           /* Flags are consumed by bge below */
+       orr     r4, r4, ip, lsl #16     /* Low half of new word on top */
+       str     r4, [r1], #0x04
+       bge     .Lkcopy_bad2_loop4
+       sub     r0, r0, #0x02           /* r0 back over 2 unstored bytes */
+       b       .Lkcopy_bad_endgame
+
+.Lkcopy_bad3_loop16:
+       mov     r4, ip, lsr #24         /* Top byte of ip -> bits 7:0 */
+       ldr     r5, [r0], #0x04
+       pld     [r0, #0x018]            /* Prefetch ahead of the loads */
+       ldr     r6, [r0], #0x04
+       ldr     r7, [r0], #0x04
+       ldr     ip, [r0], #0x04         /* ip = carry word for next round */
+       orr     r4, r4, r5, lsl #8      /* Low 3 bytes of next word on top */
+       mov     r5, r5, lsr #24
+       orr     r5, r5, r6, lsl #8
+       mov     r6, r6, lsr #24
+       orr     r6, r6, r7, lsl #8
+       mov     r7, r7, lsr #24
+       orr     r7, r7, ip, lsl #8
+       str     r4, [r1], #0x04         /* Store the four merged words */
+       str     r5, [r1], #0x04
+       str     r6, [r1], #0x04
+       str     r7, [r1], #0x04
+.Lkcopy_bad3:
+       subs    r2, r2, #0x10           /* Another 16 bytes to copy? */
+       bge     .Lkcopy_bad3_loop16
+
+       adds    r2, r2, #0x10           /* Adjust for extra sub */
+       popeq   {r4-r7}                 /* Nothing left: restore registers */
+       moveq   pc, lr                  /* Return now if done */
+       subs    r2, r2, #0x04           /* At least one whole word left? */
+       sublt   r0, r0, #0x01           /* No: r0 back over 1 unstored byte */
+       blt     .Lkcopy_bad_endgame
+
+.Lkcopy_bad3_loop4:
+       mov     r4, ip, lsr #24         /* Top byte of ip -> bits 7:0 */
+       ldr     ip, [r0], #0x04         /* Fetch the next source word */
+       subs    r2, r2, #0x04           /* Flags are consumed by bge below */
+       orr     r4, r4, ip, lsl #8      /* Low 3 bytes of new word on top */
+       str     r4, [r1], #0x04
+       bge     .Lkcopy_bad3_loop4
+       sub     r0, r0, #0x01           /* r0 back over 1 unstored byte */
+
+.Lkcopy_bad_endgame:
+       pop     {r4-r7}                 /* Restore the scratch registers */
+       mov     r3, #0x00               /* Presumably: no regs left to restore */
+       adds    r2, r2, #0x04           /* Undo the last sub: bytes left */
+       moveq   pc, lr                  /* Return now if done */
+.Lkcopy_bad_endgame2:
+       rsbs    r2, r2, #0x03           /* r2 = 3 - left = byte copies to skip */
+       addne   pc, pc, r2, lsl #3      /* Skip r2 ldrb/strb pairs (8 B each) */
+       nop                             /* Filler: pc reads 8 past the addne */
+       ldrb    ip, [r0], #0x01         /* Reached only with 3 bytes left */
+       strb    ip, [r1], #0x01
+       ldrb    ip, [r0], #0x01         /* Entry point with 2 bytes left */
+       strb    ip, [r1], #0x01
+       ldrb    ip, [r0]                /* Entry point with 1 byte left */
+       strb    ip, [r1]
+       RET
+END(kcopy)

Reply via email to