Module Name: src
Committed By: christos
Date: Sun Mar 17 00:42:32 UTC 2013
Added Files:
src/common/lib/libc/arch/sparc64/string: memcpy.S memset.S strmacros.h
Log Message:
Use a single copy of the source.
To generate a diff of this commit:
cvs rdiff -u -r0 -r1.1 src/common/lib/libc/arch/sparc64/string/memcpy.S \
src/common/lib/libc/arch/sparc64/string/memset.S \
src/common/lib/libc/arch/sparc64/string/strmacros.h
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Added files:
Index: src/common/lib/libc/arch/sparc64/string/memcpy.S
diff -u /dev/null src/common/lib/libc/arch/sparc64/string/memcpy.S:1.1
--- /dev/null Sat Mar 16 20:42:32 2013
+++ src/common/lib/libc/arch/sparc64/string/memcpy.S Sat Mar 16 20:42:31 2013
@@ -0,0 +1,1624 @@
+/* $NetBSD: memcpy.S,v 1.1 2013/03/17 00:42:31 christos Exp $ */
+
+/*
+ * Copyright (c) 1996-2002 Eduardo Horvath
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include "strmacros.h"
+
+/*
+ * kernel memcpy
+ * Assumes regions do not overlap; returns the destination pointer.
+ *
+ * Must not use %g7 (see copyin/copyout, which live outside this file).
+ */
+/*
+ * memcpy(dest, src, size) / bcopy(src, dest, size) entry points.
+ *
+ * memcpy swaps its first two arguments so that both entry points
+ * continue with %o0 = src, %o1 = dest, %o2 = size.
+ *
+ * With DEBUG defined the call is traced through printf (in the kernel
+ * only when bit 0x80 of pmapdebug is set). The trace runs after the
+ * swap, so inside the register window %i0 = src and %i1 = dest; they
+ * are handed to printf as dest, src to match the "dest<-src" format.
+ */
+ENTRY(memcpy) /* dest, src, size */
+ /*
+ * Swap args for bcopy. Gcc generates calls to memcpy for
+ * structure assignments.
+ */
+ mov %o0, %o3
+ mov %o1, %o0
+ mov %o3, %o1
+#if !defined(_KERNEL) || defined(_RUMPKERNEL)
+ENTRY(bcopy) /* src, dest, size */
+#endif
+#ifdef DEBUG
+#if defined(_KERNEL) && !defined(_RUMPKERNEL)
+ set pmapdebug, %o4
+ ld [%o4], %o4
+ btst 0x80, %o4 ! PDB_COPY
+ bz,pt %icc, 3f ! tracing off: skip the printf
+ nop
+#endif
+ save %sp, -CC64FSZ, %sp ! new window keeps the caller args in %i0-%i2
+ mov %i1, %o1 ! 1st %p: dest
+ set 2f, %o0
+ mov %i0, %o2 ! 2nd %p: src
+ call printf
+ mov %i2, %o3 ! (delay slot) size, printed with %lx
+! ta 1; nop
+ restore
+ .data
+2: .asciz "memcpy(%p<-%p,%lx)\n"
+ _ALIGN
+ .text
+3:
+#endif
+
+ ! Pick a strategy by length: fewer than BCOPY_SMALL bytes are copied
+ ! one byte at a time below; anything else branches to 2f.
+ ! Here %o0 = src, %o1 = dest, %o2 = len.
+ cmp %o2, BCOPY_SMALL
+
+Lmemcpy_start:
+ bge,pt CCCR, 2f ! if >= this many, go be fancy.
+ cmp %o2, 256 ! (delay slot) cc for the USE_BLOCK_STORE_LOAD test at 2f
+
+ mov %o1, %o5 ! Save memcpy return value
+ /*
+ * Not much to copy, just do it a byte at a time.
+ */
+ deccc %o2 ! while (--len >= 0)
+ bl 1f
+ .empty
+0:
+ inc %o0 ! also sits in the delay slot of the bl above
+ ldsb [%o0 - 1], %o4 ! *dst++ = (++src)[-1];
+ stb %o4, [%o1]
+ deccc %o2
+ bge 0b
+ inc %o1 ! (delay slot) advance dest
+1:
+ retl
+ mov %o5, %o0 ! (delay slot) return the saved dest pointer
+ NOTREACHED
+
+ /*
+ * Plenty of data to copy, so try to do it optimally.
+ *
+ * After the save below: %l0 = src, %l1 = dest, %l2 = bytes left.
+ * Up to 1 + 2 + 4 bytes are copied so that dest becomes 8-byte
+ * aligned; each step assembles its datum in %l4 from whatever
+ * alignment src has at that point.
+ */
+2:
+#ifdef USE_BLOCK_STORE_LOAD
+ ! If it is big enough, use VIS instructions
+ bge Lmemcpy_block ! cc comes from the cmp %o2, 256 in the delay slot above
+ nop
+#endif /* USE_BLOCK_STORE_LOAD */
+Lmemcpy_fancy:
+
+ !!
+ !! First align the output to an 8-byte entity
+ !!
+
+ save %sp, -CC64FSZ, %sp
+
+ mov %i0, %l0 ! src
+ mov %i1, %l1 ! dest
+
+ mov %i2, %l2 ! len
+ btst 1, %l1 ! dest odd?
+
+ bz,pt %icc, 4f
+ btst 2, %l1 ! (delay slot) cc for the test at 4f
+ ldub [%l0], %l4 ! Load 1st byte
+
+ deccc 1, %l2
+ ble,pn CCCR, Lmemcpy_finish ! XXXX
+ inc 1, %l0
+
+ stb %l4, [%l1] ! Store 1st byte
+ inc 1, %l1 ! Update address
+ btst 2, %l1 ! re-test with the updated dest
+4:
+ bz,pt %icc, 4f ! dest already 4-byte aligned: skip the short
+
+ btst 1, %l0 ! (delay slot) is src 2-byte aligned?
+ bz,a 1f
+ lduh [%l0], %l4 ! Load short
+
+ ldub [%l0], %l4 ! Load bytes
+
+ ldub [%l0+1], %l3
+ sllx %l4, 8, %l4
+ or %l3, %l4, %l4 ! short built from two bytes
+
+1:
+ deccc 2, %l2
+ ble,pn CCCR, Lmemcpy_finish ! XXXX
+ inc 2, %l0
+ sth %l4, [%l1] ! Store 1st short
+
+ inc 2, %l1
+4:
+ btst 4, %l1
+ bz,pt CCCR, 4f ! dest already 8-byte aligned: skip the word
+
+ btst 3, %l0 ! (delay slot) is src 4-byte aligned?
+ bz,a,pt CCCR, 1f
+ lduw [%l0], %l4 ! Load word -1
+
+ btst 1, %l0 ! src 2-byte aligned?
+ bz,a,pt %icc, 2f
+ lduh [%l0], %l4
+
+ ldub [%l0], %l4 ! odd src: byte + short + byte
+
+ lduh [%l0+1], %l3
+ sllx %l4, 16, %l4
+ or %l4, %l3, %l4
+
+ ldub [%l0+3], %l3
+ sllx %l4, 8, %l4
+ ba,pt %icc, 1f
+ or %l4, %l3, %l4
+
+2:
+ lduh [%l0+2], %l3 ! 2-byte aligned src: short + short
+ sllx %l4, 16, %l4
+ or %l4, %l3, %l4
+
+1:
+ deccc 4, %l2
+ ble,pn CCCR, Lmemcpy_finish ! XXXX
+ inc 4, %l0
+
+ st %l4, [%l1] ! Store word
+ inc 4, %l1
+4:
+ !!
+ !! We are now 64-bit (8-byte) aligned in the dest.
+ !!
+ !! Split src into an 8-byte aligned address (%l0) and a byte offset.
+ !! Offset 0 goes to Lmemcpy_noshift8. Otherwise %l4 holds the left
+ !! shift in bits and %l3 the complementary right shift; every output
+ !! doubleword is (previous << %l4) | (next >> %l3).
+ !!
+Lmemcpy_common:
+
+ and %l0, 7, %l4 ! Shift amount
+ andn %l0, 7, %l0 ! Source addr
+
+ brz,pt %l4, Lmemcpy_noshift8 ! No shift version...
+
+ sllx %l4, 3, %l4 ! In bits
+ mov 8<<3, %l3
+
+ ldx [%l0], %o0 ! Load word -1
+ sub %l3, %l4, %l3 ! Reverse shift
+ deccc 12*8, %l2 ! Have enough room?
+
+ sllx %o0, %l4, %o0
+ bl,pn CCCR, 2f ! fewer than 12 doublewords: skip the unrolled loop
+ and %l3, 0x38, %l3
+Lmemcpy_unrolled8:
+
+ /*
+ * This is about as close to optimal as you can get, since
+ * the shifts require EU0 and cannot be paired, and you have
+ * 3 dependent operations on the data.
+ *
+ * Six doublewords are stored per iteration while the next six
+ * are loaded into %o0-%o5.
+ */
+
+! ldx [%l0+0*8], %o0 ! Already done
+! sllx %o0, %l4, %o0 ! Already done
+ ldx [%l0+1*8], %o1
+ ldx [%l0+2*8], %o2
+ ldx [%l0+3*8], %o3
+ ldx [%l0+4*8], %o4
+ ba,pt %icc, 1f
+ ldx [%l0+5*8], %o5
+ .align 8
+1:
+ srlx %o1, %l3, %g1
+ inc 6*8, %l0
+
+ sllx %o1, %l4, %o1
+ or %g1, %o0, %g6
+ ldx [%l0+0*8], %o0
+
+ stx %g6, [%l1+0*8]
+ srlx %o2, %l3, %g1
+
+ sllx %o2, %l4, %o2
+ or %g1, %o1, %g6
+ ldx [%l0+1*8], %o1
+
+ stx %g6, [%l1+1*8]
+ srlx %o3, %l3, %g1
+
+ sllx %o3, %l4, %o3
+ or %g1, %o2, %g6
+ ldx [%l0+2*8], %o2
+
+ stx %g6, [%l1+2*8]
+ srlx %o4, %l3, %g1
+
+ sllx %o4, %l4, %o4
+ or %g1, %o3, %g6
+ ldx [%l0+3*8], %o3
+
+ stx %g6, [%l1+3*8]
+ srlx %o5, %l3, %g1
+
+ sllx %o5, %l4, %o5
+ or %g1, %o4, %g6
+ ldx [%l0+4*8], %o4
+
+ stx %g6, [%l1+4*8]
+ srlx %o0, %l3, %g1
+ deccc 6*8, %l2 ! Have enough room?
+
+ sllx %o0, %l4, %o0 ! Next loop
+ or %g1, %o5, %g6
+ ldx [%l0+5*8], %o5
+
+ stx %g6, [%l1+5*8]
+ bge,pt CCCR, 1b
+ inc 6*8, %l1 ! (delay slot) advance dest by six doublewords
+
+Lmemcpy_unrolled8_cleanup:
+ !!
+ !! Finished 8 byte block, unload the regs.
+ !!
+ !! Five more doublewords are combined and stored from the data
+ !! already sitting in %o0-%o5; no further loads are issued here.
+ !!
+ srlx %o1, %l3, %g1
+ inc 5*8, %l0
+
+ sllx %o1, %l4, %o1
+ or %g1, %o0, %g6
+
+ stx %g6, [%l1+0*8]
+ srlx %o2, %l3, %g1
+
+ sllx %o2, %l4, %o2
+ or %g1, %o1, %g6
+
+ stx %g6, [%l1+1*8]
+ srlx %o3, %l3, %g1
+
+ sllx %o3, %l4, %o3
+ or %g1, %o2, %g6
+
+ stx %g6, [%l1+2*8]
+ srlx %o4, %l3, %g1
+
+ sllx %o4, %l4, %o4
+ or %g1, %o3, %g6
+
+ stx %g6, [%l1+3*8]
+ srlx %o5, %l3, %g1
+
+ sllx %o5, %l4, %o5
+ or %g1, %o4, %g6
+
+ stx %g6, [%l1+4*8]
+ inc 5*8, %l1
+
+ mov %o5, %o0 ! Save our unused data
+ dec 5*8, %l2
+2:
+ inccc 12*8, %l2 ! undo the 12*8 bias applied before the unrolled loop
+ bz,pn %icc, Lmemcpy_complete
+
+ !! Unrolled 8 times
+Lmemcpy_aligned8:
+! ldx [%l0], %o0 ! Already done
+! sllx %o0, %l4, %o0 ! Shift high word
+
+ deccc 8, %l2 ! Pre-decrement (also the delay slot of the bz above)
+ bl,pn CCCR, Lmemcpy_finish
+1:
+ ldx [%l0+8], %o1 ! Load word 0 (also the delay slot of the bl above)
+ inc 8, %l0
+
+ srlx %o1, %l3, %g6
+ or %g6, %o0, %g6 ! Combine
+
+ stx %g6, [%l1] ! Store result
+ inc 8, %l1
+
+ deccc 8, %l2
+ bge,pn CCCR, 1b
+ sllx %o1, %l4, %o0 ! (delay slot) leftover bits become the next high part
+
+ btst 7, %l2 ! Done?
+ bz,pt CCCR, Lmemcpy_complete
+
+ !!
+ !! Loadup the last dregs into %o0 and shift it into place
+ !!
+ srlx %l3, 3, %g6 ! # bytes in %o0 (also the delay slot of the bz above)
+ dec 8, %g6 ! - 8
+ !! n-8 - (by - 8) -> n - by
+ subcc %l2, %g6, %g0 ! # bytes we need
+ ble,pt %icc, Lmemcpy_finish
+ nop
+ ldx [%l0+8], %o1 ! Need another word
+ srlx %o1, %l3, %o1
+ ba,pt %icc, Lmemcpy_finish
+ or %o0, %o1, %o0 ! All loaded up.
+
+
+ !! Source and dest are both 8-byte aligned: copy six doublewords
+ !! per iteration, then single doublewords, and leave any final
+ !! partial doubleword in %o0 for Lmemcpy_finish.
+Lmemcpy_noshift8:
+ deccc 6*8, %l2 ! Have enough room?
+ bl,pn CCCR, 2f
+ nop
+ ba,pt %icc, 1f
+ nop
+ .align 32
+1:
+ ldx [%l0+0*8], %o0
+ ldx [%l0+1*8], %o1
+ ldx [%l0+2*8], %o2
+ stx %o0, [%l1+0*8]
+ stx %o1, [%l1+1*8]
+ stx %o2, [%l1+2*8]
+
+
+ ldx [%l0+3*8], %o3
+ ldx [%l0+4*8], %o4
+ ldx [%l0+5*8], %o5
+ inc 6*8, %l0
+ stx %o3, [%l1+3*8]
+ deccc 6*8, %l2
+ stx %o4, [%l1+4*8]
+ stx %o5, [%l1+5*8]
+ bge,pt CCCR, 1b
+ inc 6*8, %l1
+2:
+ inc 6*8, %l2 ! undo the pre-decrement
+1:
+ deccc 8, %l2
+ bl,pn %icc, 1f ! < 0 --> sub word
+ nop
+ ldx [%l0], %g6
+ inc 8, %l0
+ stx %g6, [%l1]
+ bg,pt %icc, 1b ! Exactly 0 --> done (cc still from the deccc above)
+ inc 8, %l1
+1:
+ btst 7, %l2 ! Done?
+ bz,pt CCCR, Lmemcpy_complete
+ clr %l4
+ ldx [%l0], %o0 ! doubleword holding the trailing bytes
+ !! Store the trailing bytes. %o0 holds them left-justified, %l1 is
+ !! the dest and the low three bits of %l2 give the count, which is
+ !! written as an optional word, short and byte. The shifts in the
+ !! branch delay slots run whether or not the branch is taken, so
+ !! the next unstored bytes are always moved down into position.
+Lmemcpy_finish:
+
+ brz,pn %l2, 2f ! 100% complete?
+ cmp %l2, 8 ! Exactly 8 bytes?
+ bz,a,pn CCCR, 2f
+ stx %o0, [%l1]
+
+ btst 4, %l2 ! Word store?
+ bz CCCR, 1f
+ srlx %o0, 32, %g6 ! Shift high word down
+ stw %g6, [%l1]
+ inc 4, %l1
+ mov %o0, %g6 ! Operate on the low bits
+1:
+ btst 2, %l2
+ mov %g6, %o0
+ bz 1f
+ srlx %o0, 16, %g6
+
+ sth %g6, [%l1] ! Store short
+ inc 2, %l1
+ mov %o0, %g6 ! Operate on low bytes
+1:
+ mov %g6, %o0
+ btst 1, %l2 ! Byte aligned?
+ bz 2f
+ srlx %o0, 8, %g6
+
+ stb %g6, [%l1] ! Store last byte
+ inc 1, %l1 ! Update address
+2:
+Lmemcpy_complete:
+#if 0
+ !!
+ !! verify copy success.
+ !!
+ !! (disabled) byte-compares src and dest, prints a diagnostic
+ !! and traps with ta 1 on the first mismatch.
+ !!
+
+ mov %i0, %o2
+ mov %i1, %o4
+ mov %i2, %l4
+0:
+ ldub [%o2], %o1
+ inc %o2
+ ldub [%o4], %o3
+ inc %o4
+ cmp %o3, %o1
+ bnz 1f
+ dec %l4
+ brnz %l4, 0b
+ nop
+ ba 2f
+ nop
+
+1:
+ set 0f, %o0
+ call printf
+ sub %i2, %l4, %o5
+ set 1f, %o0
+ mov %i0, %o2
+ mov %i1, %o1
+ call printf
+ mov %i2, %o3
+ ta 1
+ .data
+0: .asciz "memcpy failed: %x@%p != %x@%p byte %d\n"
+1: .asciz "memcpy(%p, %p, %lx)\n"
+ .align 8
+ .text
+2:
+#endif
+ ret
+ restore %i1, %g0, %o0 ! (delay slot) return the original dest pointer
+
+#ifdef USE_BLOCK_STORE_LOAD
+
+/*
+ * Block copy. Useful for >256 byte copies.
+ *
+ * Benchmarking has shown this always seems to be slower than
+ * the integer version, so this is disabled. Maybe someone will
+ * figure out why sometime.
+ */
+
+ !! Entry to the VIS block-copy path. Falls back to Lmemcpy_fancy
+ !! when block_disable is non-zero or %tba does not equal trapbase.
+Lmemcpy_block:
+ sethi %hi(block_disable), %o3
+ ldx [ %o3 + %lo(block_disable) ], %o3
+ brnz,pn %o3, Lmemcpy_fancy
+ !! Make sure our trap table is installed
+ !! NOTE(review): the set below follows a branch with no explicit
+ !! delay-slot filler; if set expands to two instructions only the
+ !! first one sits in the delay slot - confirm this is intended.
+ set _C_LABEL(trapbase), %o5
+ rdpr %tba, %o3
+ sub %o3, %o5, %o3
+ brnz,pn %o3, Lmemcpy_fancy ! No, then don't use block load/store
+ nop
+#if defined(_KERNEL) && !defined(_RUMPKERNEL)
+/*
+ * Kernel:
+ *
+ * Here we use VIS instructions to do a block copy.
+ * But before we can do that we need to save and enable the FPU.
+ * The last owner of the FPU registers is fplwp, and
+ * fplwp->l_md.md_fpstate is the current fpstate. If that's not
+ * null, call savefpstate() with it to store our current fp state.
+ *
+ * Next, allocate an aligned fpstate on the stack. We will properly
+ * nest calls on a particular stack so this should not be a problem.
+ *
+ * Now we grab either curlwp (or if we're on the interrupt stack
+ * lwp0). We stash its existing fpstate in a local register and
+ * put our new fpstate in curlwp->p_md.md_fpstate. We point
+ * fplwp at curlwp (or lwp0) and enable the FPU.
+ *
+ * If we are ever preempted, our FPU state will be saved in our
+ * fpstate. Then, when we're resumed and we take an FPDISABLED
+ * trap, the trap handler will be able to fish our FPU state out
+ * of curlwp (or lwp0).
+ *
+ * On exiting this routine we undo the damage: restore the original
+ * pointer to curlwp->p_md.md_fpstate, clear our fplwp, and disable
+ * the FPU.
+ *
+ *
+ * Register usage, Kernel only (after save):
+ *
+ * %i0 src
+ * %i1 dest
+ * %i2 size
+ *
+ * %l0 XXXX DEBUG old fpstate
+ * %l1 fplwp (hi bits only)
+ * %l2 orig fplwp
+ * %l3 orig fpstate
+ * %l5 curlwp
+ * %l6 old fpstate
+ *
+ * Register usage, Kernel and user:
+ *
+ * %g1 dest (retval for memcpy)
+ *
+ * %o0 src
+ * %o1 dest
+ * %o2 bytes left to copy
+ * %o5 last safe fetchable address
+ */
+
+ ENABLE_FPU(0)
+
+ mov %i0, %o0 ! Src addr.
+ mov %i1, %o1 ! Store our dest ptr here.
+ mov %i2, %o2 ! Len counter
+#endif /* _KERNEL */
+
+ !!
+ !! First align the output to a 64-bit entity
+ !!
+ !! %f0/%f2 hold two consecutive source doublewords and %o3 the
+ !! address %f0 was (or would have been) loaded from. Each step
+ !! runs alignaddr on src plus a negative offset, slides the
+ !! register pair forward if the aligned base moved, and then uses
+ !! faligndata to build the datum that is stored.
+ !!
+
+ mov %o1, %g1 ! memcpy retval
+ add %o0, %o2, %o5 ! End of source block
+
+ andn %o0, 7, %o3 ! Start of block
+ dec %o5
+ fzero %f0
+
+ andn %o5, BLOCK_ALIGN, %o5 ! Last safe addr.
+ ldd [%o3], %f2 ! Load 1st word
+
+ dec 8, %o3 ! Move %o3 1 word back
+ btst 1, %o1
+ bz 4f
+
+ mov -7, %o4 ! Lowest src addr possible
+ alignaddr %o0, %o4, %o4 ! Base addr for load.
+
+ cmp %o3, %o4
+ be,pt CCCR, 1f ! Already loaded?
+ mov %o4, %o3
+ fmovd %f2, %f0 ! No. Shift
+ ldd [%o3+8], %f2 ! And load
+1:
+
+ faligndata %f0, %f2, %f4 ! Isolate 1st byte
+
+ stda %f4, [%o1] ASI_FL8_P ! Store 1st byte
+ inc 1, %o1 ! Update address
+ inc 1, %o0
+ dec 1, %o2
+4:
+ btst 2, %o1
+ bz 4f
+
+ mov -6, %o4 ! Calculate src - 6
+ alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
+
+ cmp %o3, %o4 ! Addresses same?
+ be,pt CCCR, 1f
+ mov %o4, %o3
+ fmovd %f2, %f0 ! Shuffle data
+ ldd [%o3+8], %f2 ! Load word 0
+1:
+ faligndata %f0, %f2, %f4 ! Move 1st short to the low part of %f4
+
+ stda %f4, [%o1] ASI_FL16_P ! Store 1st short
+ dec 2, %o2
+ inc 2, %o1
+ inc 2, %o0
+4:
+ brz,pn %o2, Lmemcpy_blockfinish ! XXXX
+
+ btst 4, %o1
+ bz 4f
+
+ mov -4, %o4
+ alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
+
+ cmp %o3, %o4 ! Addresses same?
+ beq,pt CCCR, 1f
+ mov %o4, %o3
+ fmovd %f2, %f0 ! Shuffle data
+ ldd [%o3+8], %f2 ! Load word 0
+1:
+ faligndata %f0, %f2, %f4 ! Move 1st word to the low half of %f4 (%f5)
+
+ st %f5, [%o1] ! Store word
+ dec 4, %o2
+ inc 4, %o1
+ inc 4, %o0
+4:
+ brz,pn %o2, Lmemcpy_blockfinish ! XXXX
+ !!
+ !! We are now 64-bit (8-byte) aligned in the dest.
+ !!
+Lmemcpy_block_common:
+
+ mov -0, %o4
+ alignaddr %o0, %o4, %o4 ! base - shift
+
+ cmp %o3, %o4 ! Addresses same?
+ beq,pt CCCR, 1f
+ mov %o4, %o3
+ fmovd %f2, %f0 ! Shuffle data
+ ldd [%o3+8], %f2 ! Load word 0
+1:
+ add %o3, 8, %o0 ! now use %o0 for src
+
+ !!
+ !! Continue until our dest is block aligned
+ !!
+ !! One doubleword is stored per pass until the BLOCK_ALIGN bits
+ !! of dest are clear. %f48 receives a copy of each result because
+ !! Lmemcpy_blockfinish takes its leftover data from %f48.
+ !!
+Lmemcpy_block_aligned8:
+1:
+ brz %o2, Lmemcpy_blockfinish
+ btst BLOCK_ALIGN, %o1 ! Block aligned?
+ bz 1f
+
+ faligndata %f0, %f2, %f4 ! Generate result
+ deccc 8, %o2
+ ble,pn %icc, Lmemcpy_blockfinish ! Should never happen
+ fmovd %f4, %f48
+
+ std %f4, [%o1] ! Store result
+ inc 8, %o1
+
+ fmovd %f2, %f0
+ inc 8, %o0
+ ba,pt %xcc, 1b ! Not yet.
+ ldd [%o0], %f2 ! Load next part
+Lmemcpy_block_aligned64:
+1:
+
+/*
+ * 64-byte aligned -- ready for block operations.
+ *
+ * Here we have the destination block aligned, but the
+ * source pointer may not be. Sub-word alignment will
+ * be handled by faligndata instructions. But the source
+ * can still be potentially aligned to 8 different words
+ * in our 64-bit block, so we have 8 different copy routines.
+ *
+ * Once we figure out our source alignment, we branch
+ * to the appropriate copy routine, which sets up the
+ * alignment for faligndata and loads (sets) the values
+ * into the source registers and does the copy loop.
+ *
+ * When were down to less than 1 block to store, we
+ * exit the copy loop and execute cleanup code.
+ *
+ * Block loads and stores are not properly interlocked.
+ * Stores save one reg/cycle, so you can start overwriting
+ * registers the cycle after the store is issued.
+ *
+ * Block loads require a block load to a different register
+ * block or a membar #Sync before accessing the loaded
+ * data.
+ *
+ * Since the faligndata instructions may be offset as far
+ * as 7 registers into a block (if you are shifting source
+ * 7 -> dest 0), you need 3 source register blocks for full
+ * performance: one you are copying, one you are loading,
+ * and one for interlocking. Otherwise, we would need to
+ * sprinkle the code with membar #Sync and lose the advantage
+ * of running faligndata in parallel with block stores. This
+ * means we are fetching a full 128 bytes ahead of the stores.
+ * We need to make sure the prefetch does not inadvertently
+ * cross a page boundary and fault on data that we will never
+ * store.
+ *
+ */
+ !!
+ !! Dispatch on the doubleword offset (0..7) of src within its
+ !! block to one of the eight copy routines L100..L107.
+ !!
+ !! The enabled variant is a three-level test of bits 4, 2 and 1
+ !! of the offset; each btst sits in the delay slot of the branch
+ !! before the one that consumes its condition codes.
+ !!
+#if 1
+ and %o0, BLOCK_ALIGN, %o3
+ srax %o3, 3, %o3 ! Isolate the offset
+
+ brz %o3, L100 ! 0->0
+ btst 4, %o3
+ bnz %xcc, 4f ! offsets 4..7
+ btst 2, %o3
+ bnz %xcc, 2f ! offsets 2..3
+ btst 1, %o3
+ ba,pt %xcc, L101 ! 0->1
+ nop /* XXX spitfire bug */
+2:
+ bz %xcc, L102 ! 0->2
+ nop
+ ba,pt %xcc, L103 ! 0->3
+ nop /* XXX spitfire bug */
+4:
+ bnz %xcc, 2f ! offsets 6..7
+ btst 1, %o3
+ bz %xcc, L104 ! 0->4
+ nop
+ ba,pt %xcc, L105 ! 0->5
+ nop /* XXX spitfire bug */
+2:
+ bz %xcc, L106 ! 0->6
+ nop
+ ba,pt %xcc, L107 ! 0->7
+ nop /* XXX spitfire bug */
+#else
+
+ !!
+ !! Isolate the word offset, which just happens to be
+ !! the slot in our jump table.
+ !!
+ !! This is 6 insns, most of which cannot be paired,
+ !! which is about the same as the above version.
+ !!
+ !! Every table slot is two instructions (8 bytes), so the
+ !! index is the byte offset of the doubleword: src & 0x38.
+ !!
+ rd %pc, %o4
+1:
+ and %o0, 0x38, %o3 ! doubleword offset in the block, as bytes
+ add %o3, (Lmemcpy_block_jmp - 1b), %o3
+ jmpl %o4 + %o3, %g0
+ nop
+
+ !!
+ !! Jump table
+ !!
+
+Lmemcpy_block_jmp:
+ ba,a,pt %xcc, L100
+ nop
+ ba,a,pt %xcc, L101
+ nop
+ ba,a,pt %xcc, L102
+ nop
+ ba,a,pt %xcc, L103
+ nop
+ ba,a,pt %xcc, L104
+ nop
+ ba,a,pt %xcc, L105
+ nop
+ ba,a,pt %xcc, L106
+ nop
+ ba,a,pt %xcc, L107
+ nop
+#endif
+
+ !!
+ !! Source is block aligned.
+ !!
+ !! Just load a block and go.
+ !!
+ !! The loop at 3: rotates through three register blocks (%f0,
+ !! %f16, %f48). Each stage builds one output block in %f32-%f46
+ !! with faligndata and stores it with stda. The annulled bleu
+ !! issues the next block load only while %o0 <= %o5 (the last
+ !! safe source block); otherwise membar #Sync executes instead.
+ !! %o2 counts the bytes left and ends the loop at
+ !! Lmemcpy_blockdone.
+ !!
+L100:
+#ifdef RETURN_NAME
+ ! NOTE(review): this overwrites %g1, which Lmemcpy_block set up
+ ! as the memcpy return value - confirm before enabling.
+ sethi %hi(1f), %g1
+ ba,pt %icc, 2f
+ or %g1, %lo(1f), %g1
+1:
+ .asciz "L100"
+ .align 8
+2:
+#endif
+ fmovd %f0 , %f62 ! keep the previous doubleword for the first faligndata
+ ldda [%o0] ASI_BLK_P, %f0
+ inc BLOCK_SIZE, %o0
+ cmp %o0, %o5
+ bleu,a,pn %icc, 3f
+ ldda [%o0] ASI_BLK_P, %f16
+ ba,pt %icc, 3f
+ membar #Sync
+
+ .align 32 ! ICache align.
+3:
+ faligndata %f62, %f0, %f32
+ inc BLOCK_SIZE, %o0
+ faligndata %f0, %f2, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f2, %f4, %f36
+ cmp %o0, %o5
+ faligndata %f4, %f6, %f38
+ faligndata %f6, %f8, %f40
+ faligndata %f8, %f10, %f42
+ faligndata %f10, %f12, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f12, %f14, %f46
+
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f48
+ membar #Sync
+2:
+ stda %f32, [%o1] ASI_STORE
+ faligndata %f14, %f16, %f32
+ inc BLOCK_SIZE, %o0
+ faligndata %f16, %f18, %f34
+ inc BLOCK_SIZE, %o1
+ faligndata %f18, %f20, %f36
+ dec BLOCK_SIZE, %o2
+ faligndata %f20, %f22, %f38
+ cmp %o0, %o5
+ faligndata %f22, %f24, %f40
+ faligndata %f24, %f26, %f42
+ faligndata %f26, %f28, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f28, %f30, %f46
+
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f0
+ membar #Sync
+2:
+ stda %f32, [%o1] ASI_STORE
+ faligndata %f30, %f48, %f32
+ inc BLOCK_SIZE, %o0
+ faligndata %f48, %f50, %f34
+ inc BLOCK_SIZE, %o1
+ faligndata %f50, %f52, %f36
+ dec BLOCK_SIZE, %o2
+ faligndata %f52, %f54, %f38
+ cmp %o0, %o5
+ faligndata %f54, %f56, %f40
+ faligndata %f56, %f58, %f42
+ faligndata %f58, %f60, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f60, %f62, %f46
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16 ! Increment is at top
+ membar #Sync
+2:
+ stda %f32, [%o1] ASI_STORE
+ ba 3b
+ inc BLOCK_SIZE, %o1
+
+ !!
+ !! Source at BLOCK_ALIGN+8
+ !!
+ !! We need to load almost 1 complete block by hand.
+ !!
+ !! Seven doublewords go into %f2-%f14 with ldd; %f0 already holds
+ !! the preceding one. The loop at 3: then rotates through the
+ !! register blocks at %f16, %f48 and %f0. The annulled bleu issues
+ !! the next block load only while %o0 <= %o5; otherwise
+ !! membar #Sync executes instead.
+ !!
+L101:
+#ifdef RETURN_NAME
+ sethi %hi(1f), %g1
+ ba,pt %icc, 2f
+ or %g1, %lo(1f), %g1
+1:
+ .asciz "L101"
+ .align 8
+2:
+#endif
+! fmovd %f0, %f0 ! Hoist fmovd
+ ldd [%o0], %f2
+ inc 8, %o0
+ ldd [%o0], %f4
+ inc 8, %o0
+ ldd [%o0], %f6
+ inc 8, %o0
+ ldd [%o0], %f8
+ inc 8, %o0
+ ldd [%o0], %f10
+ inc 8, %o0
+ ldd [%o0], %f12
+ inc 8, %o0
+ ldd [%o0], %f14
+ inc 8, %o0
+
+ cmp %o0, %o5
+ bleu,a,pn %icc, 3f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+3:
+ faligndata %f0, %f2, %f32
+ inc BLOCK_SIZE, %o0
+ faligndata %f2, %f4, %f34
+ cmp %o0, %o5
+ faligndata %f4, %f6, %f36
+ dec BLOCK_SIZE, %o2
+ faligndata %f6, %f8, %f38
+ faligndata %f8, %f10, %f40
+ faligndata %f10, %f12, %f42
+ faligndata %f12, %f14, %f44
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f48
+ membar #Sync
+2:
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f14, %f16, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f16, %f18, %f32
+ inc BLOCK_SIZE, %o0
+ faligndata %f18, %f20, %f34
+ inc BLOCK_SIZE, %o1
+ faligndata %f20, %f22, %f36
+ cmp %o0, %o5
+ faligndata %f22, %f24, %f38
+ dec BLOCK_SIZE, %o2
+ faligndata %f24, %f26, %f40
+ faligndata %f26, %f28, %f42
+ faligndata %f28, %f30, %f44
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f0
+ membar #Sync
+2:
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f30, %f48, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f48, %f50, %f32
+ inc BLOCK_SIZE, %o0
+ faligndata %f50, %f52, %f34
+ inc BLOCK_SIZE, %o1
+ faligndata %f52, %f54, %f36
+ cmp %o0, %o5
+ faligndata %f54, %f56, %f38
+ dec BLOCK_SIZE, %o2
+ faligndata %f56, %f58, %f40
+ faligndata %f58, %f60, %f42
+ faligndata %f60, %f62, %f44
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f62, %f0, %f46
+
+ stda %f32, [%o1] ASI_STORE
+ ba 3b
+ inc BLOCK_SIZE, %o1
+
+ !!
+ !! Source at BLOCK_ALIGN+16
+ !!
+ !! We need to load 6 doubles by hand.
+ !!
+ !! The preceding doubleword is moved from %f0 to %f2 and six more
+ !! are loaded into %f4-%f14 with ldd. The loop at 3: then rotates
+ !! through the register blocks at %f16, %f48 and %f0. The annulled
+ !! bleu issues the next block load only while %o0 <= %o5;
+ !! otherwise membar #Sync executes instead.
+ !!
+L102:
+#ifdef RETURN_NAME
+ sethi %hi(1f), %g1
+ ba,pt %icc, 2f
+ or %g1, %lo(1f), %g1
+1:
+ .asciz "L102"
+ .align 8
+2:
+#endif
+ ldd [%o0], %f4
+ inc 8, %o0
+ fmovd %f0, %f2 ! Hoist fmovd
+ ldd [%o0], %f6
+ inc 8, %o0
+
+ ldd [%o0], %f8
+ inc 8, %o0
+ ldd [%o0], %f10
+ inc 8, %o0
+ ldd [%o0], %f12
+ inc 8, %o0
+ ldd [%o0], %f14
+ inc 8, %o0
+
+ cmp %o0, %o5
+ bleu,a,pn %icc, 3f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+3:
+ faligndata %f2, %f4, %f32
+ inc BLOCK_SIZE, %o0
+ faligndata %f4, %f6, %f34
+ cmp %o0, %o5
+ faligndata %f6, %f8, %f36
+ dec BLOCK_SIZE, %o2
+ faligndata %f8, %f10, %f38
+ faligndata %f10, %f12, %f40
+ faligndata %f12, %f14, %f42
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f48
+ membar #Sync
+2:
+ faligndata %f14, %f16, %f44
+
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f16, %f18, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f18, %f20, %f32
+ inc BLOCK_SIZE, %o0
+ faligndata %f20, %f22, %f34
+ inc BLOCK_SIZE, %o1
+ faligndata %f22, %f24, %f36
+ cmp %o0, %o5
+ faligndata %f24, %f26, %f38
+ dec BLOCK_SIZE, %o2
+ faligndata %f26, %f28, %f40
+ faligndata %f28, %f30, %f42
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f0
+ membar #Sync
+2:
+ faligndata %f30, %f48, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f48, %f50, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f50, %f52, %f32
+ inc BLOCK_SIZE, %o0
+ faligndata %f52, %f54, %f34
+ inc BLOCK_SIZE, %o1
+ faligndata %f54, %f56, %f36
+ cmp %o0, %o5
+ faligndata %f56, %f58, %f38
+ dec BLOCK_SIZE, %o2
+ faligndata %f58, %f60, %f40
+ faligndata %f60, %f62, %f42
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ faligndata %f62, %f0, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f0, %f2, %f46
+
+ stda %f32, [%o1] ASI_STORE
+ ba 3b
+ inc BLOCK_SIZE, %o1
+
+ !!
+ !! Source at BLOCK_ALIGN+24
+ !!
+ !! We need to load 5 doubles by hand.
+ !!
+ !! The preceding doubleword is moved from %f0 to %f4 and five more
+ !! are loaded into %f6-%f14 with ldd. The loop at 3: then rotates
+ !! through the register blocks at %f16, %f48 and %f0. Every
+ !! "last safe block" test compares addresses and so uses the
+ !! unsigned bleu: the annulled branch issues the next block load
+ !! only while %o0 <= %o5, otherwise membar #Sync executes instead.
+ !!
+L103:
+#ifdef RETURN_NAME
+ sethi %hi(1f), %g1
+ ba,pt %icc, 2f
+ or %g1, %lo(1f), %g1
+1:
+ .asciz "L103"
+ .align 8
+2:
+#endif
+ fmovd %f0, %f4
+ ldd [%o0], %f6
+ inc 8, %o0
+ ldd [%o0], %f8
+ inc 8, %o0
+ ldd [%o0], %f10
+ inc 8, %o0
+ ldd [%o0], %f12
+ inc 8, %o0
+ ldd [%o0], %f14
+ inc 8, %o0
+
+ cmp %o0, %o5
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ inc BLOCK_SIZE, %o0
+3:
+ faligndata %f4, %f6, %f32
+ cmp %o0, %o5
+ faligndata %f6, %f8, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f8, %f10, %f36
+ faligndata %f10, %f12, %f38
+ faligndata %f12, %f14, %f40
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f48
+ membar #Sync
+2:
+ faligndata %f14, %f16, %f42
+ inc BLOCK_SIZE, %o0
+ faligndata %f16, %f18, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f18, %f20, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f20, %f22, %f32
+ cmp %o0, %o5
+ faligndata %f22, %f24, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f24, %f26, %f36
+ inc BLOCK_SIZE, %o1
+ faligndata %f26, %f28, %f38
+ faligndata %f28, %f30, %f40
+ bleu,a,pn %icc, 2f ! unsigned address compare, like every other stage
+ ldda [%o0] ASI_BLK_P, %f0
+ membar #Sync
+2:
+ faligndata %f30, %f48, %f42
+ inc BLOCK_SIZE, %o0
+ faligndata %f48, %f50, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f50, %f52, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f52, %f54, %f32
+ cmp %o0, %o5
+ faligndata %f54, %f56, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f56, %f58, %f36
+ faligndata %f58, %f60, %f38
+ inc BLOCK_SIZE, %o1
+ faligndata %f60, %f62, %f40
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ faligndata %f62, %f0, %f42
+ inc BLOCK_SIZE, %o0
+ faligndata %f0, %f2, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f2, %f4, %f46
+
+ stda %f32, [%o1] ASI_STORE
+ ba 3b
+ inc BLOCK_SIZE, %o1
+
+ !!
+ !! Source at BLOCK_ALIGN+32
+ !!
+ !! We need to load 4 doubles by hand.
+ !!
+ !! The preceding doubleword is moved from %f0 to %f6 and four more
+ !! are loaded into %f8-%f14 with ldd. The loop at 3: then rotates
+ !! through the register blocks at %f16, %f48 and %f0. The annulled
+ !! bleu issues the next block load only while %o0 <= %o5;
+ !! otherwise membar #Sync executes instead.
+ !!
+L104:
+#ifdef RETURN_NAME
+ sethi %hi(1f), %g1
+ ba,pt %icc, 2f
+ or %g1, %lo(1f), %g1
+1:
+ .asciz "L104"
+ .align 8
+2:
+#endif
+ fmovd %f0, %f6
+ ldd [%o0], %f8
+ inc 8, %o0
+ ldd [%o0], %f10
+ inc 8, %o0
+ ldd [%o0], %f12
+ inc 8, %o0
+ ldd [%o0], %f14
+ inc 8, %o0
+
+ cmp %o0, %o5
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ inc BLOCK_SIZE, %o0
+3:
+ faligndata %f6, %f8, %f32
+ cmp %o0, %o5
+ faligndata %f8, %f10, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f10, %f12, %f36
+ faligndata %f12, %f14, %f38
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f48
+ membar #Sync
+2:
+ faligndata %f14, %f16, %f40
+ faligndata %f16, %f18, %f42
+ inc BLOCK_SIZE, %o0
+ faligndata %f18, %f20, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f20, %f22, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f22, %f24, %f32
+ cmp %o0, %o5
+ faligndata %f24, %f26, %f34
+ faligndata %f26, %f28, %f36
+ inc BLOCK_SIZE, %o1
+ faligndata %f28, %f30, %f38
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f0
+ membar #Sync
+2:
+ faligndata %f30, %f48, %f40
+ dec BLOCK_SIZE, %o2
+ faligndata %f48, %f50, %f42
+ inc BLOCK_SIZE, %o0
+ faligndata %f50, %f52, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f52, %f54, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f54, %f56, %f32
+ cmp %o0, %o5
+ faligndata %f56, %f58, %f34
+ faligndata %f58, %f60, %f36
+ inc BLOCK_SIZE, %o1
+ faligndata %f60, %f62, %f38
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ faligndata %f62, %f0, %f40
+ dec BLOCK_SIZE, %o2
+ faligndata %f0, %f2, %f42
+ inc BLOCK_SIZE, %o0
+ faligndata %f2, %f4, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f4, %f6, %f46
+
+ stda %f32, [%o1] ASI_STORE
+ ba 3b
+ inc BLOCK_SIZE, %o1
+
+ !!
+ !! Source at BLOCK_ALIGN+40
+ !!
+ !! We need to load 3 doubles by hand.
+ !!
+ !! The preceding doubleword is moved from %f0 to %f8 and three
+ !! more are loaded into %f10-%f14 with ldd. The loop at 3: then
+ !! rotates through the register blocks at %f16, %f48 and %f0. The
+ !! annulled bleu issues the next block load only while
+ !! %o0 <= %o5; otherwise membar #Sync executes instead.
+ !!
+L105:
+#ifdef RETURN_NAME
+ sethi %hi(1f), %g1
+ ba,pt %icc, 2f
+ or %g1, %lo(1f), %g1
+1:
+ .asciz "L105"
+ .align 8
+2:
+#endif
+ fmovd %f0, %f8
+ ldd [%o0], %f10
+ inc 8, %o0
+ ldd [%o0], %f12
+ inc 8, %o0
+ ldd [%o0], %f14
+ inc 8, %o0
+
+ cmp %o0, %o5
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ inc BLOCK_SIZE, %o0
+3:
+ faligndata %f8, %f10, %f32
+ cmp %o0, %o5
+ faligndata %f10, %f12, %f34
+ faligndata %f12, %f14, %f36
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f48
+ membar #Sync
+2:
+ faligndata %f14, %f16, %f38
+ dec BLOCK_SIZE, %o2
+ faligndata %f16, %f18, %f40
+ inc BLOCK_SIZE, %o0
+ faligndata %f18, %f20, %f42
+ faligndata %f20, %f22, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f22, %f24, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f24, %f26, %f32
+ cmp %o0, %o5
+ faligndata %f26, %f28, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f28, %f30, %f36
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f0
+ membar #Sync
+2:
+ faligndata %f30, %f48, %f38
+ inc BLOCK_SIZE, %o1
+ faligndata %f48, %f50, %f40
+ inc BLOCK_SIZE, %o0
+ faligndata %f50, %f52, %f42
+ faligndata %f52, %f54, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f54, %f56, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f56, %f58, %f32
+ cmp %o0, %o5
+ faligndata %f58, %f60, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f60, %f62, %f36
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ faligndata %f62, %f0, %f38
+ inc BLOCK_SIZE, %o1
+ faligndata %f0, %f2, %f40
+ inc BLOCK_SIZE, %o0
+ faligndata %f2, %f4, %f42
+ faligndata %f4, %f6, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f6, %f8, %f46
+
+ stda %f32, [%o1] ASI_STORE
+ ba 3b
+ inc BLOCK_SIZE, %o1
+
+
+ !!
+ !! Source at BLOCK_ALIGN+48
+ !!
+ !! We need to load 2 doubles by hand.
+ !!
+ !! The preceding doubleword is moved from %f0 to %f10 and two more
+ !! are loaded into %f12-%f14 with ldd. The loop at 3: then rotates
+ !! through the register blocks at %f16, %f48 and %f0. The annulled
+ !! bleu issues the next block load only while %o0 <= %o5;
+ !! otherwise membar #Sync executes instead.
+ !!
+L106:
+#ifdef RETURN_NAME
+ sethi %hi(1f), %g1
+ ba,pt %icc, 2f
+ or %g1, %lo(1f), %g1
+1:
+ .asciz "L106"
+ .align 8
+2:
+#endif
+ fmovd %f0, %f10
+ ldd [%o0], %f12
+ inc 8, %o0
+ ldd [%o0], %f14
+ inc 8, %o0
+
+ cmp %o0, %o5
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ inc BLOCK_SIZE, %o0
+3:
+ faligndata %f10, %f12, %f32
+ cmp %o0, %o5
+ faligndata %f12, %f14, %f34
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f48
+ membar #Sync
+2:
+ faligndata %f14, %f16, %f36
+ dec BLOCK_SIZE, %o2
+ faligndata %f16, %f18, %f38
+ inc BLOCK_SIZE, %o0
+ faligndata %f18, %f20, %f40
+ faligndata %f20, %f22, %f42
+ faligndata %f22, %f24, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f24, %f26, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f26, %f28, %f32
+ cmp %o0, %o5
+ faligndata %f28, %f30, %f34
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f0
+ membar #Sync
+2:
+ faligndata %f30, %f48, %f36
+ dec BLOCK_SIZE, %o2
+ faligndata %f48, %f50, %f38
+ inc BLOCK_SIZE, %o1
+ faligndata %f50, %f52, %f40
+ faligndata %f52, %f54, %f42
+ inc BLOCK_SIZE, %o0
+ faligndata %f54, %f56, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f56, %f58, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f58, %f60, %f32
+ cmp %o0, %o5
+ faligndata %f60, %f62, %f34
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ faligndata %f62, %f0, %f36
+ dec BLOCK_SIZE, %o2
+ faligndata %f0, %f2, %f38
+ inc BLOCK_SIZE, %o1
+ faligndata %f2, %f4, %f40
+ faligndata %f4, %f6, %f42
+ inc BLOCK_SIZE, %o0
+ faligndata %f6, %f8, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f8, %f10, %f46
+
+ stda %f32, [%o1] ASI_STORE
+ ba 3b
+ inc BLOCK_SIZE, %o1
+
+
+ !!
+ !! Source at BLOCK_ALIGN+56
+ !!
+ !! We need to load 1 double by hand.
+ !!
+ !! The preceding doubleword is moved from %f0 to %f12 and one more
+ !! is loaded into %f14 with ldd. The loop at 3: then rotates
+ !! through the register blocks at %f16, %f48 and %f0. The annulled
+ !! bleu issues the next block load only while %o0 <= %o5;
+ !! otherwise membar #Sync executes instead.
+ !!
+L107:
+#ifdef RETURN_NAME
+ sethi %hi(1f), %g1
+ ba,pt %icc, 2f
+ or %g1, %lo(1f), %g1
+1:
+ .asciz "L107"
+ .align 8
+2:
+#endif
+ fmovd %f0, %f12
+ ldd [%o0], %f14
+ inc 8, %o0
+
+ cmp %o0, %o5
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ inc BLOCK_SIZE, %o0
+3:
+ faligndata %f12, %f14, %f32
+ cmp %o0, %o5
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f48
+ membar #Sync
+2:
+ faligndata %f14, %f16, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f16, %f18, %f36
+ inc BLOCK_SIZE, %o0
+ faligndata %f18, %f20, %f38
+ faligndata %f20, %f22, %f40
+ faligndata %f22, %f24, %f42
+ faligndata %f24, %f26, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f26, %f28, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f28, %f30, %f32
+ cmp %o0, %o5
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f0
+ membar #Sync
+2:
+ faligndata %f30, %f48, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f48, %f50, %f36
+ inc BLOCK_SIZE, %o1
+ faligndata %f50, %f52, %f38
+ faligndata %f52, %f54, %f40
+ inc BLOCK_SIZE, %o0
+ faligndata %f54, %f56, %f42
+ faligndata %f56, %f58, %f44
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f58, %f60, %f46
+
+ stda %f32, [%o1] ASI_STORE
+
+ faligndata %f60, %f62, %f32
+ cmp %o0, %o5
+ bleu,a,pn %icc, 2f
+ ldda [%o0] ASI_BLK_P, %f16
+ membar #Sync
+2:
+ faligndata %f62, %f0, %f34
+ dec BLOCK_SIZE, %o2
+ faligndata %f0, %f2, %f36
+ inc BLOCK_SIZE, %o1
+ faligndata %f2, %f4, %f38
+ faligndata %f4, %f6, %f40
+ inc BLOCK_SIZE, %o0
+ faligndata %f6, %f8, %f42
+ faligndata %f8, %f10, %f44
+
+ brlez,pn %o2, Lmemcpy_blockdone
+ faligndata %f10, %f12, %f46
+
+ stda %f32, [%o1] ASI_STORE
+ ba 3b
+ inc BLOCK_SIZE, %o1
+
+ !! Fewer than BLOCK_SIZE bytes remain and %f32-%f46 hold the last
+ !! block that was built. Store it one doubleword at a time while
+ !! at least 8 bytes remain; the first register that cannot be
+ !! stored whole is copied to %f48 (in the annulled delay slot,
+ !! executed only when the branch is taken) for Lmemcpy_blockfinish.
+Lmemcpy_blockdone:
+ inc BLOCK_SIZE, %o2 ! Fixup our overcommit
+ membar #Sync ! Finish any pending loads
+#define FINISH_REG(f) \
+ deccc 8, %o2; \
+ bl,a Lmemcpy_blockfinish; \
+ fmovd f, %f48; \
+ std f, [%o1]; \
+ inc 8, %o1
+
+ FINISH_REG(%f32)
+ FINISH_REG(%f34)
+ FINISH_REG(%f36)
+ FINISH_REG(%f38)
+ FINISH_REG(%f40)
+ FINISH_REG(%f42)
+ FINISH_REG(%f44)
+ FINISH_REG(%f46)
+ FINISH_REG(%f48)
+#undef FINISH_REG
+ !!
+ !! The low 3 bits have the sub-word bits needed to be
+ !! stored [because (x-8)&0x7 == x].
+ !!
+ !! %f48 holds the final partial doubleword; it is copied to %f4
+ !! and written out as an optional word, short and byte according
+ !! to bits 4, 2 and 1 of %o2. Returns the dest pointer kept in %g1.
+ !!
+Lmemcpy_blockfinish:
+ brz,pn %o2, 2f ! 100% complete?
+ fmovd %f48, %f4
+ cmp %o2, 8 ! Exactly 8 bytes?
+ bz,a,pn CCCR, 2f
+ std %f4, [%o1]
+
+ btst 4, %o2 ! Word store?
+ bz CCCR, 1f
+ nop
+ st %f4, [%o1]
+ inc 4, %o1
+1:
+ btst 2, %o2
+ fzero %f0
+ bz 1f
+
+ mov -6, %o4
+ alignaddr %o1, %o4, %g0
+
+ faligndata %f0, %f4, %f8
+
+ stda %f8, [%o1] ASI_FL16_P ! Store short
+ inc 2, %o1
+1:
+ btst 1, %o2 ! Byte aligned?
+ bz 2f
+
+ mov -7, %o0 ! Calculate dest - 7
+ alignaddr %o1, %o0, %g0 ! Calculate shift mask and dest.
+
+ faligndata %f0, %f4, %f8 ! Move 1st byte to low part of f8
+
+ stda %f8, [%o1] ASI_FL8_P ! Store 1st byte
+ inc 1, %o1 ! Update address
+2:
+ membar #Sync
+#if 0
+ !!
+ !! verify copy success.
+ !!
+ !! (disabled) byte-compares src and dest; on a mismatch it
+ !! writes a non-zero value to block_disable, prints a
+ !! diagnostic through prom_printf and traps with ta 1.
+ !!
+
+ mov %i0, %o2
+ mov %i1, %o4
+ mov %i2, %l4
+0:
+ ldub [%o2], %o1
+ inc %o2
+ ldub [%o4], %o3
+ inc %o4
+ cmp %o3, %o1
+ bnz 1f
+ dec %l4
+ brnz %l4, 0b
+ nop
+ ba 2f
+ nop
+
+1:
+ set block_disable, %o0
+ stx %o0, [%o0]
+
+ set 0f, %o0
+ call prom_printf
+ sub %i2, %l4, %o5
+ set 1f, %o0
+ mov %i0, %o2
+ mov %i1, %o1
+ call prom_printf
+ mov %i2, %o3
+ ta 1
+ .data
+ _ALIGN
+0: .asciz "block memcpy failed: %x@%p != %x@%p byte %d\r\n"
+1: .asciz "memcpy(%p, %p, %lx)\r\n"
+ _ALIGN
+ .text
+2:
+#endif
+#if defined(_KERNEL) && !defined(_RUMPKERNEL)
+
+/*
+ * We have saved our possible fpstate, now disable the fpu
+ * and continue with life.
+ */
+ RESTORE_FPU
+ ret
+ restore %g1, 0, %o0 ! Return DEST for memcpy
+#endif
+ retl
+ mov %g1, %o0 ! (delay slot) return dest when no window was saved
+/*
+ * Use block_disable to turn off block insns for
+ * memcpy/memset
+ *
+ * It is initialised to 1 below, and Lmemcpy_block falls back to
+ * Lmemcpy_fancy whenever it is non-zero.
+ */
+ .data
+ .align 8
+ .globl block_disable
+block_disable: .xword 1
+ .text
+#endif /* USE_BLOCK_STORE_LOAD */
Index: src/common/lib/libc/arch/sparc64/string/memset.S
diff -u /dev/null src/common/lib/libc/arch/sparc64/string/memset.S:1.1
--- /dev/null Sat Mar 16 20:42:32 2013
+++ src/common/lib/libc/arch/sparc64/string/memset.S Sat Mar 16 20:42:32 2013
@@ -0,0 +1,214 @@
+/* $NetBSD: memset.S,v 1.1 2013/03/17 00:42:32 christos Exp $ */
+
+/*
+ * Copyright (c) 1996-2002 Eduardo Horvath
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include "strmacros.h"
+
+/*
+ * XXXXXXXXXXXXXXXXXXXX
+ * We need to make sure that this doesn't use floating point
+ * before our trap handlers are installed or we could panic
+ * XXXXXXXXXXXXXXXXXXXX
+ */
+/*
+ * memset(addr, c, len)
+ *
+ * We want to use VIS instructions if we're clearing out 256 bytes
+ * or more, but to do that we need to properly save and restore the
+ * FP registers. Unfortunately the code to do that in the kernel needs
+ * to keep track of the current owner of the FPU, hence the different
+ * code.
+ *
+ * XXXXX To produce more efficient code, we do not allow lengths
+ * of 0x8000000000000000 or greater, which are negative numbers.
+ * This should not really be an issue since the VA hole should
+ * cause any such ranges to fail anyway.
+ */
+#if !defined(_KERNEL) || defined(_RUMPKERNEL)
+ENTRY(bzero)
+ ! %o0 = addr, %o1 = len; rearranged into memset(addr, 0, len), then falls into memset
+ mov %o1, %o2 ! len becomes the third argument
+ mov 0, %o1 ! pattern = 0
+#endif
+ENTRY(memset)
+ ! %o0 = addr, %o1 = pattern, %o2 = len; returns the original addr in %o0
+ mov %o0, %o4 ! Save original pointer
+
+Lmemset_internal:
+ btst 7, %o0 ! 8-byte aligned?
+ bz,pn %xcc, 0f ! yes: go build the 64-bit pattern
+ nop
+ inc %o0 ! advance first; the byte is stored at [%o0 - 1]
+ deccc %o2 ! Store up to 7 bytes
+ bge,a,pt CCCR, Lmemset_internal ! annulled: the stb below runs only when len is still >= 0
+ stb %o1, [%o0 - 1]
+
+ retl ! Duplicate Lmemset_done (len ran out before alignment was reached)
+ mov %o4, %o0 ! return the original pointer
+0:
+ /*
+ * Duplicate the pattern so it fills 64-bits.
+ */
+ andcc %o1, 0x0ff, %o1 ! No need to extend zero
+ bz,pt %icc, 1f ! pattern byte is 0: skip the replication
+ sllx %o1, 8, %o3 ! sigh. all dependent insns. (delay slot; %o3 is scratch)
+ or %o1, %o3, %o1 ! pattern now 16 bits wide
+ sllx %o1, 16, %o3
+ or %o1, %o3, %o1 ! 32 bits
+ sllx %o1, 32, %o3
+ or %o1, %o3, %o1 ! 64 bits
+1:
+#ifdef USE_BLOCK_STORE_LOAD
+ !! Now we are 64-bit aligned
+ cmp %o2, 256 ! Use block clear if len >= 256
+ bge,pt CCCR, Lmemset_block ! use block store insns
+#endif /* USE_BLOCK_STORE_LOAD */
+ deccc 8, %o2 ! bias len by -8; this is also the delay slot of the bge above, so Lmemset_block sees len - 8
+Lmemset_longs:
+ bl,pn CCCR, Lmemset_cleanup ! Less than 8 bytes left
+ nop
+3:
+ inc 8, %o0
+ deccc 8, %o2
+ bge,pt CCCR, 3b
+ stx %o1, [%o0 - 8] ! Do 1 longword at a time
+
+ /*
+ * Len is in [-8..-1] where -8 => done, -7 => 1 byte to set,
+ * -6 => two bytes, etc. Mop up this remainder, if any.
+ */
+Lmemset_cleanup:
+ btst 4, %o2
+ bz,pt CCCR, 5f ! if (len & 4) {
+ nop
+ stw %o1, [%o0] ! *(int *)addr = pattern;
+ inc 4, %o0 ! addr += 4;
+5:
+ btst 2, %o2
+ bz,pt CCCR, 7f ! if (len & 2) {
+ nop
+ sth %o1, [%o0] ! *(short *)addr = pattern;
+ inc 2, %o0 ! addr += 2;
+7:
+ btst 1, %o2
+ bnz,a %icc, Lmemset_done ! if (len & 1)
+ stb %o1, [%o0] ! *addr = pattern; (annulled when the bit is clear)
+Lmemset_done:
+ retl
+ mov %o4, %o0 ! Restore pointer for memset (ugh)
+
+#ifdef USE_BLOCK_STORE_LOAD
+Lmemset_block:
+ sethi %hi(block_disable), %o3
+ ldx [ %o3 + %lo(block_disable) ], %o3
+ brnz,pn %o3, Lmemset_longs ! block_disable != 0: use the 8-byte store loop
+ !! Make sure our trap table is installed
+ set _C_LABEL(trapbase), %o5 ! begins in the delay slot of the brnz above; %o5 is not read on that path
+ rdpr %tba, %o3
+ sub %o3, %o5, %o3 ! zero iff %tba == trapbase
+ brnz,pn %o3, Lmemset_longs ! No, then don't use block load/store
+ nop
+/*
+ * Kernel:
+ *
+ * Here we use VIS instructions to do a block clear of a page.
+ * But before we can do that we need to save and enable the FPU.
+ * The last owner of the FPU registers is fplwp, and
+ * fplwp->l_md.md_fpstate is the current fpstate. If that's not
+ * null, call savefpstate() with it to store our current fp state.
+ *
+ * Next, allocate an aligned fpstate on the stack. We will properly
+ * nest calls on a particular stack so this should not be a problem.
+ *
+ * Now we grab either curlwp (or if we're on the interrupt stack
+ * lwp0). We stash its existing fpstate in a local register and
+ * put our new fpstate in curlwp->l_md.md_fpstate. We point
+ * fplwp at curlwp (or lwp0) and enable the FPU.
+ *
+ * If we are ever preempted, our FPU state will be saved in our
+ * fpstate. Then, when we're resumed and we take an FPDISABLED
+ * trap, the trap handler will be able to fish our FPU state out
+ * of curlwp (or lwp0).
+ *
+ * On exiting this routine we undo the damage: restore the original
+ * pointer to curlwp->l_md.md_fpstate, restore the previous fplwp, and
+ * disable the FPU.
+ *
+ */
+
+ ENABLE_FPU(0)
+
+ !! We are now 8-byte aligned. We need to become 64-byte aligned. (ENABLE_FPU did a save, so the arguments are now in %i0-%i2.)
+ btst 63, %i0
+ bz,pt CCCR, 2f
+ nop
+1:
+ stx %i1, [%i0]
+ inc 8, %i0
+ btst 63, %i0
+ bnz,pt %xcc, 1b
+ dec 8, %i2 ! delay slot: runs once per stx, including the last one
+
+2:
+ brz %i1, 3f ! Skip the memory op
+ fzero %f0 ! if pattern is 0
+
+#ifdef _LP64
+ stx %i1, [%i0] ! Flush this puppy to RAM
+ membar #StoreLoad
+ ldd [%i0], %f0 ! read the 64-bit pattern back into %f0
+#else
+ stw %i1, [%i0] ! Flush this puppy to RAM
+ membar #StoreLoad
+ ld [%i0], %f0 ! read the 32-bit pattern back into %f0
+ fmovsa %icc, %f0, %f1 ! unconditional copy: %f1 = %f0, completing the 64-bit pair
+#endif
+
+3:
+ fmovd %f0, %f2 ! Duplicate the pattern
+ fmovd %f0, %f4
+ fmovd %f0, %f6
+ fmovd %f0, %f8
+ fmovd %f0, %f10
+ fmovd %f0, %f12
+ fmovd %f0, %f14
+
+ !! Remember: we were 8 bytes too far (%i2 still carries the -8 bias applied before the branch to Lmemset_block)
+ dec 56, %i2 ! Go one iteration too far (total bias is now -64)
+5:
+ stda %f0, [%i0] ASI_STORE ! Store 64 bytes
+ deccc BLOCK_SIZE, %i2
+ bg,pt %icc, 5b ! loop while %i2 > 0; NOTE(review): tests %icc where the other loops test CCCR/%xcc, confirm intended
+ inc BLOCK_SIZE, %i0 ! delay slot: advance to the next block
+
+ membar #Sync
+/*
+ * We've saved our possible fpstate, now disable the fpu
+ * and continue with life.
+ */
+ RESTORE_FPU
+ addcc %i2, 56, %i2 ! Restore the count (back to the -8 bias; sets the cc tested at Lmemset_longs)
+ ba,pt %xcc, Lmemset_longs ! Finish up the remainder
+ restore ! delay slot: pop the register window, %i0-%i2 are %o0-%o2 again
+#endif /* USE_BLOCK_STORE_LOAD */
Index: src/common/lib/libc/arch/sparc64/string/strmacros.h
diff -u /dev/null src/common/lib/libc/arch/sparc64/string/strmacros.h:1.1
--- /dev/null Sat Mar 16 20:42:32 2013
+++ src/common/lib/libc/arch/sparc64/string/strmacros.h Sat Mar 16 20:42:32 2013
@@ -0,0 +1,119 @@
+/* $NetBSD: strmacros.h,v 1.1 2013/03/17 00:42:32 christos Exp $ */
+
+/*
+ * Copyright (c) 1996-2002 Eduardo Horvath
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <machine/asm.h>
+#if defined(_KERNEL) && !defined(_RUMPKERNEL)
+#define USE_BLOCK_STORE_LOAD /* enable block load/store ops */
+#include "assym.h"
+#include <machine/param.h>
+#include <machine/ctlreg.h>
+#include <machine/psl.h>
+#include <machine/frame.h>
+#include <machine/intr.h>
+#include <machine/locore.h>
+
+#ifdef USE_BLOCK_STORE_LOAD
+
+#define BLOCK_SIZE SPARC64_BLOCK_SIZE /* byte stride of one block store in the memset block loop */
+#define BLOCK_ALIGN SPARC64_BLOCK_ALIGN /* low-bit mask cleared (andn) to align the on-stack fpstate */
+
+/*
+ * The following routines allow fpu use in the kernel.
+ *
+ * ENABLE_FPU allocates a stack frame (save) and uses the local regs; RESTORE_FPU
+ * reads them back: %l0 new fpstate, %l1 %hi(FPLWP), %l2 old fplwp, %l3 its fpstate (or 0),
+ * %l5 lwp in use, %l6 that lwp's old fpstate. Extra local storage can be requested by setting the siz parameter, and can be accessed at %sp+CC64FSZ.
+ */
+
+#define ENABLE_FPU(siz) \
+ save %sp, -(CC64FSZ), %sp; /* Allocate a stack frame */ \
+ sethi %hi(FPLWP), %l1; /* %l1 stays live for RESTORE_FPU */ \
+ add %fp, STKB-FS_SIZE, %l0; /* Allocate a fpstate */\
+ LDPTR [%l1 + %lo(FPLWP)], %l2; /* Load fplwp */ \
+ andn %l0, BLOCK_ALIGN, %l0; /* Align it */ \
+ clr %l3; /* NULL fpstate */ \
+ brz,pt %l2, 1f; /* fplwp == NULL? */ \
+ add %l0, -STKB-CC64FSZ-(siz), %sp; /* (delay slot) Set proper %sp */ \
+ LDPTR [%l2 + L_FPSTATE], %l3; /* %l3 = fpstate of the old fplwp */ \
+ brz,pn %l3, 1f; /* Make sure we have an fpstate */ \
+ mov %l3, %o0; /* (delay slot) argument for savefpstate */ \
+ call _C_LABEL(savefpstate); /* Save the old fpstate; the insn at 1: below is its delay slot */ \
+1: \
+ set EINTSTACK-STKB, %l4; /* Are we on intr stack? */ \
+ cmp %sp, %l4; \
+ bgu,pt %xcc, 1f; /* %sp > EINTSTACK-STKB: not on it */ \
+ set INTSTACK-STKB, %l4; \
+ cmp %sp, %l4; \
+ blu %xcc, 1f; /* %sp < INTSTACK-STKB: not on it */ \
+0: \
+ sethi %hi(_C_LABEL(lwp0)), %l4; /* Yes, use lwp0 */ \
+ ba,pt %xcc, 2f; /* XXXX needs to change to CPUs idle proc */ \
+ or %l4, %lo(_C_LABEL(lwp0)), %l5; /* (delay slot) %l5 = &lwp0 */ \
+1: \
+ sethi %hi(CURLWP), %l4; /* Use curlwp */ \
+ LDPTR [%l4 + %lo(CURLWP)], %l5; \
+ brz,pn %l5, 0b; nop; /* If curlwp is NULL need to use lwp0 */\
+2: \
+ LDPTR [%l5 + L_FPSTATE], %l6; /* Save old fpstate */ \
+ STPTR %l0, [%l5 + L_FPSTATE]; /* Insert new fpstate */\
+ STPTR %l5, [%l1 + %lo(FPLWP)]; /* Set new fplwp */ \
+ wr %g0, FPRS_FEF, %fprs /* Enable FPU */
+
+/*
+ * We've saved our possible fpstate, now disable the fpu
+ * and continue with life. Relies on %l0-%l6 as left by ENABLE_FPU.
+ */
+#ifdef DEBUG
+#define __CHECK_FPU \
+ LDPTR [%l5 + L_FPSTATE], %l7; /* fpstate currently installed in the lwp */ \
+ cmp %l7, %l0; /* still the one ENABLE_FPU installed? */ \
+ tnz 1; /* no: take software trap 1 */
+#else /* !DEBUG */
+#define __CHECK_FPU
+#endif
+
+#define RESTORE_FPU \
+ __CHECK_FPU \
+ STPTR %l2, [%l1 + %lo(FPLWP)]; /* Restore old fplwp */ \
+ wr %g0, 0, %fprs; /* Disable fpu */ \
+ brz,pt %l3, 1f; /* Skip if no fpstate */ \
+ STPTR %l6, [%l5 + L_FPSTATE]; /* (delay slot, always executed) Restore old fpstate */\
+ \
+ mov %l3, %o0; /* argument for loadfpstate */ \
+ call _C_LABEL(loadfpstate); /* Reload orig fpstate */\
+1: \
+ membar #Sync; /* Finish all FP ops (also the delay slot of the call above) */
+
+#endif /* USE_BLOCK_STORE_LOAD */
+
+#ifdef USE_BLOCK_STORE_LOAD
+#if 0 /* disabled alternative: ASI_BLK_COMMIT_P */
+#define ASI_STORE ASI_BLK_COMMIT_P
+#else /* ASI used by the stda block stores in memset */
+#define ASI_STORE ASI_BLK_P
+#endif
+#endif /* USE_BLOCK_STORE_LOAD */
+#endif /* _KERNEL && !_RUMPKERNEL */