Niels, here is a new version of the patch. It uses the second version of the memcpy routine and adds a preprocessor conditional that disables the ARM assembly on big-endian builds.
Let me know if you have any trouble with it. Regards, Vince On Tue, 2009-03-24 at 16:36 +0100, Niels Roest wrote: > Hi John, > thanks for the comments, > just want to mention 1 or 2 things too. > > The testing routines do have a single cold, unmeasured, run first to > rule out previous cache state influence. > > The test itself is in fact really simple - a continuous copy of a large > region. So no repeats. This does focus on the use case that is most > obvious for DirectFB, namely copying chunks and lines of graphics > between surfaces, which will normally lead to cache misses anyway. I am > most concerned about alignment, since this is really unpredictable. > > I am not sure if we will benefit much from shuffling the code or using > different memory regions; you have to remember that the testing routines > produce a single score only, so these will need to be fine tuned a lot, > and we may even need to revert to multiple memcpy routines which are > optimised for multiple use cases. This might be an interesting approach, > it is one I will follow if performance measurements show that we can > expect a proper benefit from this - forgetting that DirectFB is mainly > about hardware acceleration anyway. > > For me I am very happy with the changes that Vince made, thanks Vince, > and if I have a BE/LE lock, I will include the patch. > > Greets > Niels > > John Williams wrote: > > Hi Vince, > > > > > > On Wed, Mar 25, 2009 at 12:57 AM, vince <vi...@bluush.com> wrote: > > > > > >> Ive change my benchmark to invalidate the cache before every test. My > >> result are the same. Attached is my test program. > >> > > > > No worries - just wanted to make sure we weren't missing the obvious! > > > > Might also be worth shuffling the sequencing of the tests (armasm, > > armasm2, libc), see if that has any impact. I'm not intimate with ARM > > cache details, but with a write-back cache you could be stalling on > > cacheline evictions later in the test. 
> > > > Another safety would be to perform the tests in different memory > > regions, with a complete cache flush and invalidate between each run. > > > > Not saying there's anything wrong with your code, just know its easy > > to get false results from simple benchmark code. Memory tests are > > another one where the obvious approach is often wrong. > > > > Cheers, > > > > John > > _______________________________________________ > > directfb-dev mailing list > > directfb-dev@directfb.org > > http://mail.directfb.org/cgi-bin/mailman/listinfo/directfb-dev > > > > > >
diff -Naur DirectFB-1.3.0-org/configure.in DirectFB-1.3.0/configure.in
--- DirectFB-1.3.0-org/configure.in	2009-03-25 07:50:21.000000000 +0000
+++ DirectFB-1.3.0/configure.in	2009-03-25 07:51:31.000000000 +0000
@@ -198,6 +198,7 @@
 
   *arm*)
     have_arm=yes
+    AC_DEFINE(ARCH_ARM,1,[Define to 1 if you are compiling for ARM.])
     ;;
 
   ppc-*-linux* | powerpc-*)
@@ -221,6 +222,7 @@
 need_libc_r=no
 need_libdl=yes
 want_ppcasm=yes
+want_armasm=yes
 
 case "$target_or_host" in
   *-linux*)
@@ -236,6 +238,7 @@
     need_libc_r=yes
     need_libdl=no
     want_ppcasm=yes
+    want_armasm=yes
     CPPFLAGS="$CPPFLAGS -I/usr/local/include"
     LDFLAGS="$LDFLAGS -L/usr/local/lib"
     ;;
@@ -244,6 +247,7 @@
     need_libc_r=yes
     need_libdl=no
     want_ppcasm=no
+    want_armasm=no
     CPPFLAGS="$CPPFLAGS -I/usr/local/include"
     LDFLAGS="$LDFLAGS -L/usr/local/lib"
     ;;
@@ -252,6 +256,7 @@
     need_libc_r=no
     need_libdl=no
     want_ppcasm=yes
+    want_armasm=yes
     CPPFLAGS="$CPPFLAGS -I/usr/pkg/include"
     LDFLAGS="$LDFLAGS -L/usr/pkg/lib"
     ;;
@@ -260,6 +265,7 @@
     need_libc_r=no
     need_libdl=yes
     want_ppcasm=no
+    want_armasm=no
     CPPFLAGS="$CPPFLAGS -I/sw/include"
     LDFLAGS="$LDFLAGS -L/sw/lib"
     ;;
@@ -281,6 +287,13 @@
   AC_DEFINE(USE_PPCASM,1,[Define to 1 if ppc assembly is available.])
 fi
 
+
+AM_CONDITIONAL(BUILDARMASM, test "$have_arm" = "yes" && test "$want_armasm" = "yes")
+
+if test "$have_arm" = "yes" && test "$want_armasm" = "yes"; then
+  AC_DEFINE(USE_ARMASM,1,[Define to 1 if arm assembly is available.])
+fi
+
 if test "$have_kos" = "yes"; then
   AC_DEFINE(USE_KOS,1,[Define to 1 if compiling on KallistiOS.])
 fi
diff -Naur DirectFB-1.3.0-org/lib/direct/armasm_memcpy.h DirectFB-1.3.0/lib/direct/armasm_memcpy.h
--- DirectFB-1.3.0-org/lib/direct/armasm_memcpy.h	1970-01-01 01:00:00.000000000 +0100
+++ DirectFB-1.3.0/lib/direct/armasm_memcpy.h	2009-03-25 07:52:52.000000000 +0000
@@ -0,0 +1,32 @@
+/*
+ * ARM memcpy asm replacement.
+ *
+ * Copyright (C) 2009 Bluush Dev Team.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __ARMASM_MEMCPY_H__
+#define __ARMASM_MEMCPY_H__
+
+#if USE_ARMASM && !WORDS_BIGENDIAN
+
+void *direct_armasm_memcpy ( void *dest, const void *src, size_t n);
+
+#endif /* USE_ARMASM && !WORDS_BIGENDIAN */
+
+#endif /* __ARMASM_MEMCPY_H__ */
+
diff -Naur DirectFB-1.3.0-org/lib/direct/armasm_memcpy.S DirectFB-1.3.0/lib/direct/armasm_memcpy.S
--- DirectFB-1.3.0-org/lib/direct/armasm_memcpy.S	1970-01-01 01:00:00.000000000 +0100
+++ DirectFB-1.3.0/lib/direct/armasm_memcpy.S	2009-03-25 07:52:52.000000000 +0000
@@ -0,0 +1,421 @@
+/*
+ * ARM memcpy asm replacement.
+ *
+ * Copyright (C) 2009 Bluush Dev Team.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <config.h>
+
+#if USE_ARMASM && !WORDS_BIGENDIAN
+
+#define _LABEL(f)   f :
+
+.global direct_armasm_memcpy
+
+        .code 32
+
+_LABEL(direct_armasm_memcpy)    /* r0 = dest, r1 = src, r2 = n; result (dest) is returned in r0 */
+        cmp     r1, r0
+        bcc     Lmemcpy_backwards       /* src < dest: copy downwards from the end */
+
+        /* src == dest: nothing to copy; r0 still holds dest, which is what memcpy() returns */
+        moveq   pc, lr
+
+        stmdb   sp!, {r0, lr}           /* keep dest and the return address for the exit paths */
+        subs    r2, r2, #4
+        blt     Lmemcpy_fl4             /* fewer than 4 bytes: byte tail only */
+        ands    r12, r0, #3
+        bne     Lmemcpy_fdestul         /* dest is not word aligned */
+        ands    r12, r1, #3
+        bne     Lmemcpy_fsrcul          /* src is not word aligned */
+
+_LABEL(Lmemcpy_ft8)
+        subs    r2, r2, #8
+        blt     Lmemcpy_fl12
+        subs    r2, r2, #0x14
+        blt     Lmemcpy_fl32
+        stmdb   sp!, {r4}
+
+_LABEL(Lmemcpy_floop32)
+        ldmia   r1!, {r3, r4, r12, lr}
+        stmia   r0!, {r3, r4, r12, lr}
+        ldmia   r1!, {r3, r4, r12, lr}
+        stmia   r0!, {r3, r4, r12, lr}
+        subs    r2, r2, #0x20
+        bge     Lmemcpy_floop32
+
+        cmn     r2, #0x10
+        ldmgeia r1!, {r3, r4, r12, lr}
+        stmgeia r0!, {r3, r4, r12, lr}
+        subge   r2, r2, #0x10
+        ldmia   sp!, {r4}
+
+_LABEL(Lmemcpy_fl32)
+        adds    r2, r2, #0x14
+
+
+_LABEL(Lmemcpy_floop12)
+        ldmgeia r1!, {r3, r12, lr}
+        stmgeia r0!, {r3, r12, lr}
+        subges  r2, r2, #0x0c
+        bge     Lmemcpy_floop12
+
+_LABEL(Lmemcpy_fl12)
+        adds    r2, r2, #8
+        blt     Lmemcpy_fl4
+
+        subs    r2, r2, #4
+        ldrlt   r3, [r1], #4
+        strlt   r3, [r0], #4
+        ldmgeia r1!, {r3, r12}
+        stmgeia r0!, {r3, r12}
+        subge   r2, r2, #4
+
+_LABEL(Lmemcpy_fl4)
+        adds    r2, r2, #4
+        ldmeqia sp!, {r0, pc}           /* nothing left: restore dest into r0 and return */
+
+        cmp     r2, #2
+        ldrb    r3, [r1], #1
+        strb    r3, [r0], #1
+        ldrgeb  r3, [r1], #1
+        strgeb  r3, [r0], #1
+        ldrgtb  r3, [r1], #1
+        strgtb  r3, [r0], #1
+        ldmia   sp!, {r0, pc}
+
+
+_LABEL(Lmemcpy_fdestul)         /* copy 4 - (dest & 3) bytes so that dest becomes word aligned */
+        rsb     r12, r12, #4
+        cmp     r12, #2
+
+        ldrb    r3, [r1], #1
+        strb    r3, [r0], #1
+        ldrgeb  r3, [r1], #1
+        strgeb  r3, [r0], #1
+        ldrgtb  r3, [r1], #1
+        strgtb  r3, [r0], #1
+        subs    r2, r2, r12
+        blt     Lmemcpy_fl4
+
+        ands    r12, r1, #3
+        beq     Lmemcpy_ft8
+
+
+_LABEL(Lmemcpy_fsrcul)          /* dest aligned, src not: r12 = src & 3 */
+        bic     r1, r1, #3              /* read aligned words; the shifts below assume little-endian */
+        ldr     lr, [r1], #4
+        cmp     r12, #2
+        bgt     Lmemcpy_fsrcul3
+        beq     Lmemcpy_fsrcul2
+        cmp     r2, #0x0c
+        blt     Lmemcpy_fsrcul1loop4
+        sub     r2, r2, #0x0c
+        stmdb   sp!, {r4, r5}
+
+_LABEL(Lmemcpy_fsrcul1loop16)
+        mov     r3, lr, lsr #8
+        ldmia   r1!, {r4, r5, r12, lr}
+        orr     r3, r3, r4, lsl #24
+        mov     r4, r4, lsr #8
+        orr     r4, r4, r5, lsl #24
+        mov     r5, r5, lsr #8
+        orr     r5, r5, r12, lsl #24
+        mov     r12, r12, lsr #8
+        orr     r12, r12, lr, lsl #24
+        stmia   r0!, {r3-r5, r12}
+        subs    r2, r2, #0x10
+        bge     Lmemcpy_fsrcul1loop16
+        ldmia   sp!, {r4, r5}
+        adds    r2, r2, #0x0c
+        blt     Lmemcpy_fsrcul1l4
+
+_LABEL(Lmemcpy_fsrcul1loop4)
+        mov     r12, lr, lsr #8
+        ldr     lr, [r1], #4
+        orr     r12, r12, lr, lsl #24
+        str     r12, [r0], #4
+        subs    r2, r2, #4
+        bge     Lmemcpy_fsrcul1loop4
+
+_LABEL(Lmemcpy_fsrcul1l4)
+        sub     r1, r1, #3
+        b       Lmemcpy_fl4
+
+_LABEL(Lmemcpy_fsrcul2)
+        cmp     r2, #0x0c
+        blt     Lmemcpy_fsrcul2loop4
+        sub     r2, r2, #0x0c
+        stmdb   sp!, {r4, r5}
+
+_LABEL(Lmemcpy_fsrcul2loop16)
+        mov     r3, lr, lsr #16
+        ldmia   r1!, {r4, r5, r12, lr}
+        orr     r3, r3, r4, lsl #16
+        mov     r4, r4, lsr #16
+        orr     r4, r4, r5, lsl #16
+        mov     r5, r5, lsr #16
+        orr     r5, r5, r12, lsl #16
+        mov     r12, r12, lsr #16
+        orr     r12, r12, lr, lsl #16
+        stmia   r0!, {r3-r5, r12}
+        subs    r2, r2, #0x10
+        bge     Lmemcpy_fsrcul2loop16
+        ldmia   sp!, {r4, r5}
+        adds    r2, r2, #0x0c
+        blt     Lmemcpy_fsrcul2l4
+
+_LABEL(Lmemcpy_fsrcul2loop4)
+        mov     r12, lr, lsr #16
+        ldr     lr, [r1], #4
+        orr     r12, r12, lr, lsl #16
+        str     r12, [r0], #4
+        subs    r2, r2, #4
+        bge     Lmemcpy_fsrcul2loop4
+
+_LABEL(Lmemcpy_fsrcul2l4)
+        sub     r1, r1, #2
+        b       Lmemcpy_fl4
+
+_LABEL(Lmemcpy_fsrcul3)
+        cmp     r2, #0x0c
+        blt     Lmemcpy_fsrcul3loop4
+        sub     r2, r2, #0x0c
+        stmdb   sp!, {r4, r5}
+
+_LABEL(Lmemcpy_fsrcul3loop16)
+        mov     r3, lr, lsr #24
+        ldmia   r1!, {r4, r5, r12, lr}
+        orr     r3, r3, r4, lsl #8
+        mov     r4, r4, lsr #24
+        orr     r4, r4, r5, lsl #8
+        mov     r5, r5, lsr #24
+        orr     r5, r5, r12, lsl #8
+        mov     r12, r12, lsr #24
+        orr     r12, r12, lr, lsl #8
+        stmia   r0!, {r3-r5, r12}
+        subs    r2, r2, #0x10
+        bge     Lmemcpy_fsrcul3loop16
+        ldmia   sp!, {r4, r5}
+        adds    r2, r2, #0x0c
+        blt     Lmemcpy_fsrcul3l4
+
+_LABEL(Lmemcpy_fsrcul3loop4)
+        mov     r12, lr, lsr #24
+        ldr     lr, [r1], #4
+        orr     r12, r12, lr, lsl #8
+        str     r12, [r0], #4
+        subs    r2, r2, #4
+        bge     Lmemcpy_fsrcul3loop4
+
+_LABEL(Lmemcpy_fsrcul3l4)
+        sub     r1, r1, #1
+        b       Lmemcpy_fl4
+
+_LABEL(Lmemcpy_backwards)       /* src < dest: point both registers past the end and copy downwards */
+        add     r1, r1, r2
+        add     r0, r0, r2
+        subs    r2, r2, #4
+        blt     Lmemcpy_bl4
+        ands    r12, r0, #3
+        bne     Lmemcpy_bdestul
+        ands    r12, r1, #3
+        bne     Lmemcpy_bsrcul
+
+_LABEL(Lmemcpy_bt8)
+        subs    r2, r2, #8
+        blt     Lmemcpy_bl12
+        stmdb   sp!, {r4, lr}
+        subs    r2, r2, #0x14
+        blt     Lmemcpy_bl32
+
+
+_LABEL(Lmemcpy_bloop32)
+        ldmdb   r1!, {r3, r4, r12, lr}
+        stmdb   r0!, {r3, r4, r12, lr}
+        ldmdb   r1!, {r3, r4, r12, lr}
+        stmdb   r0!, {r3, r4, r12, lr}
+        subs    r2, r2, #0x20
+        bge     Lmemcpy_bloop32
+
+_LABEL(Lmemcpy_bl32)
+        cmn     r2, #0x10
+        ldmgedb r1!, {r3, r4, r12, lr}
+        stmgedb r0!, {r3, r4, r12, lr}
+        subge   r2, r2, #0x10
+        adds    r2, r2, #0x14
+        ldmgedb r1!, {r3, r12, lr}
+        stmgedb r0!, {r3, r12, lr}
+        subge   r2, r2, #0x0c
+        ldmia   sp!, {r4, lr}
+
+_LABEL(Lmemcpy_bl12)
+        adds    r2, r2, #8
+        blt     Lmemcpy_bl4
+        subs    r2, r2, #4
+        ldrlt   r3, [r1, #-4]!
+        strlt   r3, [r0, #-4]!
+        ldmgedb r1!, {r3, r12}
+        stmgedb r0!, {r3, r12}
+        subge   r2, r2, #4
+
+_LABEL(Lmemcpy_bl4)
+        adds    r2, r2, #4
+        moveq   pc, lr                  /* nothing left: r0 has been stepped back down to dest */
+
+        cmp     r2, #2
+        ldrb    r3, [r1, #-1]!
+        strb    r3, [r0, #-1]!
+        ldrgeb  r3, [r1, #-1]!
+        strgeb  r3, [r0, #-1]!
+        ldrgtb  r3, [r1, #-1]!
+        strgtb  r3, [r0, #-1]!
+        mov     pc, lr
+
+
+_LABEL(Lmemcpy_bdestul)         /* copy (dest end & 3) bytes so that dest becomes word aligned */
+        cmp     r12, #2
+
+        ldrb    r3, [r1, #-1]!
+        strb    r3, [r0, #-1]!
+        ldrgeb  r3, [r1, #-1]!
+        strgeb  r3, [r0, #-1]!
+        ldrgtb  r3, [r1, #-1]!
+        strgtb  r3, [r0, #-1]!
+        subs    r2, r2, r12
+        blt     Lmemcpy_bl4
+        ands    r12, r1, #3
+        beq     Lmemcpy_bt8
+
+
+_LABEL(Lmemcpy_bsrcul)          /* dest aligned, src not: r12 = src & 3 */
+        bic     r1, r1, #3
+        ldr     r3, [r1, #0]
+        cmp     r12, #2
+        blt     Lmemcpy_bsrcul1
+        beq     Lmemcpy_bsrcul2
+        cmp     r2, #0x0c
+        blt     Lmemcpy_bsrcul3loop4
+        sub     r2, r2, #0x0c
+        stmdb   sp!, {r4, r5, lr}
+
+_LABEL(Lmemcpy_bsrcul3loop16)
+        mov     lr, r3, lsl #8
+        ldmdb   r1!, {r3-r5, r12}
+        orr     lr, lr, r12, lsr #24
+        mov     r12, r12, lsl #8
+        orr     r12, r12, r5, lsr #24
+        mov     r5, r5, lsl #8
+        orr     r5, r5, r4, lsr #24
+        mov     r4, r4, lsl #8
+        orr     r4, r4, r3, lsr #24
+        stmdb   r0!, {r4, r5, r12, lr}
+        subs    r2, r2, #0x10
+        bge     Lmemcpy_bsrcul3loop16
+        ldmia   sp!, {r4, r5, lr}
+        adds    r2, r2, #0x0c
+        blt     Lmemcpy_bsrcul3l4
+
+_LABEL(Lmemcpy_bsrcul3loop4)
+        mov     r12, r3, lsl #8
+        ldr     r3, [r1, #-4]!
+        orr     r12, r12, r3, lsr #24
+        str     r12, [r0, #-4]!
+        subs    r2, r2, #4
+        bge     Lmemcpy_bsrcul3loop4
+
+_LABEL(Lmemcpy_bsrcul3l4)
+        add     r1, r1, #3
+        b       Lmemcpy_bl4
+
+_LABEL(Lmemcpy_bsrcul2)
+        cmp     r2, #0x0c
+        blt     Lmemcpy_bsrcul2loop4
+        sub     r2, r2, #0x0c
+        stmdb   sp!, {r4, r5, lr}
+
+_LABEL(Lmemcpy_bsrcul2loop16)
+        mov     lr, r3, lsl #16
+        ldmdb   r1!, {r3-r5, r12}
+        orr     lr, lr, r12, lsr #16
+        mov     r12, r12, lsl #16
+        orr     r12, r12, r5, lsr #16
+        mov     r5, r5, lsl #16
+        orr     r5, r5, r4, lsr #16
+        mov     r4, r4, lsl #16
+        orr     r4, r4, r3, lsr #16
+        stmdb   r0!, {r4, r5, r12, lr}
+        subs    r2, r2, #0x10
+        bge     Lmemcpy_bsrcul2loop16
+        ldmia   sp!, {r4, r5, lr}
+        adds    r2, r2, #0x0c
+        blt     Lmemcpy_bsrcul2l4
+
+_LABEL(Lmemcpy_bsrcul2loop4)
+        mov     r12, r3, lsl #16
+        ldr     r3, [r1, #-4]!
+        orr     r12, r12, r3, lsr #16
+        str     r12, [r0, #-4]!
+        subs    r2, r2, #4
+        bge     Lmemcpy_bsrcul2loop4
+
+_LABEL(Lmemcpy_bsrcul2l4)
+        add     r1, r1, #2
+        b       Lmemcpy_bl4
+
+_LABEL(Lmemcpy_bsrcul1)
+        cmp     r2, #0x0c
+        blt     Lmemcpy_bsrcul1loop4
+        sub     r2, r2, #0x0c
+        stmdb   sp!, {r4, r5, lr}
+
+_LABEL(Lmemcpy_bsrcul1loop32)
+        mov     lr, r3, lsl #24
+        ldmdb   r1!, {r3-r5, r12}
+        orr     lr, lr, r12, lsr #8
+        mov     r12, r12, lsl #24
+        orr     r12, r12, r5, lsr #8
+        mov     r5, r5, lsl #24
+        orr     r5, r5, r4, lsr #8
+        mov     r4, r4, lsl #24
+        orr     r4, r4, r3, lsr #8
+        stmdb   r0!, {r4, r5, r12, lr}
+        subs    r2, r2, #0x10
+        bge     Lmemcpy_bsrcul1loop32
+        ldmia   sp!, {r4, r5, lr}
+        adds    r2, r2, #0x0c
+        blt     Lmemcpy_bsrcul1l4
+
+_LABEL(Lmemcpy_bsrcul1loop4)
+        mov     r12, r3, lsl #24
+        ldr     r3, [r1, #-4]!
+        orr     r12, r12, r3, lsr #8
+        str     r12, [r0, #-4]!
+        subs    r2, r2, #4
+        bge     Lmemcpy_bsrcul1loop4
+
+_LABEL(Lmemcpy_bsrcul1l4)
+        add     r1, r1, #1
+        b       Lmemcpy_bl4
+
+
+        .ltorg
+
+#endif /* USE_ARMASM && !WORDS_BIGENDIAN */
+
+
diff -Naur DirectFB-1.3.0-org/lib/direct/Makefile.am DirectFB-1.3.0/lib/direct/Makefile.am
--- DirectFB-1.3.0-org/lib/direct/Makefile.am	2009-03-25 07:50:20.000000000 +0000
+++ DirectFB-1.3.0/lib/direct/Makefile.am	2009-03-25 07:51:31.000000000 +0000
@@ -29,6 +29,10 @@
 ppcasm_headers = ppcasm_memcpy.h ppc_asm.h
 endif
 
+if BUILDARMASM
+armasm_sources = armasm_memcpy.S
+armasm_headers = armasm_memcpy.h
+endif
 
 # If the old location isn't cleared, builds of external modules fail
 install-exec-local:
@@ -39,6 +43,7 @@
 
 include_HEADERS = \
 	$(ppcasm_headers) \
+	$(armasm_headers) \
 	build.h \
 	clock.h \
 	conf.h \
@@ -69,6 +74,7 @@
 
 libdirect_la_SOURCES = \
 	$(ppcasm_sources) \
+	$(armasm_sources) \
 	clock.c \
 	conf.c \
 	debug.c \
diff -Naur DirectFB-1.3.0-org/lib/direct/memcpy.c DirectFB-1.3.0/lib/direct/memcpy.c
--- DirectFB-1.3.0-org/lib/direct/memcpy.c	2009-03-25 07:50:20.000000000 +0000
+++ DirectFB-1.3.0/lib/direct/memcpy.c	2009-03-25 07:53:07.000000000 +0000
@@ -44,7 +44,7 @@
 #include <direct/memcpy.h>
 #include <direct/messages.h>
 
-#if defined (ARCH_PPC) || (SIZEOF_LONG == 8)
+#if defined (ARCH_PPC) || defined (ARCH_ARM) || (SIZEOF_LONG == 8)
 # define RUN_BENCHMARK 1
 #else
 # define RUN_BENCHMARK 0
@@ -58,6 +58,10 @@
 #include "ppcasm_memcpy.h"
 #endif
 
+#if USE_ARMASM && !WORDS_BIGENDIAN
+#include "armasm_memcpy.h"
+#endif
+
 
 #if SIZEOF_LONG == 8
 
@@ -152,6 +156,9 @@
      { "ppccache", "ppcasm_cacheable_memcpy()", direct_ppcasm_cacheable_memcpy, 0, 0},
 #endif /* __LINUX__ */
 #endif /* USE_PPCASM */
+#if USE_ARMASM && !WORDS_BIGENDIAN
+     { "arm", "armasm_memcpy()", direct_armasm_memcpy, 0, 0},
+#endif
      { NULL, NULL, NULL, 0, 0}
 };
 
_______________________________________________ directfb-dev mailing list directfb-dev@directfb.org http://mail.directfb.org/cgi-bin/mailman/listinfo/directfb-dev