Hello, I've been working on improving the performance of DirectFB 1.3.0 on the ARM platform. The attached patch replaces the default libc memcpy with a faster implementation. I've tested this patch on an AT91RM9200, but it should work on other ARM targets.
Hope this will be useful to others. Regards, Vince
diff -Naur DirectFB-1.3.0-org/configure.in DirectFB-1.3.0/configure.in --- DirectFB-1.3.0-org/configure.in 2009-03-18 09:11:21.000000000 +0000 +++ DirectFB-1.3.0/configure.in 2009-03-18 09:12:47.000000000 +0000 @@ -198,6 +198,7 @@ *arm*) have_arm=yes + AC_DEFINE(ARCH_ARM,1,[Define to 1 if you are compiling for ARM.]) ;; ppc-*-linux* | powerpc-*) @@ -221,6 +222,7 @@ need_libc_r=no need_libdl=yes want_ppcasm=yes +want_armasm=yes case "$target_or_host" in *-linux*) @@ -236,6 +238,7 @@ need_libc_r=yes need_libdl=no want_ppcasm=yes + want_armasm=yes CPPFLAGS="$CPPFLAGS -I/usr/local/include" LDFLAGS="$LDFLAGS -L/usr/local/lib" ;; @@ -244,6 +247,7 @@ need_libc_r=yes need_libdl=no want_ppcasm=no + want_armasm=no CPPFLAGS="$CPPFLAGS -I/usr/local/include" LDFLAGS="$LDFLAGS -L/usr/local/lib" ;; @@ -252,6 +256,7 @@ need_libc_r=no need_libdl=no want_ppcasm=yes + want_armasm=yes CPPFLAGS="$CPPFLAGS -I/usr/pkg/include" LDFLAGS="$LDFLAGS -L/usr/pkg/lib" ;; @@ -260,6 +265,7 @@ need_libc_r=no need_libdl=yes want_ppcasm=no + want_armasm=no CPPFLAGS="$CPPFLAGS -I/sw/include" LDFLAGS="$LDFLAGS -L/sw/lib" ;; @@ -281,6 +287,13 @@ AC_DEFINE(USE_PPCASM,1,[Define to 1 if ppc assembly is available.]) fi + +AM_CONDITIONAL(BUILDARMASM, test "$have_arm" = "yes" && test "$want_armasm" = "yes") + +if test "$have_arm" = "yes" && test "$want_armasm" = "yes"; then + AC_DEFINE(USE_ARMASM,1,[Define to 1 if arm assembly is available.]) +fi + if test "$have_kos" = "yes"; then AC_DEFINE(USE_KOS,1,[Define to 1 if compiling on KallistiOS.]) fi diff -Naur DirectFB-1.3.0-org/lib/direct/armasm_memcpy.h DirectFB-1.3.0/lib/direct/armasm_memcpy.h --- DirectFB-1.3.0-org/lib/direct/armasm_memcpy.h 1970-01-01 01:00:00.000000000 +0100 +++ DirectFB-1.3.0/lib/direct/armasm_memcpy.h 2009-03-18 10:22:26.000000000 +0000 @@ -0,0 +1,30 @@ +/* + * ARM memcpy asm replacement. + * + * Copyright (C) 2009 Bluush Dev Team. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#ifndef __ARMASM_MEMCPY_H__ +#define __ARMASM_MEMCPY_H__ + +#include <stddef.h> /* size_t */ + +void *direct_armasm_memcpy ( void *dest, const void *src, size_t n); + +#endif /* __ARMASM_MEMCPY_H__ */ + diff -Naur DirectFB-1.3.0-org/lib/direct/armasm_memcpy.S DirectFB-1.3.0/lib/direct/armasm_memcpy.S --- DirectFB-1.3.0-org/lib/direct/armasm_memcpy.S 1970-01-01 01:00:00.000000000 +0100 +++ DirectFB-1.3.0/lib/direct/armasm_memcpy.S 2009-03-18 10:22:41.000000000 +0000 @@ -0,0 +1,126 @@ +/* + * ARM memcpy asm replacement. + * + * Copyright (C) 2009 Bluush Dev Team. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + + +#define _LABEL(f) f : + + +.global direct_armasm_memcpy + + + + .code 32 + + +/* + * Fast copy n bytes from source p2 to destination p1; returns p1 (r0 is never modified). + * + * void *direct_armasm_memcpy(void *p1, const void *p2, size_t n) + */ + +_LABEL(direct_armasm_memcpy) + teq r2,#0 /* is arg n == 0 ? */ + moveq pc,lr /* if n == 0, return */ + + stmdb sp!,{lr} /* push return address */ + mov r12,r0 /* write through a copy of p1 so r0 still holds the return value */ + cmp r2,#0x8 /* is string long or short? */ + ble ByteSerial /* jump if short string (n <= 8) */ + + sub r3,r0,r1 /* compare pointers p1, p2 */ + tst r3,#3 /* strings aligned same? */ + bne ByteSerial /* jump if strings not aligned */ + +/* + * Both strings are similarly aligned WRT word boundaries. + * At least a portion of the data can be copied an entire + * word at a time, which is faster than copying bytes. + */ +_LABEL(WordSerial) + ands r3,r0,#3 /* check byte alignment */ + beq WordAligned /* jump if p1, p2 word-aligned */ + + rsb r3,r3,#4 /* m = no. of odd initial bytes */ + sub r2,r2,r3 /* n = n - m */ + +/* + * If the two strings do not begin on word boundaries, begin + * by copying the odd bytes that precede the first full word. + */ +_LABEL(PreLoop) + ldrb lr,[r1],#1 /* read byte from string 2 */ + subs r3,r3,#1 /* --m (decrement loop count) */ + strb lr,[r12],#1 /* write byte to string 1 */ + bne PreLoop /* loop if more bytes to move */ + +_LABEL(WordAligned) + movs r3,r2,asr #5 /* any chunks of 8 words? 
*/ + beq OctsDone /* jump if no 8-word chunks */ + + and r2,r2,#0x1f /* subtract chunks from n */ + stmdb sp!,{r4-r10} /* save registers on stack */ + +/* + * The strings are long enough that we can transfer at least + * some portion of the data in 8-word chunks. 
+ */ +_LABEL(OctLoop) + ldmia r1!,{r4-r10,lr} /* load 8 words from string 2 */ + subs r3,r3,#1 /* more 8-word chunks to move? */ + stmia r12!,{r4-r10,lr} /* write 8 words to string 1 */ + bne OctLoop /* loop if more chunks */ + + ldmia sp!,{r4-r10} /* restore registers from stack */ + +_LABEL(OctsDone) + movs r3,r2,asr #2 /* any more whole words to move? */ + beq WordsDone /* jump if no more whole words */ + +/* + * Copy as much of the remaining data as possible one word at + * a time. + */ +_LABEL(WordLoop2) + ldr lr,[r1],#4 /* read next word from string 2 */ + subs r3,r3,#1 /* decrement word count */ + str lr,[r12],#4 /* write next word to string 1 */ + bne WordLoop2 /* loop while more words to move */ + +_LABEL(WordsDone) + ands r2,r2,#3 /* any last bytes to transfer? */ + ldmeqia sp!,{pc} /* return if already done */ + +/* + * The two strings do not end on word boundaries. + * Copy the remaining data one byte at a time. + */ +_LABEL(ByteSerial) + ldrb lr,[r1],#1 /* read byte from string 2 */ + subs r2,r2,#1 /* --n (decrement loop count) */ + strb lr,[r12],#1 /* write byte to string 1 */ + bne ByteSerial /* loop if more bytes to move */ + + ldmia sp!,{pc} /* return to caller */ + + .ltorg + + + diff -Naur DirectFB-1.3.0-org/lib/direct/Makefile.am DirectFB-1.3.0/lib/direct/Makefile.am --- DirectFB-1.3.0-org/lib/direct/Makefile.am 2009-03-18 09:11:21.000000000 +0000 +++ DirectFB-1.3.0/lib/direct/Makefile.am 2009-03-18 09:15:21.000000000 +0000 @@ -29,6 +29,10 @@ ppcasm_headers = ppcasm_memcpy.h ppc_asm.h endif +if BUILDARMASM +armasm_sources = armasm_memcpy.S +armasm_headers = armasm_memcpy.h +endif # If the old location isn't cleared, builds of external modules fail install-exec-local: @@ -39,6 +43,7 @@ include_HEADERS = \ $(ppcasm_headers) \ + $(armasm_headers) \ build.h \ clock.h \ conf.h \ @@ -69,6 +74,7 @@ libdirect_la_SOURCES = \ $(ppcasm_sources) \ + $(armasm_sources) \ clock.c \ conf.c \ debug.c \ diff -Naur DirectFB-1.3.0-org/lib/direct/memcpy.c 
DirectFB-1.3.0/lib/direct/memcpy.c --- DirectFB-1.3.0-org/lib/direct/memcpy.c 2009-03-18 09:11:21.000000000 +0000 +++ DirectFB-1.3.0/lib/direct/memcpy.c 2009-03-18 09:14:45.000000000 +0000 @@ -44,7 +44,7 @@ #include <direct/memcpy.h> #include <direct/messages.h> -#if defined (ARCH_PPC) || (SIZEOF_LONG == 8) +#if defined (ARCH_PPC) || defined (ARCH_ARM) || (SIZEOF_LONG == 8) # define RUN_BENCHMARK 1 #else # define RUN_BENCHMARK 0 @@ -58,6 +58,10 @@ #include "ppcasm_memcpy.h" #endif +#ifdef USE_ARMASM +#include "armasm_memcpy.h" +#endif + #if SIZEOF_LONG == 8 @@ -152,6 +156,9 @@ { "ppccache", "ppcasm_cacheable_memcpy()", direct_ppcasm_cacheable_memcpy, 0, 0}, #endif /* __LINUX__ */ #endif /* USE_PPCASM */ +#ifdef USE_ARMASM + { "arm", "armasm_memcpy()", direct_armasm_memcpy, 0, 0}, +#endif { NULL, NULL, NULL, 0, 0} };
_______________________________________________ directfb-dev mailing list directfb-dev@directfb.org http://mail.directfb.org/cgi-bin/mailman/listinfo/directfb-dev