> Try the following one. 1) -minline-all-stringops
> -mstringop-strategy=rep_8byte -O2 vs 2) -mstringop-strategy=libcall
> -O2.
>
> David
>
>
> #include <string.h>
> #include <stdio.h>
> #include <stdlib.h>
> #ifndef LEN
> #define LEN 16
> #endif
>
> void copy(char* s1, char* s2,int len) __attribute__((noinline));
> void copy(char* s1, char* s2,int len)
> {
> memcpy(s2,s1,len);
> }
I guess the catch here is that you force the copy to be noinline and thus you eliminate the benefits of inlined sequence. With inline stringop one saves regalloc and often can get rid of the alignment tests.

This is the script I use to tune the tables.

Honza

# test_stringop: time each -mstringop-strategy variant over a list of block
# sizes, for memcpy and then memset, on a char buffer and on a long buffer.
#
# Usage: test_stringop mode size cmdline
#   mode     32 or 64; appended to the compiler command as -m$mode
#   size     passed to the benchmark as -DMEMORY_COPIES
#   cmdline  compiler command, e.g. ./xgcc -B ./ -march=pentium3

# test COMPILER AVG_SIZE FLAGS [LABEL]
#
# Compiles the benchmark in the here-document with COMPILER plus FLAGS, runs
# the resulting ./a.out under /usr/bin/time, prints the elapsed time on the
# current output line and appends "time [label]" to the file named by $accum.
# Reads the globals type, STRINGOP, memsize and accum.
# Returns 1 (after printing " failed") when no executable was produced.
#
# NOTE: this function shadows the shell's `test` builtin, so conditionals in
# this script must be written with `[ ... ]`.
test() {
  # Remove any previous binary so a failed build cannot be timed by mistake.
  rm -f a.out

  # $1, $3 and $STRINGOP are intentionally unquoted: each may hold several
  # whitespace-separated words (or nothing at all).
  $1 -x c -O3 $3 "-DAVG_SIZE=$2" $STRINGOP "-DMEMORY_COPIES=$memsize" - <<END
#define BUFFER_SIZE (16*1024*1024 + AVG_SIZE*2)
/*#define MEMORY_COPIES (1024*1024*64*(long long)10)*/
$type t[BUFFER_SIZE];
int main()
{
  unsigned int i;
  for (i=0;i<((long long)MEMORY_COPIES + AVG_SIZE * 2 - 1)/AVG_SIZE*2;i++)
#ifdef test_memset
    __builtin_memset (t+(i*1024*1024+i*1)%(BUFFER_SIZE - AVG_SIZE*2), i, (AVG_SIZE + i) % (AVG_SIZE * 2 + 0));
#else
    __builtin_memcpy (t+(i*1024*1024+i*1)%(BUFFER_SIZE - AVG_SIZE*2), t+((i+1)*1024*1024*4+i*1)%(BUFFER_SIZE - AVG_SIZE *2), (AVG_SIZE + i) % (AVG_SIZE * 2 + 0));
#endif
  return 0;
}
END

  # Without this check a failed build would put an error message, not a
  # time, into the accumulator and could be selected as the "best" entry.
  if [ ! -x ./a.out ]; then
    printf ' %s' "failed"
    return 1
  fi

  elapsed=$(/usr/bin/time -f "%E" ./a.out 2>&1)
  printf ' %s' "$elapsed"
  printf '%s\n' "$elapsed${4:+ $4}" >>"$accum"
}

# testrow MODE COMPILER SIZE
#
# Prints one result row for block size SIZE: one timing per strategy, then
# the profile-use and dynamic variants, then the best labelled entry.
# The rep_8byte variants are only run when the global $mode is 64.
testrow() {
  # Start the row with an empty accumulator.
  : >"$accum"
  printf 'block size %7i' "$3"

  test "$2" "$3" "-mstringop-strategy=libcall" libcall
  test "$2" "$3" "-mstringop-strategy=rep_byte -malign-stringops" rep1
  test "$2" "$3" "-mstringop-strategy=rep_byte -mno-align-stringops" rep1noalign
  test "$2" "$3" "-mstringop-strategy=rep_4byte -malign-stringops" rep4
  test "$2" "$3" "-mstringop-strategy=rep_4byte -mno-align-stringops" rep4noalign
  if [ "$mode" = 64 ]; then
    test "$2" "$3" "-mstringop-strategy=rep_8byte -malign-stringops" rep8
    test "$2" "$3" "-mstringop-strategy=rep_8byte -mno-align-stringops" rep8noalign
  fi
  test "$2" "$3" "-mstringop-strategy=loop -malign-stringops" loop
  test "$2" "$3" "-mstringop-strategy=loop -mno-align-stringops" loopnoalign
  test "$2" "$3" "-mstringop-strategy=unrolled_loop -malign-stringops" unrl
  test "$2" "$3" "-mstringop-strategy=unrolled_loop -mno-align-stringops" unrlnoalign
  test "$2" "$3" "-mstringop-strategy=sse_loop -malign-stringops" sse
  test "$2" "$3" "-mstringop-strategy=sse_loop -mno-align-stringops -msse2" ssenoalign
  test "$2" "$3" "-mstringop-strategy=byte_loop" byte

  # Pick the winner before the profile/dynamic runs append their entries.
  # The comparison is textual, which is why the usage text asks for runs
  # shorter than a minute.
  best=$(LC_ALL=C sort "$accum" | head -n 1)

  # The profile-generate run only produces profile data; its output is hidden.
  test "$2" "$3" " -fprofile-generate" >/dev/null 2>&1
  test "$2" "$3" " -fprofile-use"
  test "$2" "$3" " -minline-stringops-dynamically"
  echo " best: $best"
}

# test_all_sizes MODE COMPILER
#
# Prints the column header matching $mode and then one testrow per size.
test_all_sizes() {
  if [ "$mode" = 64 ]; then
    echo " libcall rep1 noalg rep4 noalg rep8 noalg loop noalg unrl noalg sse noalg byte profiled dynamic"
  else
    echo " libcall rep1 noalg rep4 noalg loop noalg unrl noalg sse noalg byte profiled dynamic"
  fi
  #for size in 1 2 3 4 6 8 10 12 14 16 24 32 48 64 128 256 512 1024 4096 8192 81920 819200 8192000
  #for size in 8192000 819200 81920 8192 4096 2048 1024 512 256 128 64 48 32 24 16 14 12 10 8 6 5 4 3 2 1
  for size in 8192000 819200 81920 20480 8192 4096 2048 1024 512 256 128 64 48 32 24 16 14 12 10 8 6 4 1
  #for size in 128 256 1024 4096 8192 81920 819200
  do
    testrow "$1" "$2" "$size"
  done
}

# Prints the command-line help.
usage() {
  echo "Usage:"
  echo "test_stringop mode size cmdline"
  echo "mode is either 32 or 64"
  echo "size is amount of memory copied in each test. Should be chosen small enough so runtime is less than minute for each test and sorting works"
  echo "Example: test_stringop 32 640000000 ./xgcc -B ./ -march=pentium3"
}

# Validate the arguments before shifting anything off the list.
mode=${1-}
case $mode in
  32 | 64) ;;
  *)
    usage >&2
    exit 1
    ;;
esac
if [ "$#" -lt 3 ]; then
  usage >&2
  exit 1
fi
shift
memsize=$1
export memsize
shift
cmdline=$*

# Per-run scratch file holding "time label" lines for the current row.
accum=$(mktemp "${TMPDIR:-/tmp}/stringop_accum.XXXXXX") || exit 1
trap 'rm -f "$accum"' EXIT

echo "memcpy mode:$mode size:$memsize"
export STRINGOP=""
type=char
test_all_sizes "$mode" "$cmdline -m$mode"
echo "Aligned"
type=long
test_all_sizes "$mode" "$cmdline -m$mode"

echo "memset"
type=char
export STRINGOP="-Dtest_memset=1"
test_all_sizes "$mode" "$cmdline -m$mode"
echo "Aligned"
type=long
test_all_sizes "$mode" "$cmdline -m$mode"