Attached are the memcpy micro-benchmark, the CPU info, and the comparison results between rep movsq/movsb and memcpy on Atom and Ivy Bridge (IVB).
Thanks Ling 2012/10/23, ling.ma.prog...@gmail.com <ling.ma.prog...@gmail.com>: > From: Ma Ling <ling.ma.prog...@gmail.com> > > CISC code has higher instruction density, saving memory and > improving i-cache hit rate. However, decoding becomes a challenge: > only one multiple-uop (2~3) instruction can be decoded in one cycle, > and instructions containing more than 4 uops (rep movsq/b) have to be handled by > MS-ROM; > the process takes a long time and eats up the advantage from it for small sizes. > > > In order to avoid this disadvantage, we make use of general instruction code > for small size copies. The result shows it can get 1~2x improvement > on Core2, Nehalem, Sandy Bridge, Ivy Bridge, Atom, and Bulldozer as well. > > Signed-off-by: Ma Ling <ling.ma.prog...@gmail.com> > --- > In this version we decrease the warm-up distance from 512 to 256 for upcoming > CPUs, > which manages to reduce latency, but a long time is still consumed by decoding. > > Thanks > Ling > > arch/x86/lib/memcpy_64.S | 14 +++++++++++++- > 1 files changed, 13 insertions(+), 1 deletions(-) > > diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S > index 1c273be..6a24c8c 100644 > --- a/arch/x86/lib/memcpy_64.S > +++ b/arch/x86/lib/memcpy_64.S > @@ -5,7 +5,6 @@ > #include <asm/cpufeature.h> > #include <asm/dwarf2.h> > #include <asm/alternative-asm.h> > - > /* > * memcpy - Copy a memory block. > * > @@ -19,6 +18,15 @@ > */ > > /* > + * memcpy_c() and memcpy_c_e() use rep movsq/movsb respectively, > + * the instruction have to get micro ops from Microcode Sequencser Rom. > + * And the decode process take long latency, in order to avoid it, > + * we choose loop unrolling routine for small size. > + * Could vary the warm up distance. > + */ > + > + > +/* > * memcpy_c() - fast string ops (REP MOVSQ) based variant. 
> * > * This gets patched over the unrolled variant (below) via the > @@ -26,6 +34,8 @@ > */ > .section .altinstr_replacement, "ax", @progbits > .Lmemcpy_c: > + cmpq $256, %rdx > + jbe memcpy > movq %rdi, %rax > movq %rdx, %rcx > shrq $3, %rcx > @@ -46,6 +56,8 @@ > */ > .section .altinstr_replacement, "ax", @progbits > .Lmemcpy_c_e: > + cmpq $256, %rdx > + jbe memcpy > movq %rdi, %rax > movq %rdx, %rcx > rep movsb > -- > 1.6.5.2 > >
processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 28 model name : Intel(R) Atom(TM) CPU N450 @ 1.66GHz stepping : 10 microcode : 0x107 cpu MHz : 1000.000 cache size : 512 KB physical id : 0 siblings : 2 core id : 0 cpu cores : 1 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 10 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good nopl aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm movbe lahf_lm dts bogomips : 3324.62 clflush size : 64 cache_alignment : 64 address sizes : 32 bits physical, 48 bits virtual power management: processor : 1 vendor_id : GenuineIntel cpu family : 6 model : 28 model name : Intel(R) Atom(TM) CPU N450 @ 1.66GHz stepping : 10 microcode : 0x107 cpu MHz : 1000.000 cache size : 512 KB physical id : 0 siblings : 2 core id : 0 cpu cores : 1 apicid : 1 initial apicid : 1 fpu : yes fpu_exception : yes cpuid level : 10 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good nopl aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm movbe lahf_lm dts bogomips : 3324.62 clflush size : 64 cache_alignment : 64 address sizes : 32 bits physical, 48 bits virtual power management:
memcpy_new memcpy_c memcpy_c_e TPT: Len 0, alignment 0/ 0: 50 90 70 TPT: Len 4, alignment 0/ 0: 60 110 80 TPT: Len 8, alignment 0/ 0: 60 100 100 TPT: Len 12, alignment 0/ 0: 50 120 110 TPT: Len 16, alignment 0/ 0: 60 100 130 TPT: Len 20, alignment 0/ 0: 60 120 140 TPT: Len 24, alignment 0/ 0: 60 100 160 TPT: Len 28, alignment 0/ 0: 60 120 180 TPT: Len 32, alignment 0/ 0: 60 100 190 TPT: Len 36, alignment 0/ 0: 70 120 200 TPT: Len 40, alignment 0/ 0: 70 100 220 TPT: Len 44, alignment 0/ 0: 70 120 240 TPT: Len 48, alignment 0/ 0: 70 110 250 TPT: Len 52, alignment 0/ 0: 70 130 270 TPT: Len 56, alignment 0/ 0: 70 110 280 TPT: Len 60, alignment 0/ 0: 70 130 290 TPT: Len 0, alignment 4/ 0: 50 90 70 TPT: Len 0, alignment 0/ 4: 50 90 70 TPT: Len 0, alignment 0/ 0: 50 90 70 TPT: Len 0, alignment 0/ 8: 50 90 70 TPT: Len 0, alignment 8/ 0: 50 90 70 TPT: Len 0, alignment 0/16: 50 90 70 TPT: Len 0, alignment 16/ 0: 50 90 70 TPT: Len 64, alignment 4/ 0: 90 120 200 TPT: Len 64, alignment 0/ 4: 90 130 300 TPT: Len 64, alignment 0/ 0: 70 110 310 TPT: Len 64, alignment 0/ 8: 80 160 200 TPT: Len 64, alignment 8/ 0: 70 110 200 TPT: Len 64, alignment 0/16: 80 130 200 TPT: Len 64, alignment 16/ 0: 70 110 200 TPT: Len 128, alignment 4/ 0: 120 150 330 TPT: Len 128, alignment 0/ 4: 130 160 540 TPT: Len 128, alignment 0/ 0: 100 130 550 TPT: Len 128, alignment 0/ 8: 100 230 330 TPT: Len 128, alignment 8/ 0: 100 120 330 TPT: Len 128, alignment 0/16: 100 170 330 TPT: Len 128, alignment 16/ 0: 90 120 330 TPT: Len 192, alignment 4/ 0: 150 180 450 TPT: Len 192, alignment 0/ 4: 160 190 780 TPT: Len 192, alignment 0/ 0: 110 140 790 TPT: Len 192, alignment 0/ 8: 110 300 450 TPT: Len 192, alignment 8/ 0: 110 140 450 TPT: Len 192, alignment 0/16: 110 220 450 TPT: Len 192, alignment 16/ 0: 110 140 450 TPT: Len 256, alignment 4/ 0: 180 210 610 TPT: Len 256, alignment 0/ 4: 190 220 1050 TPT: Len 256, alignment 0/ 0: 130 160 180 TPT: Len 256, alignment 0/ 8: 140 370 610 TPT: Len 256, alignment 8/ 0: 130 
160 610 TPT: Len 256, alignment 0/16: 140 260 630 TPT: Len 256, alignment 16/ 0: 130 160 630
processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz stepping : 9 microcode : 0x12 cpu MHz : 3292.525 cache size : 6144 KB physical id : 0 siblings : 4 core id : 0 cpu cores : 4 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms bogomips : 6585.05 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: processor : 1 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz stepping : 9 microcode : 0x12 cpu MHz : 3292.525 cache size : 6144 KB physical id : 0 siblings : 4 core id : 1 cpu cores : 4 apicid : 2 initial apicid : 2 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms bogomips : 6585.05 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: processor : 2 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz 
stepping : 9 microcode : 0x12 cpu MHz : 3292.525 cache size : 6144 KB physical id : 0 siblings : 4 core id : 2 cpu cores : 4 apicid : 4 initial apicid : 4 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms bogomips : 6585.05 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: processor : 3 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz stepping : 9 microcode : 0x12 cpu MHz : 3292.525 cache size : 6144 KB physical id : 0 siblings : 4 core id : 3 cpu cores : 4 apicid : 6 initial apicid : 6 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms bogomips : 6585.05 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management:
memcpy_new memcpy_c memcpy_c_e TPT: Len 0, alignment 0/ 0: 24 92 76 TPT: Len 4, alignment 0/ 0: 24 72 44 TPT: Len 8, alignment 0/ 0: 24 92 44 TPT: Len 12, alignment 0/ 0: 28 72 48 TPT: Len 16, alignment 0/ 0: 28 92 44 TPT: Len 20, alignment 0/ 0: 24 72 48 TPT: Len 24, alignment 0/ 0: 24 92 44 TPT: Len 28, alignment 0/ 0: 24 72 48 TPT: Len 32, alignment 0/ 0: 28 92 48 TPT: Len 36, alignment 0/ 0: 28 72 48 TPT: Len 40, alignment 0/ 0: 28 92 44 TPT: Len 44, alignment 0/ 0: 24 72 44 TPT: Len 48, alignment 0/ 0: 24 92 48 TPT: Len 52, alignment 0/ 0: 24 72 44 TPT: Len 56, alignment 0/ 0: 24 92 44 TPT: Len 60, alignment 0/ 0: 24 72 48 TPT: Len 0, alignment 4/ 0: 24 92 72 TPT: Len 0, alignment 0/ 4: 24 92 72 TPT: Len 0, alignment 0/ 0: 28 92 72 TPT: Len 0, alignment 0/ 8: 24 92 76 TPT: Len 0, alignment 8/ 0: 24 92 72 TPT: Len 0, alignment 0/16: 24 92 76 TPT: Len 0, alignment 16/ 0: 24 92 76 TPT: Len 64, alignment 4/ 0: 32 92 44 TPT: Len 64, alignment 0/ 4: 28 96 44 TPT: Len 64, alignment 0/ 0: 28 92 48 TPT: Len 64, alignment 0/ 8: 28 96 44 TPT: Len 64, alignment 8/ 0: 28 92 48 TPT: Len 64, alignment 0/16: 32 92 44 TPT: Len 64, alignment 16/ 0: 28 92 44 TPT: Len 128, alignment 4/ 0: 36 96 60 TPT: Len 128, alignment 0/ 4: 36 108 56 TPT: Len 128, alignment 0/ 0: 36 96 60 TPT: Len 128, alignment 0/ 8: 36 108 56 TPT: Len 128, alignment 8/ 0: 36 96 56 TPT: Len 128, alignment 0/16: 36 104 56 TPT: Len 128, alignment 16/ 0: 36 96 60 TPT: Len 192, alignment 4/ 0: 40 108 60 TPT: Len 192, alignment 0/ 4: 40 120 60 TPT: Len 192, alignment 0/ 0: 40 108 60 TPT: Len 192, alignment 0/ 8: 40 116 60 TPT: Len 192, alignment 8/ 0: 40 104 60 TPT: Len 192, alignment 0/16: 40 116 60 TPT: Len 192, alignment 16/ 0: 40 104 60 TPT: Len 256, alignment 4/ 0: 52 116 64 TPT: Len 256, alignment 0/ 4: 56 136 56 TPT: Len 256, alignment 0/ 0: 52 112 68 TPT: Len 256, alignment 0/ 8: 56 128 64 TPT: Len 256, alignment 8/ 0: 52 112 64 TPT: Len 256, alignment 0/16: 52 128 64 TPT: Len 256, alignment 16/ 0: 52 116 
64
/*
 * memcpy micro-benchmark.
 *
 * Times three copy routines (memcpy_new, memcpy_c, memcpy_c_e) with the
 * time-stamp counter and prints, for every length/alignment combination,
 * the smallest cycle count observed over repeat_one_test runs.
 *
 * The copy routines are written as sequences of basic asm statements that
 * read their arguments directly from %rdi, %rsi and %rdx, so this file is
 * only meaningful on x86-64 with a calling convention that passes the
 * first three integer arguments in those registers.
 *
 * NOTE(review): the asm routines rely on the compiler not touching
 * %rdi/%rsi/%rdx or the flags between function entry and the last asm
 * statement, and on each routine being emitted exactly once (the .L labels
 * are not unique per expansion). Confirm the build flags in use keep that
 * true.
 */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long int hp_timing_t;

#define MAXSAMPLESTPT 1000
#define MAXCOPYSIZE (1024 * 1024 * 100)
#define ORIG 0
#define NEW 1
/* Alignment requested for the two test buffers; MAXCOPYSIZE is a multiple. */
#define BUF_ALIGNMENT 4096

static char *buf1 = NULL;
static char *buf2 = NULL;
static int repeat_one_test = 32;

/* Cycles added to every start stamp before a difference is taken. */
hp_timing_t _dl_hp_timing_overhead;

/* Read the time-stamp counter: rdtsc returns the low half in eax, high in edx. */
static inline hp_timing_t hp_timing_now(void)
{
	unsigned long long hi, lo;

	__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
	return hi << 32 | lo;
}

#define HP_TIMING_NOW(Var) ((Var) = hp_timing_now())

#define HP_TIMING_DIFF(Diff, Start, End) ((Diff) = ((End) - (Start)))

#define HP_TIMING_TOTAL(total_time, start, end)				\
	do {								\
		hp_timing_t tmptime;					\
		HP_TIMING_DIFF(tmptime,					\
			       (start) + _dl_hp_timing_overhead, (end));	\
		(total_time) += tmptime;				\
	} while (0)

#define HP_TIMING_BEST(best_time, start, end)				\
	do {								\
		hp_timing_t tmptime;					\
		HP_TIMING_DIFF(tmptime,					\
			       (start) + _dl_hp_timing_overhead, (end));	\
		if ((best_time) > tmptime)				\
			(best_time) = tmptime;				\
	} while (0)

/*
 * The length is size_t rather than int: the asm bodies read the full 64-bit
 * %rdx, and the upper half of a register carrying an int argument is not
 * guaranteed to be zero.
 */
void memcpy_new(char *dst, char *src, size_t len);
void memcpy_c(char *dst, char *src, size_t len);
void memcpy_c_e(char *dst, char *src, size_t len);

/* Routine currently under test; set by do_test() before each measurement. */
void (*do_memcpy)(char *dst, char *src, size_t len);

/*
 * Run do_memcpy(dst, src, len) repeat_one_test times and print the best
 * (smallest) elapsed cycle count, preceded by a tab.
 */
static void do_one_test(char *dst, char *src, size_t len)
{
	hp_timing_t start;
	hp_timing_t stop;
	hp_timing_t best_time = ~(hp_timing_t) 0;
	int i;

	for (i = 0; i < repeat_one_test; ++i) {
		HP_TIMING_NOW(start);
		do_memcpy(dst, src, len);
		HP_TIMING_NOW(stop);
		HP_TIMING_BEST(best_time, start, stop);
	}

	printf("\t%llu", best_time);
}

/*
 * Print one result row: copy len bytes from buf1 + align1 to buf2 + align2
 * with each of the three routines in turn.
 */
static void do_test(size_t align1, size_t align2, size_t len)
{
	char *s1 = buf1 + align1;
	char *s2 = buf2 + align2;

	printf("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);

	do_memcpy = memcpy_new;
	do_one_test(s2, s1, len);
	do_memcpy = memcpy_c;
	do_one_test(s2, s1, len);
	do_memcpy = memcpy_c_e;
	do_one_test(s2, s1, len);

	putchar('\n');
}

/*
 * Allocate both MAXCOPYSIZE buffers and write one byte in every 64-byte
 * stride of each. Exits with a diagnostic if either allocation fails.
 */
static void test_init(void)
{
	size_t i;

	buf1 = aligned_alloc(BUF_ALIGNMENT, MAXCOPYSIZE);
	buf2 = aligned_alloc(BUF_ALIGNMENT, MAXCOPYSIZE);
	if (buf1 == NULL || buf2 == NULL) {
		fprintf(stderr, "failed to allocate test buffers\n");
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < MAXCOPYSIZE; i += 64)
		buf1[i] = buf2[i] = (char) (i & 0xff);
}

/*
 * rep stosq with a count of len / 8.
 * Not called anywhere in this file.
 */
void memset_c(char *dst, char *src, size_t len)
{
	__asm__("mov %rdx, %rcx");
	__asm__("shr $3, %rcx");
	__asm__("rep stosq");
}

/*
 * Loop of eight 16-byte movdqa stores per iteration.
 * Not called anywhere in this file.
 * NOTE(review): %rdi is never advanced inside the loop, so every iteration
 * stores to the same 128 bytes - confirm whether that is intended.
 */
void memset_2(char *dst, char *src, size_t len)
{
	__asm__("sub $128, %rdx");
	__asm__("1:");
	__asm__("sub $128, %rdx");
	__asm__("movdqa %xmm0, (%rdi)");
	__asm__("movdqa %xmm0, 16(%rdi)");
	__asm__("movdqa %xmm0, 32(%rdi)");
	__asm__("movdqa %xmm0, 48(%rdi)");
	__asm__("movdqa %xmm0, 64(%rdi)");
	__asm__("movdqa %xmm0, 80(%rdi)");
	__asm__("movdqa %xmm0, 96(%rdi)");
	__asm__("movdqa %xmm0, 112(%rdi)");
	__asm__("jae 1b");
}

/*
 * rep movsq for len / 8 quadwords, then rep movsb for the len % 8 tail.
 */
void memcpy_c(char *dst, char *src, size_t len)
{
	__asm__("mov %rdi, %rax");
	__asm__("movl %edx, %ecx");
	__asm__("shrl $3, %ecx");
	__asm__("andl $7, %edx");
	__asm__("rep movsq");
	__asm__("movl %edx, %ecx");
	__asm__("rep movsb");
	__asm__("1:");
}

/*
 * A single rep movsb of len bytes.
 */
void memcpy_c_e(char *dst, char *src, size_t len)
{
	__asm__("movq %rdi, %rax");
	__asm__("movq %rdx, %rcx");
	__asm__("rep movsb");
}

/*
 * Unrolled copy built from general-purpose moves: 32 bytes per loop
 * iteration (forward or backward), followed by overlapping head/tail moves
 * for the remaining 0..31 bytes.
 */
void memcpy_new(char *dst, char *src, size_t len)
{
	__asm__("movq %rdi, %rax");
	__asm__("cmpq $0x20, %rdx");
	__asm__("jb .Lhandle_tail");

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 */
	__asm__("cmp %dil, %sil");
	__asm__("jl .Lcopy_backward");
	__asm__("subq $0x20, %rdx");
	__asm__(".Lcopy_forward_loop:");
	/* The flags set here are consumed by the jae that closes the loop. */
	__asm__("subq $0x20, %rdx");

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	__asm__("movq 0*8(%rsi), %r8");
	__asm__("movq 1*8(%rsi), %r9");
	__asm__("movq 2*8(%rsi), %r10");
	__asm__("movq 3*8(%rsi), %r11");
	__asm__("leaq 4*8(%rsi), %rsi");

	__asm__("movq %r8, 0*8(%rdi)");
	__asm__("movq %r9, 1*8(%rdi)");
	__asm__("movq %r10, 2*8(%rdi)");
	__asm__("movq %r11, 3*8(%rdi)");
	__asm__("leaq 4*8(%rdi), %rdi");
	__asm__("jae .Lcopy_forward_loop");
	__asm__("addl $0x20, %edx");
	__asm__("jmp .Lhandle_tail");

	__asm__(".Lcopy_backward:");
	/*
	 * Calculate copy position to tail.
	 */
	__asm__("addq %rdx, %rsi");
	__asm__("addq %rdx, %rdi");
	__asm__("subq $0x20, %rdx");
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPS in the same 16bytes trunk.
	 */
	__asm__(".p2align 4");
	__asm__(".Lcopy_backward_loop:");
	__asm__("subq $0x20, %rdx");
	__asm__("movq -1*8(%rsi), %r8");
	__asm__("movq -2*8(%rsi), %r9");
	__asm__("movq -3*8(%rsi), %r10");
	__asm__("movq -4*8(%rsi), %r11");
	__asm__("leaq -4*8(%rsi), %rsi");
	__asm__("movq %r8, -1*8(%rdi)");
	__asm__("movq %r9, -2*8(%rdi)");
	__asm__("movq %r10, -3*8(%rdi)");
	__asm__("movq %r11, -4*8(%rdi)");
	__asm__("leaq -4*8(%rdi), %rdi");
	__asm__("jae .Lcopy_backward_loop");

	/*
	 * Calculate copy position to head.
	 */
	__asm__("addl $0x20, %edx");
	__asm__("subq %rdx, %rsi");
	__asm__("subq %rdx, %rdi");
	__asm__(".Lhandle_tail:");
	__asm__("cmpl $16, %edx");
	__asm__("jb .Lless_16bytes");

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	__asm__("movq 0*8(%rsi), %r8");
	__asm__("movq 1*8(%rsi), %r9");
	__asm__("movq -2*8(%rsi, %rdx), %r10");
	__asm__("movq -1*8(%rsi, %rdx), %r11");
	__asm__("movq %r8, 0*8(%rdi)");
	__asm__("movq %r9, 1*8(%rdi)");
	__asm__("movq %r10, -2*8(%rdi, %rdx)");
	__asm__("movq %r11, -1*8(%rdi, %rdx)");
	__asm__("jmp .Lend");
	__asm__(".p2align 4");
	__asm__(".Lless_16bytes:");
	__asm__("cmpl $8, %edx");
	__asm__("jb .Lless_8bytes");
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	__asm__("movq 0*8(%rsi), %r8");
	__asm__("movq -1*8(%rsi, %rdx), %r9");
	__asm__("movq %r8, 0*8(%rdi)");
	__asm__("movq %r9, -1*8(%rdi, %rdx)");
	__asm__("jmp .Lend");
	__asm__(".p2align 4");
	__asm__(".Lless_8bytes:");
	__asm__("cmpl $4, %edx");
	__asm__("jb .Lless_3bytes");

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	__asm__("movl (%rsi), %ecx");
	__asm__("movl -4(%rsi, %rdx), %r8d");
	__asm__("movl %ecx, (%rdi)");
	__asm__("movl %r8d, -4(%rdi, %rdx)");
	__asm__("jmp .Lend");
	__asm__(".p2align 4");
	__asm__(".Lless_3bytes:");
	/* Borrow here means the remaining length was 0: nothing to copy. */
	__asm__("subl $1, %edx");
	__asm__("jb .Lend");
	/*
	 * Move data from 1 bytes to 3 bytes.
	 */
	__asm__("movzbl (%rsi), %ecx");
	/* ZF is still the result of the subl above: zero means exactly 1 byte. */
	__asm__("jz .Lstore_1byte");
	__asm__("movzbq 1(%rsi), %r8");
	__asm__("movzbq (%rsi, %rdx), %r9");
	__asm__("movb %r8b, 1(%rdi)");
	__asm__("movb %r9b, (%rdi, %rdx)");
	__asm__(".Lstore_1byte:");
	__asm__("movb %cl, (%rdi)");
	__asm__(".Lend:");
}

/*
 * Print the table header, then one row per length 0..60 (step 4) at
 * alignment 0/0 and, for each length 0..512 (step 64), one row for each of
 * seven source/destination alignment pairs.
 */
int main(void)
{
	int i;

	test_init();

	printf("%23s", "");
	printf("\t%s\t%s\t%s\n", "memcpy_new", "memcpy_c", "memcpy_c_e");

	for (i = 0; i < 64; i += 4)
		do_test(0, 0, i);

	for (i = 0; i < 576; i += 64) {
		do_test(4, 0, i);
		do_test(0, 4, i);
		do_test(0, 0, i);
		do_test(0, 8, i);
		do_test(8, 0, i);
		do_test(0, 8 * 2, i);
		do_test(8 * 2, 0, i);
	}

	return 0;
}