Re: [PATCH RFC V2] [x86] Optimize small size memcpy by avoding long latency from decode stage

Ling Ma Mon, 22 Oct 2012 02:23:36 -0700

Attached are the memcpy micro-benchmark, the CPU info, and the comparison
results between rep movsq/b and the new memcpy on Atom and Ivy Bridge (ivb).


Thanks
Ling


2012/10/23, ling.ma.prog...@gmail.com <ling.ma.prog...@gmail.com>:
> From: Ma Ling <ling.ma.prog...@gmail.com>
>
> CISC code has higher instruction density, saving memory and
> improving the i-cache hit rate. However, decoding becomes a challenge:
> only one multiple-uop (2~3 uops) instruction can be decoded per cycle,
> and instructions containing more than 4 uops (rep movsq/b) have to be
> handled by the MS-ROM;
> that process takes a long time and eats up the advantage for small sizes.
>
>
> In order to avoid this disadvantage, we make use of general instruction code
> for small size copies. The results show a 1~2x improvement
> on Core2, Nehalem, Sandy Bridge, Ivy Bridge, Atom, and Bulldozer as well.
>
> Signed-off-by: Ma Ling <ling.ma.prog...@gmail.com>
> ---
> In this version we decrease the warm-up distance from 512 to 256 for upcoming
> CPUs,
> which manage to reduce the latency, although decoding still takes a long time.
>
> Thanks
> Ling
>
>  arch/x86/lib/memcpy_64.S |   14 +++++++++++++-
>  1 files changed, 13 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index 1c273be..6a24c8c 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -5,7 +5,6 @@
>  #include <asm/cpufeature.h>
>  #include <asm/dwarf2.h>
>  #include <asm/alternative-asm.h>
> -
>  /*
>   * memcpy - Copy a memory block.
>   *
> @@ -19,6 +18,15 @@
>   */
>
>  /*
> + * memcpy_c() and memcpy_c_e() use rep movsq/movsb respectively;
> + * these instructions have to fetch their micro-ops from the Microcode
> + * Sequencer ROM, and that decode process has a long latency. To avoid it,
> + * we choose the loop-unrolling routine for small sizes.
> + * The warm-up distance could vary.
> + */
> +
> +
> +/*
>   * memcpy_c() - fast string ops (REP MOVSQ) based variant.
>   *
>   * This gets patched over the unrolled variant (below) via the
> @@ -26,6 +34,8 @@
>   */
>       .section .altinstr_replacement, "ax", @progbits
>  .Lmemcpy_c:
> +     cmpq $256, %rdx
> +     jbe  memcpy     
>       movq %rdi, %rax
>       movq %rdx, %rcx
>       shrq $3, %rcx
> @@ -46,6 +56,8 @@
>   */
>       .section .altinstr_replacement, "ax", @progbits
>  .Lmemcpy_c_e:
> +     cmpq $256, %rdx
> +     jbe  memcpy
>       movq %rdi, %rax
>       movq %rdx, %rcx
>       rep movsb
> --
> 1.6.5.2
>
>

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 28
model name      : Intel(R) Atom(TM) CPU N450   @ 1.66GHz
stepping        : 10
microcode       : 0x107
cpu MHz         : 1000.000
cache size      : 512 KB
physical id     : 0
siblings        : 2
core id         : 0
cpu cores       : 1
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 10
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc 
arch_perfmon pebs bts rep_good nopl aperfmperf pni dtes64 monitor ds_cpl est 
tm2 ssse3 cx16 xtpr pdcm movbe lahf_lm dts
bogomips        : 3324.62
clflush size    : 64
cache_alignment : 64
address sizes   : 32 bits physical, 48 bits virtual
power management:

processor       : 1
vendor_id       : GenuineIntel
cpu family      : 6
model           : 28
model name      : Intel(R) Atom(TM) CPU N450   @ 1.66GHz
stepping        : 10
microcode       : 0x107
cpu MHz         : 1000.000
cache size      : 512 KB
physical id     : 0
siblings        : 2
core id         : 0
cpu cores       : 1
apicid          : 1
initial apicid  : 1
fpu             : yes
fpu_exception   : yes
cpuid level     : 10
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc 
arch_perfmon pebs bts rep_good nopl aperfmperf pni dtes64 monitor ds_cpl est 
tm2 ssse3 cx16 xtpr pdcm movbe lahf_lm dts
bogomips        : 3324.62
clflush size    : 64
cache_alignment : 64
address sizes   : 32 bits physical, 48 bits virtual
power management:

                        memcpy_new      memcpy_c        memcpy_c_e
TPT: Len    0, alignment  0/ 0: 50      90      70
TPT: Len    4, alignment  0/ 0: 60      110     80
TPT: Len    8, alignment  0/ 0: 60      100     100
TPT: Len   12, alignment  0/ 0: 50      120     110
TPT: Len   16, alignment  0/ 0: 60      100     130
TPT: Len   20, alignment  0/ 0: 60      120     140
TPT: Len   24, alignment  0/ 0: 60      100     160
TPT: Len   28, alignment  0/ 0: 60      120     180
TPT: Len   32, alignment  0/ 0: 60      100     190
TPT: Len   36, alignment  0/ 0: 70      120     200
TPT: Len   40, alignment  0/ 0: 70      100     220
TPT: Len   44, alignment  0/ 0: 70      120     240
TPT: Len   48, alignment  0/ 0: 70      110     250
TPT: Len   52, alignment  0/ 0: 70      130     270
TPT: Len   56, alignment  0/ 0: 70      110     280
TPT: Len   60, alignment  0/ 0: 70      130     290
TPT: Len    0, alignment  4/ 0: 50      90      70
TPT: Len    0, alignment  0/ 4: 50      90      70
TPT: Len    0, alignment  0/ 0: 50      90      70
TPT: Len    0, alignment  0/ 8: 50      90      70
TPT: Len    0, alignment  8/ 0: 50      90      70
TPT: Len    0, alignment  0/16: 50      90      70
TPT: Len    0, alignment 16/ 0: 50      90      70
TPT: Len   64, alignment  4/ 0: 90      120     200
TPT: Len   64, alignment  0/ 4: 90      130     300
TPT: Len   64, alignment  0/ 0: 70      110     310
TPT: Len   64, alignment  0/ 8: 80      160     200
TPT: Len   64, alignment  8/ 0: 70      110     200
TPT: Len   64, alignment  0/16: 80      130     200
TPT: Len   64, alignment 16/ 0: 70      110     200
TPT: Len  128, alignment  4/ 0: 120     150     330
TPT: Len  128, alignment  0/ 4: 130     160     540
TPT: Len  128, alignment  0/ 0: 100     130     550
TPT: Len  128, alignment  0/ 8: 100     230     330
TPT: Len  128, alignment  8/ 0: 100     120     330
TPT: Len  128, alignment  0/16: 100     170     330
TPT: Len  128, alignment 16/ 0: 90      120     330
TPT: Len  192, alignment  4/ 0: 150     180     450
TPT: Len  192, alignment  0/ 4: 160     190     780
TPT: Len  192, alignment  0/ 0: 110     140     790
TPT: Len  192, alignment  0/ 8: 110     300     450
TPT: Len  192, alignment  8/ 0: 110     140     450
TPT: Len  192, alignment  0/16: 110     220     450
TPT: Len  192, alignment 16/ 0: 110     140     450
TPT: Len  256, alignment  4/ 0: 180     210     610
TPT: Len  256, alignment  0/ 4: 190     220     1050
TPT: Len  256, alignment  0/ 0: 130     160     180
TPT: Len  256, alignment  0/ 8: 140     370     610
TPT: Len  256, alignment  8/ 0: 130     160     610
TPT: Len  256, alignment  0/16: 140     260     630
TPT: Len  256, alignment 16/ 0: 130     160     630

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 58
model name      : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz
stepping        : 9
microcode       : 0x12
cpu MHz         : 3292.525
cache size      : 6144 KB
physical id     : 0
siblings        : 4
core id         : 0
cpu cores       : 4
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 13
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm 
constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc 
aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr 
pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c 
rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi 
flexpriority ept vpid fsgsbase smep erms
bogomips        : 6585.05
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

processor       : 1
vendor_id       : GenuineIntel
cpu family      : 6
model           : 58
model name      : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz
stepping        : 9
microcode       : 0x12
cpu MHz         : 3292.525
cache size      : 6144 KB
physical id     : 0
siblings        : 4
core id         : 1
cpu cores       : 4
apicid          : 2
initial apicid  : 2
fpu             : yes
fpu_exception   : yes
cpuid level     : 13
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm 
constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc 
aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr 
pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c 
rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi 
flexpriority ept vpid fsgsbase smep erms
bogomips        : 6585.05
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

processor       : 2
vendor_id       : GenuineIntel
cpu family      : 6
model           : 58
model name      : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz
stepping        : 9
microcode       : 0x12
cpu MHz         : 3292.525
cache size      : 6144 KB
physical id     : 0
siblings        : 4
core id         : 2
cpu cores       : 4
apicid          : 4
initial apicid  : 4
fpu             : yes
fpu_exception   : yes
cpuid level     : 13
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm 
constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc 
aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr 
pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c 
rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi 
flexpriority ept vpid fsgsbase smep erms
bogomips        : 6585.05
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

processor       : 3
vendor_id       : GenuineIntel
cpu family      : 6
model           : 58
model name      : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz
stepping        : 9
microcode       : 0x12
cpu MHz         : 3292.525
cache size      : 6144 KB
physical id     : 0
siblings        : 4
core id         : 3
cpu cores       : 4
apicid          : 6
initial apicid  : 6
fpu             : yes
fpu_exception   : yes
cpuid level     : 13
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm 
constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc 
aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr 
pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c 
rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi 
flexpriority ept vpid fsgsbase smep erms
bogomips        : 6585.05
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

                        memcpy_new      memcpy_c        memcpy_c_e
TPT: Len    0, alignment  0/ 0: 24      92      76
TPT: Len    4, alignment  0/ 0: 24      72      44
TPT: Len    8, alignment  0/ 0: 24      92      44
TPT: Len   12, alignment  0/ 0: 28      72      48
TPT: Len   16, alignment  0/ 0: 28      92      44
TPT: Len   20, alignment  0/ 0: 24      72      48
TPT: Len   24, alignment  0/ 0: 24      92      44
TPT: Len   28, alignment  0/ 0: 24      72      48
TPT: Len   32, alignment  0/ 0: 28      92      48
TPT: Len   36, alignment  0/ 0: 28      72      48
TPT: Len   40, alignment  0/ 0: 28      92      44
TPT: Len   44, alignment  0/ 0: 24      72      44
TPT: Len   48, alignment  0/ 0: 24      92      48
TPT: Len   52, alignment  0/ 0: 24      72      44
TPT: Len   56, alignment  0/ 0: 24      92      44
TPT: Len   60, alignment  0/ 0: 24      72      48
TPT: Len    0, alignment  4/ 0: 24      92      72
TPT: Len    0, alignment  0/ 4: 24      92      72
TPT: Len    0, alignment  0/ 0: 28      92      72
TPT: Len    0, alignment  0/ 8: 24      92      76
TPT: Len    0, alignment  8/ 0: 24      92      72
TPT: Len    0, alignment  0/16: 24      92      76
TPT: Len    0, alignment 16/ 0: 24      92      76
TPT: Len   64, alignment  4/ 0: 32      92      44
TPT: Len   64, alignment  0/ 4: 28      96      44
TPT: Len   64, alignment  0/ 0: 28      92      48
TPT: Len   64, alignment  0/ 8: 28      96      44
TPT: Len   64, alignment  8/ 0: 28      92      48
TPT: Len   64, alignment  0/16: 32      92      44
TPT: Len   64, alignment 16/ 0: 28      92      44
TPT: Len  128, alignment  4/ 0: 36      96      60
TPT: Len  128, alignment  0/ 4: 36      108     56
TPT: Len  128, alignment  0/ 0: 36      96      60
TPT: Len  128, alignment  0/ 8: 36      108     56
TPT: Len  128, alignment  8/ 0: 36      96      56
TPT: Len  128, alignment  0/16: 36      104     56
TPT: Len  128, alignment 16/ 0: 36      96      60
TPT: Len  192, alignment  4/ 0: 40      108     60
TPT: Len  192, alignment  0/ 4: 40      120     60
TPT: Len  192, alignment  0/ 0: 40      108     60
TPT: Len  192, alignment  0/ 8: 40      116     60
TPT: Len  192, alignment  8/ 0: 40      104     60
TPT: Len  192, alignment  0/16: 40      116     60
TPT: Len  192, alignment 16/ 0: 40      104     60
TPT: Len  256, alignment  4/ 0: 52      116     64
TPT: Len  256, alignment  0/ 4: 56      136     56
TPT: Len  256, alignment  0/ 0: 52      112     68
TPT: Len  256, alignment  0/ 8: 56      128     64
TPT: Len  256, alignment  8/ 0: 52      112     64
TPT: Len  256, alignment  0/16: 52      128     64
TPT: Len  256, alignment 16/ 0: 52      116     64

#include<stdio.h>
#include <stdlib.h>


/* Time-stamp value as produced by HP_TIMING_NOW (a 64-bit TSC reading). */
typedef unsigned long long int hp_timing_t;
/* NOTE(review): MAXSAMPLESTPT is not referenced anywhere in the visible file. */
#define  MAXSAMPLESTPT        1000
/* Size in bytes of each of the two benchmark buffers (100 MiB). */
#define  MAXCOPYSIZE          (1024 * 1024 * 100)
/* NOTE(review): ORIG and NEW are not referenced anywhere in the visible file. */
#define  ORIG  0
#define  NEW   1
/* Copy source buffer; allocated in test_init(), offset by align1 in do_test(). */
static char* buf1 = NULL;
/* Copy destination buffer; allocated in test_init(), offset by align2 in do_test(). */
static char* buf2 = NULL;
/* Number of timed calls per measurement; do_one_test() reports the fastest one. */
static int repeat_one_test = 32;

/*
 * Fixed cost added to the start stamp before computing a difference.
 * It is never assigned in the visible file, so as a file-scope object it
 * stays zero and no overhead is subtracted.
 */
hp_timing_t _dl_hp_timing_overhead;
/*
 * Read the time-stamp counter into Var.  rdtsc returns the low half in
 * eax and the high half in edx; both are combined into one 64-bit value.
 * Uses a GNU statement expression.
 */
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

/* Diff = End - Start. */
#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))
/*
 * Add (end - (start + _dl_hp_timing_overhead)) to total_time.
 * NOTE(review): not used anywhere in the visible file.
 */
#define HP_TIMING_TOTAL(total_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
	total_time += tmptime;						\
    }									\
  while (0)

/*
 * Lower best_time to (end - (start + _dl_hp_timing_overhead)) when that
 * interval is smaller than the current best_time.
 */
#define HP_TIMING_BEST(best_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
      if (best_time > tmptime)						\
	best_time = tmptime;						\
    }									\
  while (0)


/* The three copy routines compared by the benchmark (defined further down). */
void memcpy_new(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
void memcpy_c_e(char *dst, char *src, int len);
/* Routine currently under test; do_test() points it at each variant in turn. */
void (*do_memcpy)(char *dst, char *src, int len);

/*
 * Time the routine currently selected through do_memcpy.
 *
 * The copy of LEN bytes from SRC to DST is repeated repeat_one_test times;
 * each call is bracketed by two time-stamp reads and the smallest interval
 * is printed as a tab-prefixed decimal number (no trailing newline).
 */
static void
do_one_test ( char *dst, char *src,
	     size_t len)
{
      hp_timing_t start;
      hp_timing_t stop;
      /* Start from the largest representable value so any sample is smaller. */
      hp_timing_t best_time = ~ (hp_timing_t) 0;
      /* Same type as repeat_one_test, so the comparison is not mixed-sign. */
      int i;

      for (i = 0; i < repeat_one_test; ++i)
	{
	  HP_TIMING_NOW (start);
	  do_memcpy ( dst, src, len);
	  HP_TIMING_NOW (stop);
	  HP_TIMING_BEST (best_time, start, stop);
	}

      /* size_t is unsigned, so the matching conversion is %zu, not %zd. */
      printf ("\t%zu", (size_t) best_time);
}

/*
 * Print one result row: the copy length and the two alignments, followed by
 * the best time of memcpy_new, memcpy_c and memcpy_c_e (in that order).
 *
 * align1 is the byte offset applied to the source buffer (buf1),
 * align2 the byte offset applied to the destination buffer (buf2),
 * len    the number of bytes copied by each call.
 */
static void
do_test (size_t align1, size_t align2, size_t len)
{
  char *s1 = buf1 + align1;	/* source */
  char *s2 = buf2 + align2;	/* destination */

  /* All three arguments are size_t (unsigned), hence %zu rather than %zd. */
  printf ("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);

  do_memcpy = memcpy_new;
  do_one_test (s2, s1, len);
  do_memcpy = memcpy_c;
  do_one_test (s2, s1, len);
  do_memcpy = memcpy_c_e;
  do_one_test (s2, s1, len);

  putchar ('\n');
}

/*
 * Allocate the two benchmark buffers and write one byte every 64 bytes
 * in each of them (presumably to touch every cache line and fault the
 * pages in before any timing starts).
 *
 * Returns 0 on success.  If either allocation fails the process exits
 * with EXIT_FAILURE, because nothing useful can be measured without the
 * buffers.  The return type is now spelled out; the original relied on
 * implicit int and fell off the end without returning a value.
 */
static int test_init(void)
{
  int i;

  buf1 = valloc(MAXCOPYSIZE);
  buf2 = valloc(MAXCOPYSIZE);
  if (buf1 == NULL || buf2 == NULL) {
        perror("valloc");
        exit(EXIT_FAILURE);
  }

  for (i = 0; i < MAXCOPYSIZE ; i = i + 64) {
        buf1[i] = buf2[i] = i & 0xff;
  }

  return 0;
}

/*
 * Store-string experiment: executes "rep stosq" with a count of rdx / 8.
 *
 * NOTE(review): not called anywhere in the visible file.
 * NOTE(review): these are basic asm statements with no operands or
 * clobbers; they assume dst is still in %rdi and len in %rdx when they
 * execute (the x86-64 SysV argument registers) - confirm for the compiler
 * and flags in use.  %rax, the value stored by stosq, is never set here,
 * and src is unused.
 */
void memset_c(char *dst, char *src, int len)
{
	/* rcx = len / 8: number of 8-byte stores for rep stosq. */
	__asm__("mov %rdx, %rcx");
	__asm__("shr $3, %rcx");
	__asm__("rep stosq");
}
/*
 * SSE store experiment: a loop of eight 16-byte movdqa stores (128 bytes)
 * per iteration, counted down in %rdx in steps of 128.
 *
 * NOTE(review): not called anywhere in the visible file.
 * NOTE(review): basic asm with no operands or clobbers; assumes dst is in
 * %rdi and len in %rdx on entry - confirm.  %xmm0 is never initialised
 * here, src is unused, and %rdi is not advanced inside the loop, so every
 * iteration writes the same 128 bytes.  movdqa requires a 16-byte aligned
 * address.
 */
void memset_2(char *dst, char *src, int len)
{
	/* Pre-bias the counter so the loop can exit on borrow. */
	__asm__("sub  $128, %rdx");
	__asm__("1:");
	/* The flags set here are consumed by the jae below; the stores leave them alone. */
	__asm__("sub  $128, %rdx");
	__asm__("movdqa %xmm0, (%rdi)");
	__asm__("movdqa %xmm0, 16(%rdi)");
	__asm__("movdqa %xmm0, 32(%rdi)");
	__asm__("movdqa %xmm0, 48(%rdi)");
	__asm__("movdqa %xmm0, 64(%rdi)");
	__asm__("movdqa %xmm0, 80(%rdi)");
	__asm__("movdqa %xmm0, 96(%rdi)");
	__asm__("movdqa %xmm0, 112(%rdi)");
	/* Loop again while the subtraction did not borrow. */
	__asm__("jae 1b");

}

/*
 * "rep movsq" copy variant (mirrors the kernel's memcpy_c sequence):
 * copies len / 8 quadwords with rep movsq, then the remaining len % 8
 * bytes with rep movsb.
 *
 * dst: destination, src: source, len: number of bytes to copy.
 * Assumes the direction flag is clear, as the x86-64 SysV ABI requires at
 * function boundaries.
 *
 * The instruction sequence is unchanged, but it is now a single extended
 * asm statement: operands are bound to the registers the instructions use
 * and every modified register is declared, so the code no longer depends
 * on the compiler happening to leave the arguments in %rdi/%rsi/%rdx.
 * The unused trailing "1:" label was dropped.
 */
void memcpy_c(char *dst, char *src, int len)
{
	__asm__ __volatile__(
		"movq %%rdi, %%rax\n\t"		/* rax = dst (kernel return-value convention) */
		"movl %%edx, %%ecx\n\t"
		"shrl $3, %%ecx\n\t"		/* rcx = len / 8 (32-bit write zero-extends) */
		"andl $7, %%edx\n\t"		/* edx = len % 8 */
		"rep movsq\n\t"
		"movl %%edx, %%ecx\n\t"		/* rcx = tail byte count */
		"rep movsb"
		: "+D" (dst), "+S" (src), "+d" (len)
		:
		: "rax", "rcx", "cc", "memory");
}
/*
 * "rep movsb" copy variant (mirrors the kernel's memcpy_c_e sequence):
 * copies len bytes from src to dst with a single rep movsb.
 *
 * dst: destination, src: source, len: number of bytes to copy.
 * Assumes the direction flag is clear, as the x86-64 SysV ABI requires at
 * function boundaries.
 *
 * Two fixes over the basic-asm original, with the instruction sequence
 * kept as it was:
 *  - the operands and clobbers are declared, so the code no longer relies
 *    on the arguments still being in %rdi/%rsi/%rdx;
 *  - len is a 32-bit int but the code reads all of %rdx; the count is now
 *    zero-extended to 64 bits first instead of trusting the upper half of
 *    the register, which the ABI leaves unspecified for an int argument.
 */
void memcpy_c_e(char *dst, char *src, int len)
{
	unsigned long count = (unsigned int) len;

	__asm__ __volatile__(
		"movq %%rdi, %%rax\n\t"		/* rax = dst (kernel return-value convention) */
		"movq %%rdx, %%rcx\n\t"		/* rcx = byte count */
		"rep movsb"
		: "+D" (dst), "+S" (src)
		: "d" (count)
		: "rax", "rcx", "cc", "memory");
}
/*
 * Unrolled integer-register copy: the "new" routine measured against the
 * rep movsq / rep movsb variants.
 *
 * Structure, as visible in the code below:
 *   - len >= 32: copy 32 bytes per iteration through r8-r11, either forward
 *     or backward (chosen by comparing the low bytes of src and dst), then
 *     fall into the tail code with the remaining 0..31 bytes;
 *   - tail: 16..31, 8..15 and 4..7 bytes are each copied as a head chunk
 *     plus a tail chunk addressed from the end (the two may overlap);
 *     1..3 bytes are copied bytewise; 0 bytes copies nothing.
 *
 * NOTE(review): these are basic asm statements with no operands or
 * clobbers.  They assume dst/src/len are still in %rdi/%rsi/%rdx when the
 * first statement runs (x86-64 SysV argument registers) and that the
 * compiler emits nothing that disturbs registers or flags between
 * statements - confirm for the compiler and flags in use.
 * NOTE(review): len is an int, but the first compare and the 32-byte
 * loops use the full 64-bit %rdx.
 * NOTE(review): the .L* labels are emitted at file scope in the assembly
 * output, so this body must appear only once in the translation unit.
 */
void memcpy_new(char *dst, char *src, int len)
{
	/* rax = dst; the C prototype returns void, so the caller does not use it. */
	__asm__("movq %rdi, %rax");

	/* Fewer than 32 bytes: go straight to the tail code. */
	__asm__("cmpq $0x20, %rdx");
	__asm__("jb .Lhandle_tail");

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 * (Signed compare of the low byte of src against the low byte of dst:
	 * copy backward when src's low byte is the smaller one.)
	 */
	__asm__("cmp  %dil, %sil");
	__asm__("jl .Lcopy_backward");
	/* Pre-bias the count by 32 so the loop can terminate on borrow. */
	__asm__("subq $0x20, %rdx");
__asm__(".Lcopy_forward_loop:");
	/* Sets the flags tested by the jae below; mov and lea leave them intact. */
	__asm__("subq $0x20,	%rdx");

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	__asm__("movq 0*8(%rsi),	%r8");
	__asm__("movq 1*8(%rsi),	%r9");
	__asm__("movq 2*8(%rsi),	%r10");
	__asm__("movq 3*8(%rsi),	%r11");
	__asm__("leaq 4*8(%rsi),	%rsi");

	__asm__("movq %r8,	0*8(%rdi)");
	__asm__("movq %r9,	1*8(%rdi)");
	__asm__("movq %r10,	2*8(%rdi)");
	__asm__("movq %r11,	3*8(%rdi)");
	__asm__("leaq 4*8(%rdi),	%rdi");
	__asm__("jae  .Lcopy_forward_loop");
	/* Undo the bias: edx = bytes still to copy (0..31). */
	__asm__("addl $0x20,	%edx");
	__asm__("jmp  .Lhandle_tail");

__asm__(".Lcopy_backward:");
	/*
	 * Calculate copy position to tail.
	 */
	__asm__("addq %rdx,	%rsi");
	__asm__("addq %rdx,	%rdi");
	/* Same pre-bias as in the forward path. */
	__asm__("subq $0x20,	%rdx");
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPS in the same 16bytes trunk.
	 */
	__asm__(".p2align 4");
__asm__(".Lcopy_backward_loop:");
	/* Sets the flags tested by the jae below; mov and lea leave them intact. */
	__asm__("subq $0x20,	%rdx");
	__asm__("movq -1*8(%rsi),	%r8");
	__asm__("movq -2*8(%rsi),	%r9");
	__asm__("movq -3*8(%rsi),	%r10");
	__asm__("movq -4*8(%rsi),	%r11");
	__asm__("leaq -4*8(%rsi),	%rsi");
	__asm__("movq %r8,		-1*8(%rdi)");
	__asm__("movq %r9,		-2*8(%rdi)");
	__asm__("movq %r10,		-3*8(%rdi)");
	__asm__("movq %r11,		-4*8(%rdi)");
	__asm__("leaq -4*8(%rdi),	%rdi");
	__asm__("jae  .Lcopy_backward_loop");

	/*
	 * Calculate copy position to head.
	 * (edx = remaining 0..31 bytes; step both pointers back by that
	 * amount so they point at the start of the uncopied head.)
	 */
	__asm__("addl $0x20,	%edx");
	__asm__("subq %rdx,	%rsi");
	__asm__("subq %rdx,	%rdi");
__asm__(".Lhandle_tail:");
	/* From here on edx holds the number of bytes left, below 32. */
	__asm__("cmpl $16,	%edx");
	__asm__("jb   .Lless_16bytes");

	/*
	 * Move data from 16 bytes to 31 bytes.
	 * (First 16 bytes plus last 16 bytes; all loads precede the stores.)
	 */
	__asm__("movq 0*8(%rsi), %r8");
	__asm__("movq 1*8(%rsi),	%r9");
	__asm__("movq -2*8(%rsi, %rdx),	%r10");
	__asm__("movq -1*8(%rsi, %rdx),	%r11");
	__asm__("movq %r8,	0*8(%rdi)");
	__asm__("movq %r9,	1*8(%rdi)");
	__asm__("movq %r10,	-2*8(%rdi, %rdx)");
	__asm__("movq %r11,	-1*8(%rdi, %rdx)");
	__asm__("jmp .Lend");
	__asm__(".p2align 4");
__asm__(".Lless_16bytes:");
	__asm__("cmpl $8,	%edx");
	__asm__("jb   .Lless_8bytes");
	/*
	 * Move data from 8 bytes to 15 bytes.
	 * (First 8 bytes plus last 8 bytes.)
	 */
	__asm__("movq 0*8(%rsi),	%r8");
	__asm__("movq -1*8(%rsi, %rdx),	%r9");
	__asm__("movq %r8,	0*8(%rdi)");
	__asm__("movq %r9,	-1*8(%rdi, %rdx)");
	__asm__("jmp .Lend");
	__asm__(".p2align 4");
__asm__(".Lless_8bytes:");
	__asm__("cmpl $4,	%edx");
	__asm__("jb   .Lless_3bytes");

	/*
	 * Move data from 4 bytes to 7 bytes.
	 * (First 4 bytes plus last 4 bytes.)
	 */
	__asm__("movl (%rsi), %ecx");
	__asm__("movl -4(%rsi, %rdx), %r8d");
	__asm__("movl %ecx, (%rdi)");
	__asm__("movl %r8d, -4(%rdi, %rdx)");
	__asm__("jmp .Lend");
	__asm__(".p2align 4");
__asm__(".Lless_3bytes:");
	/* edx = len - 1; a borrow means len was 0 and nothing is copied. */
	__asm__("subl $1, %edx");
	__asm__("jb .Lend");
	/*
	 * Move data from 1 bytes to 3 bytes.
	 * (The jz still tests the flags of the subl above: zero means exactly
	 * one byte.  Otherwise copy byte 1 and the last byte, then byte 0.)
	 */
	__asm__("movzbl (%rsi), %ecx");
	__asm__("jz .Lstore_1byte");
	__asm__("movzbq 1(%rsi), %r8");
	__asm__("movzbq (%rsi, %rdx), %r9");
	__asm__("movb %r8b, 1(%rdi)");
	__asm__("movb %r9b, (%rdi, %rdx)");
__asm__(".Lstore_1byte:");
	__asm__("movb %cl, (%rdi)");


__asm__(".Lend:");
}


/*
 * Benchmark driver.
 *
 * Sets up the buffers, prints the column header, then prints one result
 * row per (length, source alignment, destination alignment) combination:
 *   - lengths 0, 4, ..., 60 with both buffers unoffset;
 *   - lengths 0, 64, ..., 512 with source/destination offsets of 4, 8 and
 *     16 bytes applied to one buffer at a time, plus the unoffset case.
 *
 * Returns 0; main is declared int as the C standard requires for a
 * hosted program (the original used the non-standard "void main").
 */
int main(void)
{
  int i;

  test_init();
  /* Pad the header so the column names sit over the result columns. */
  printf ("%23s", "");
  printf ("\t%s\t%s\t%s\n", "memcpy_new", "memcpy_c", "memcpy_c_e");

  for (i = 0; i < 64; i += 4)
    do_test(0, 0, i);

  for (i = 0; i < 576; i += 64) {
    do_test(4, 0, i);
    do_test(0, 4, i);
    do_test(0, 0, i);
    do_test(0, 8, i);
    do_test(8, 0, i);
    do_test(0, 8*2, i);
    do_test(8*2, 0, i);
  }

  return 0;
}

Re: [PATCH RFC V2] [x86] Optimize small size memcpy by avoding long latency from decode stage

Reply via email to