Appended the test suite as the tar archive below; please run the ./test command. Thanks.
2014-04-07 22:50 GMT+08:00, ling.ma.prog...@gmail.com <ling.ma.prog...@gmail.com>:
> From: Ling Ma <ling...@alibaba-inc.com>
>
> In this patch we reduce branch mispredictions by avoiding branch
> instructions and by forcing the destination to be aligned for the
> general 64-bit stores.
> The comparison below shows a performance improvement of up to 1.8x.
> (We modified the test suite from Ondra; it is sent after this patch.)
>
> Bytes:  ORG_TIME:  NEW_TIME:  ORG vs NEW:
> 7       0.51       0.48       1.06
> 16      0.55       0.38       1.44
> 18      0.61       0.44       1.38
> 21      0.62       0.47       1.31
> 25      0.64       0.45       1.42
> 30      0.65       0.45       1.44
> 36      0.66       0.44       1.50
> 38      0.67       0.46       1.45
> 62      0.70       0.44       1.59
> 75      0.71       0.44       1.61
> 85      0.73       0.46       1.58
> 120     0.78       0.44       1.77
> 193     0.81       0.46       1.76
> 245     0.84       0.52       1.61
> 256     0.83       0.45       1.84
> 356     0.86       0.55       1.56
> 601     0.98       0.65       1.50
> 958     1.14       0.81       1.40
> 1024    1.19       0.86       1.38
> 2048    1.69       1.34       1.26
>
> Signed-off-by: Ling Ma <ling...@alibaba-inc.com>
> ---
>  arch/x86/include/asm/alternative-asm.h |   4 +-
>  arch/x86/lib/memset_64.S               | 172 +++++++++++++++++++++------------
>  2 files changed, 110 insertions(+), 66 deletions(-)
>
> diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
> index 372231c..aaac545 100644
> --- a/arch/x86/include/asm/alternative-asm.h
> +++ b/arch/x86/include/asm/alternative-asm.h
> @@ -22,8 +22,8 @@
>  	.long \orig - .
>  	.long \alt - .
>  	.word \feature
> -	.byte \orig_len
> -	.byte \alt_len
> +	.word \orig_len
> +	.word \alt_len
>  	.endm
>
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
> index 2dcb380..3eca27c 100644
> --- a/arch/x86/lib/memset_64.S
> +++ b/arch/x86/lib/memset_64.S
> @@ -59,77 +59,121 @@
>  ENTRY(memset)
>  ENTRY(__memset)
>  	CFI_STARTPROC
> -	movq %rdi,%r10
> -
> -	/* expand byte value */
>  	movzbl %sil,%ecx
> -	movabs $0x0101010101010101,%rax
> -	imulq %rcx,%rax
> -
> -	/* align dst */
> -	movl %edi,%r9d
> -	andl $7,%r9d
> -	jnz .Lbad_alignment
> -	CFI_REMEMBER_STATE
> -.Lafter_bad_alignment:
> -
> -	movq %rdx,%rcx
> -	shrq $6,%rcx
> -	jz .Lhandle_tail
> -
> +	mov $0x0101010101010101,%rsi
> +	imulq %rsi,%rcx
> +	movq %rdi,%rax
> +	lea (%rdi, %rdx), %r8
> +	cmp $128, %rdx
> +	ja .Lmore128bytes
> +	cmp $64, %edx
> +	jb .Lless_64bytes
> +	/*
> +	 * Move data from 65 bytes to 128 bytes.
> +	 */
> +	mov %rcx, 0x00(%rdi)
> +	mov %rcx, 0x08(%rdi)
> +	mov %rcx, 0x10(%rdi)
> +	mov %rcx, 0x18(%rdi)
> +	mov %rcx, 0x20(%rdi)
> +	mov %rcx, 0x28(%rdi)
> +	mov %rcx, 0x30(%rdi)
> +	mov %rcx, 0x38(%rdi)
> +	mov %rcx, -0x40(%r8)
> +	mov %rcx, -0x38(%r8)
> +	mov %rcx, -0x30(%r8)
> +	mov %rcx, -0x28(%r8)
> +	mov %rcx, -0x20(%r8)
> +	mov %rcx, -0x18(%r8)
> +	mov %rcx, -0x10(%r8)
> +	mov %rcx, -0x08(%r8)
> +	ret
>  	.p2align 4
> -.Lloop_64:
> -	decq %rcx
> -	movq %rax,(%rdi)
> -	movq %rax,8(%rdi)
> -	movq %rax,16(%rdi)
> -	movq %rax,24(%rdi)
> -	movq %rax,32(%rdi)
> -	movq %rax,40(%rdi)
> -	movq %rax,48(%rdi)
> -	movq %rax,56(%rdi)
> -	leaq 64(%rdi),%rdi
> -	jnz .Lloop_64
> -
> -	/* Handle tail in loops. The loops should be faster than hard
> -	   to predict jump tables. */
> +.Lless_64bytes:
> +	cmp $32, %edx
> +	jb .Lless_32bytes
> +	/*
> +	 * Move data from 33 bytes to 64 bytes.
> +	 */
> +	mov %rcx, 0x00(%rdi)
> +	mov %rcx, 0x08(%rdi)
> +	mov %rcx, 0x10(%rdi)
> +	mov %rcx, 0x18(%rdi)
> +	mov %rcx, -0x20(%r8)
> +	mov %rcx, -0x18(%r8)
> +	mov %rcx, -0x10(%r8)
> +	mov %rcx, -0x08(%r8)
> +	ret
>  	.p2align 4
> -.Lhandle_tail:
> -	movl %edx,%ecx
> -	andl $63&(~7),%ecx
> -	jz .Lhandle_7
> -	shrl $3,%ecx
> +.Lless_32bytes:
> +	cmp $16, %edx
> +	jb .Lless_16bytes
> +	mov %rcx, 0x00(%rdi)
> +	mov %rcx, 0x08(%rdi)
> +	mov %rcx, -0x10(%r8)
> +	mov %rcx, -0x08(%r8)
> +	ret
>  	.p2align 4
> -.Lloop_8:
> -	decl %ecx
> -	movq %rax,(%rdi)
> -	leaq 8(%rdi),%rdi
> -	jnz .Lloop_8
> -
> -.Lhandle_7:
> -	andl $7,%edx
> -	jz .Lende
> +.Lless_16bytes:
> +	cmp $8, %edx
> +	jb .Lless_8bytes
> +	mov %rcx, (%rdi)
> +	mov %rcx, -0x08(%r8)
> +	ret
>  	.p2align 4
> -.Lloop_1:
> -	decl %edx
> -	movb %al,(%rdi)
> -	leaq 1(%rdi),%rdi
> -	jnz .Lloop_1
> -
> -.Lende:
> -	movq %r10,%rax
> +.Lless_8bytes:
> +	cmp $4, %edx
> +	jb .Lless_4bytes
> +	mov %ecx, (%rdi)
> +	mov %ecx, -0x04(%r8)
> +	.p2align 4
> +.Lless_4bytes:
> +	cmp $2, %edx
> +	jb .Lless_2bytes
> +	mov %cx, (%rdi)
> +	mov %cx, -0x02(%r8)
> +	ret
> +	.p2align 4
> +.Lless_2bytes:
> +	cmp $1, %edx
> +	jb .Lless_1bytes
> +	mov %cl, (%rdi)
> +.Lless_1bytes:
>  	ret
>
> -	CFI_RESTORE_STATE
> -.Lbad_alignment:
> -	cmpq $7,%rdx
> -	jbe .Lhandle_7
> -	movq %rax,(%rdi)	/* unaligned store */
> -	movq $8,%r8
> -	subq %r9,%r8
> -	addq %r8,%rdi
> -	subq %r8,%rdx
> -	jmp .Lafter_bad_alignment
> +	.p2align 4
> +.Lmore128bytes:
> +	mov %rcx, (%rdi)
> +	mov %rdi, %r9
> +	and $-0x08, %rdi
> +	add $0x08, %rdi
> +	sub %rdi, %r9
> +	add %r9, %rdx
> +	sub $0x40, %rdx
> +.Lgobble_64_loop:
> +	mov %rcx, 0x00(%rdi)
> +	mov %rcx, 0x08(%rdi)
> +	mov %rcx, 0x10(%rdi)
> +	mov %rcx, 0x18(%rdi)
> +	mov %rcx, 0x20(%rdi)
> +	mov %rcx, 0x28(%rdi)
> +	mov %rcx, 0x30(%rdi)
> +	mov %rcx, 0x38(%rdi)
> +	lea 0x40(%rdi), %rdi
> +	sub $0x40, %rdx
> +	jae .Lgobble_64_loop
> +	/*
> +	 * Move data from 0 bytes to 63 bytes.
> +	 */
> +	mov %rcx, -0x40(%r8)
> +	mov %rcx, -0x38(%r8)
> +	mov %rcx, -0x30(%r8)
> +	mov %rcx, -0x28(%r8)
> +	mov %rcx, -0x20(%r8)
> +	mov %rcx, -0x18(%r8)
> +	mov %rcx, -0x10(%r8)
> +	mov %rcx, -0x08(%r8)
> +	ret
> .Lfinal:
>  	CFI_ENDPROC
>  ENDPROC(memset)
> --
> 1.8.1.4
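For readers following the control flow, below is a minimal C sketch of the
same idea; it is not from the patch itself, and the names (memset_sketch,
store8/store4/store2) are illustrative. The scheme is: expand the byte into
a 64-bit pattern with one multiply, then for each size class issue a fixed
group of stores from the front of the buffer and an overlapping group from
the back, so no store loop or per-byte tail branching is needed; only sizes
above 128 take an aligned 64-byte loop.

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	/* memcpy of a constant size compiles to a single store and
	 * avoids strict-aliasing problems. */
	static inline void store8(unsigned char *p, uint64_t v) { memcpy(p, &v, 8); }
	static inline void store4(unsigned char *p, uint32_t v) { memcpy(p, &v, 4); }
	static inline void store2(unsigned char *p, uint16_t v) { memcpy(p, &v, 2); }

	void *memset_sketch(void *s, int c, size_t n)
	{
		unsigned char *dst = s;
		unsigned char *end = dst + n;	/* plays the role of %r8 */
		/* the imulq pattern expansion */
		uint64_t v = (uint8_t)c * 0x0101010101010101ull;

		if (n > 128) {
			/* One unaligned store covers the head ... */
			store8(dst, v);
			/* ... then round dst up to the next 8-byte boundary
			 * (the and/add on %rdi in the patch). */
			unsigned char *p = (unsigned char *)(((uintptr_t)dst | 7) + 1);
			/* Aligned 64-byte blocks while at least 64 bytes remain. */
			while (end - p >= 64) {
				for (int i = 0; i < 8; i++)
					store8(p + 8 * i, v);
				p += 64;
			}
			/* Final 64-byte group ends exactly at end; it may
			 * overlap bytes the loop already wrote. */
			for (int i = 1; i <= 8; i++)
				store8(end - 8 * i, v);
		} else if (n >= 64) {	/* 64..128: 8 head + 8 tail stores */
			for (int i = 0; i < 8; i++)
				store8(dst + 8 * i, v);
			for (int i = 1; i <= 8; i++)
				store8(end - 8 * i, v);
		} else if (n >= 32) {	/* 32..63: 4 head + 4 tail stores */
			for (int i = 0; i < 4; i++)
				store8(dst + 8 * i, v);
			for (int i = 1; i <= 4; i++)
				store8(end - 8 * i, v);
		} else if (n >= 16) {
			store8(dst, v);
			store8(dst + 8, v);
			store8(end - 16, v);
			store8(end - 8, v);
		} else if (n >= 8) {
			store8(dst, v);
			store8(end - 8, v);
		} else if (n >= 4) {
			store4(dst, (uint32_t)v);
			store4(end - 4, (uint32_t)v);
		} else if (n >= 2) {
			store2(dst, (uint16_t)v);
			store2(end - 2, (uint16_t)v);
		} else if (n >= 1) {
			*dst = (unsigned char)c;
		}
		return s;
	}

Overlapping the head and tail store groups is what removes the old tail
loops: re-storing a few already-set bytes is cheaper than a mispredicted
branch, and each size class becomes a short straight-line sequence reached
by at most a handful of compares.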
memset_kernel.tar
Description: Unix tar archive