Author: mjg
Date: Fri Oct  5 19:25:09 2018
New Revision: 339205
URL: https://svnweb.freebsd.org/changeset/base/339205

Log:
  amd64: make memset less slow with mov
  
  rep stos has a high startup time even on modern microarchitectures like
  Skylake. Intel optimization manuals discuss how, for small sizes, it is
  beneficial to use streaming stores instead. Since those cannot be used
  without an extra penalty in the kernel, I investigated the performance
  impact of ordinary movs.
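
  As an illustration only (this microbenchmark is mine, not part of the
  commit or its review; rep_stosb(), ITERS, and SIZE are hypothetical
  names chosen for the sketch), the startup cost can be observed from
  userspace along these lines:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <x86intrin.h>		/* __rdtsc() */

	#define	ITERS	(1 << 20)
	#define	SIZE	64	/* small size, where startup cost dominates */

	static unsigned char buf[4096];

	/* Inline rep stosb; the SysV ABI guarantees DF is clear. */
	static inline void
	rep_stosb(void *dst, int c, size_t len)
	{
		__asm__ volatile("rep stosb"
		    : "+D" (dst), "+c" (len)
		    : "a" (c)
		    : "memory");
	}

	int
	main(void)
	{
		unsigned char *volatile p = buf; /* defeat dead-store elision */
		uint64_t t0, t1, t2;
		size_t i;

		t0 = __rdtsc();
		for (i = 0; i < ITERS; i++)
			rep_stosb(p, 0, SIZE);
		t1 = __rdtsc();
		for (i = 0; i < ITERS; i++)
			memset(p, 0, SIZE);	/* libc baseline, no rep stos startup per call */
		t2 = __rdtsc();
		printf("rep stosb: %.1f cycles/call\n", (double)(t1 - t0) / ITERS);
		printf("memset:    %.1f cycles/call\n", (double)(t2 - t1) / ITERS);
		return (0);
	}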
  
  The patch below implements a very simple scheme: a 32-byte loop followed
  by filling in the remainder of at most 31 bytes. The breaking point is
  256 bytes, above which the routine falls back to rep stos. It provides a
  significant win over the current primitive on several machines I tested
  (both Intel and AMD). A 64-byte loop did not provide any benefit, even
  for sizes that are multiples of 64.
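
  A minimal C sketch of the scheme (assumptions: the function name is mine,
  and libc memset stands in for the kernel's rep stos fallback; the real
  implementation is the assembly below):

	#include <stdint.h>
	#include <string.h>

	static void *
	memset_c_sketch(void *dst, int c, size_t len)
	{
		unsigned char *p = dst;
		/* Broadcast the fill byte to all 8 byte lanes. */
		uint64_t v = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;

		if (len > 256)		/* breaking point */
			return (memset(dst, c, len)); /* rep stos in the real code */
		while (len >= 32) {	/* the 32-byte loop */
			memcpy(p, &v, 8);
			memcpy(p + 8, &v, 8);
			memcpy(p + 16, &v, 8);
			memcpy(p + 24, &v, 8);
			p += 32;
			len -= 32;
		}
		/* Remainder of at most 31 bytes: 16/8/4/2/1-byte stores. */
		if (len >= 16) {
			memcpy(p, &v, 8);
			memcpy(p + 8, &v, 8);
			p += 16; len -= 16;
		}
		if (len >= 8) {
			memcpy(p, &v, 8);
			p += 8; len -= 8;
		}
		if (len >= 4) {
			memcpy(p, &v, 4);
			p += 4; len -= 4;
		}
		if (len >= 2) {
			memcpy(p, &v, 2);
			p += 2; len -= 2;
		}
		if (len >= 1)
			*p = (unsigned char)c;
		return (dst);
	}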
  
  See the review for benchmark data.
  
  Reviewed by:  kib
  Approved by:  re (gjb)
  Sponsored by: The FreeBSD Foundation
  Differential Revision:        https://reviews.freebsd.org/D17398

Modified:
  head/sys/amd64/amd64/support.S

Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S	Fri Oct  5 18:15:44 2018	(r339204)
+++ head/sys/amd64/amd64/support.S	Fri Oct  5 19:25:09 2018	(r339205)
@@ -320,43 +320,92 @@ END(memcpy_erms)
  * memset(dst, c,   len)
  *        rdi, rsi, rdx
  */
-ENTRY(memset_std)
+.macro MEMSET erms
        PUSH_FRAME_POINTER
        movq    %rdi,%r9
        movq    %rdx,%rcx
        movzbq  %sil,%r8
        movabs  $0x0101010101010101,%rax
        imulq   %r8,%rax
-       cmpq    $15,%rcx
-       jbe     1f
-       shrq    $3,%rcx
-       rep
-       stosq
-       movq    %rdx,%rcx
-       andq    $7,%rcx
-       jne     1f
+
+       cmpq    $32,%rcx
+       jb      1016f
+
+       cmpq    $256,%rcx
+       ja      1256f
+
+1032:
+       movq    %rax,(%rdi)
+       movq    %rax,8(%rdi)
+       movq    %rax,16(%rdi)
+       movq    %rax,24(%rdi)
+       leaq    32(%rdi),%rdi
+       subq    $32,%rcx
+       cmpq    $32,%rcx
+       jae     1032b
+       cmpb    $0,%cl
+       je      1000f
+1016:
+       cmpb    $16,%cl
+       jl      1008f
+       movq    %rax,(%rdi)
+       movq    %rax,8(%rdi)
+       subb    $16,%cl
+       jz      1000f
+       leaq    16(%rdi),%rdi
+1008:
+       cmpb    $8,%cl
+       jl      1004f
+       movq    %rax,(%rdi)
+       subb    $8,%cl
+       jz      1000f
+       leaq    8(%rdi),%rdi
+1004:
+       cmpb    $4,%cl
+       jl      1002f
+       movl    %eax,(%rdi)
+       subb    $4,%cl
+       jz      1000f
+       leaq    4(%rdi),%rdi
+1002:
+       cmpb    $2,%cl
+       jl      1001f
+       movw    %ax,(%rdi)
+       subb    $2,%cl
+       jz      1000f
+       leaq    2(%rdi),%rdi
+1001:
+       cmpb    $1,%cl
+       jl      1000f
+       movb    %al,(%rdi)
+1000:
        movq    %r9,%rax
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
-1:
+1256:
+.if \erms == 1
        rep
        stosb
+.else
+       shrq    $3,%rcx
+       rep
+       stosq
+       movq    %rdx,%rcx
+       andb    $7,%cl
+       jne     1004b
+.endif
        movq    %r9,%rax
        POP_FRAME_POINTER
        ret
+.endm
+
+ENTRY(memset_std)
+       MEMSET erms=0
 END(memset_std)
 
 ENTRY(memset_erms)
-       PUSH_FRAME_POINTER
-       movq    %rdi,%r9
-       movq    %rdx,%rcx
-       movb    %sil,%al
-       rep
-       stosb
-       movq    %r9,%rax
-       POP_FRAME_POINTER
-       ret
+       MEMSET erms=1
 END(memset_erms)
 
 /* fillw(pat, base, cnt) */