This is an automated email from the git hooks/post-receive script.

adconrad pushed a commit to branch glibc-2.26
in repository glibc.

commit d4f6d3805e95af3f6aaf47d16ed6eac7783391f4
Author: Adam Conrad <adcon...@0c3.net>
Date:   Wed Oct 11 14:08:40 2017 -0600

    debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp for
    AArch64, improving performance from 25% to 500% (LP: #1720832)
---
 debian/changelog                         |   2 +
 debian/patches/arm/git-arm64-memcmp.diff | 232 +++++++++++++++++++++++++++++++
 debian/patches/series                    |   1 +
 3 files changed, 235 insertions(+)

diff --git a/debian/changelog b/debian/changelog
index 1356817..8e797cb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -30,6 +30,8 @@ glibc (2.26-0experimental0) UNRELEASED; urgency=medium
       agree with the sorting we see in Debian, may need another look.
     - debian/patches/any/local-cudacc-float128.diff: Local patch to prevent
       defining __HAVE_FLOAT128 on NVIDIA's CUDA compilers (LP: #1717257)
+    - debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp
+      for AArch64, improving performance from 25% to 500% (LP: #1720832)
     - debian/control.in/libc: Drop ancient Breaks satisfied in oldoldstable.
     - debian/{debhelper.in/libc.preinst,sysdeps/amd64.mk,sysdeps/i386.mk}:
       Bump MIN_KERNEL_SUPPORTED to 3.2 on x86, following upstream's change.
diff --git a/debian/patches/arm/git-arm64-memcmp.diff b/debian/patches/arm/git-arm64-memcmp.diff
new file mode 100644
index 0000000..4b31caf
--- /dev/null
+++ b/debian/patches/arm/git-arm64-memcmp.diff
@@ -0,0 +1,232 @@
+commit 922369032c604b4dcfd535e1bcddd4687e7126a5
+Author: Wilco Dijkstra <wdijk...@arm.com>
+Date:   Thu Aug 10 17:00:38 2017 +0100
+
+    [AArch64] Optimized memcmp.
+    
+    This is an optimized memcmp for AArch64.  This is a complete rewrite
+    using a different algorithm.  The previous version split into cases
+    where both inputs were aligned, the inputs were mutually aligned and
+    unaligned using a byte loop.  The new version combines all these cases,
+    while small inputs of less than 8 bytes are handled separately.
+    
+    This allows the main code to be sped up using unaligned loads since
+    there are now at least 8 bytes to be compared.  After the first 8 bytes,
+    align the first input.  This ensures each iteration does at most one
+    unaligned access and mutually aligned inputs behave as aligned.
+    After the main loop, process the last 8 bytes using unaligned accesses.
+    
+    This improves performance of (mutually) aligned cases by 25% and
+    unaligned by >500% (yes >6 times faster) on large inputs.
+    
+            * sysdeps/aarch64/memcmp.S (memcmp):
+            Rewrite of optimized memcmp.
+
+diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
+index 4cfcb89297..b99c081bba 100644
+--- a/sysdeps/aarch64/memcmp.S
++++ b/sysdeps/aarch64/memcmp.S
+@@ -22,132 +22,98 @@
+ 
+ /* Assumptions:
+  *
+- * ARMv8-a, AArch64
++ * ARMv8-a, AArch64, unaligned accesses.
+  */
+ 
+ /* Parameters and result.  */
+ #define src1          x0
+ #define src2          x1
+ #define limit         x2
+-#define result                x0
++#define result                w0
+ 
+ /* Internal variables.  */
+ #define data1         x3
+ #define data1w                w3
+ #define data2         x4
+ #define data2w                w4
+-#define has_nul               x5
+-#define diff          x6
+-#define endloop               x7
+-#define tmp1          x8
+-#define tmp2          x9
+-#define tmp3          x10
+-#define pos           x11
+-#define limit_wd      x12
+-#define mask          x13
++#define tmp1          x5
+ 
+ ENTRY_ALIGN (memcmp, 6)
+       DELOUSE (0)
+       DELOUSE (1)
+       DELOUSE (2)
+-      cbz     limit, L(ret0)
+-      eor     tmp1, src1, src2
+-      tst     tmp1, #7
+-      b.ne    L(misaligned8)
+-      ands    tmp1, src1, #7
+-      b.ne    L(mutual_align)
+-      add     limit_wd, limit, #7
+-      lsr     limit_wd, limit_wd, #3
+-      /* Start of performance-critical section  -- one 64B cache line.  */
+-L(loop_aligned):
+-      ldr     data1, [src1], #8
+-      ldr     data2, [src2], #8
+-L(start_realigned):
+-      subs    limit_wd, limit_wd, #1
+-      eor     diff, data1, data2      /* Non-zero if differences found.  */
+-      csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
+-      cbz     endloop, L(loop_aligned)
+-      /* End of performance-critical section  -- one 64B cache line.  */
+-
+-      /* Not reached the limit, must have found a diff.  */
+-      cbnz    limit_wd, L(not_limit)
+-
+-      /* Limit % 8 == 0 => all bytes significant.  */
+-      ands    limit, limit, #7
+-      b.eq    L(not_limit)
+-
+-      lsl     limit, limit, #3        /* Bits -> bytes.  */
+-      mov     mask, #~0
+-#ifdef __AARCH64EB__
+-      lsr     mask, mask, limit
+-#else
+-      lsl     mask, mask, limit
+-#endif
+-      bic     data1, data1, mask
+-      bic     data2, data2, mask
+-
+-      orr     diff, diff, mask
+-L(not_limit):
+ 
+-#ifndef       __AARCH64EB__
+-      rev     diff, diff
++      subs    limit, limit, 8
++      b.lo    .Lless8
++
++      /* Limit >= 8, so check first 8 bytes using unaligned loads.  */
++      ldr     data1, [src1], 8
++      ldr     data2, [src2], 8
++      and     tmp1, src1, 7
++      add     limit, limit, tmp1
++      cmp     data1, data2
++      bne     .Lreturn
++
++      /* Align src1 and adjust src2 with bytes not yet done.  */
++      sub     src1, src1, tmp1
++      sub     src2, src2, tmp1
++
++      subs    limit, limit, 8
++      b.ls    .Llast_bytes
++
++      /* Loop performing 8 bytes per iteration using aligned src1.
++         Limit is pre-decremented by 8 and must be larger than zero.
++         Exit if <= 8 bytes left to do or if the data is not equal.  */
++      .p2align 4
++.Lloop8:
++      ldr     data1, [src1], 8
++      ldr     data2, [src2], 8
++      subs    limit, limit, 8
++      ccmp    data1, data2, 0, hi  /* NZCV = 0b0000.  */
++      b.eq    .Lloop8
++
++      cmp     data1, data2
++      bne     .Lreturn
++
++      /* Compare last 1-8 bytes using unaligned access.  */
++.Llast_bytes:
++      ldr     data1, [src1, limit]
++      ldr     data2, [src2, limit]
++
++      /* Compare data bytes and set return value to 0, -1 or 1.  */
++.Lreturn:
++#ifndef __AARCH64EB__
+       rev     data1, data1
+       rev     data2, data2
+ #endif
+-      /* The MS-non-zero bit of DIFF marks either the first bit
+-         that is different, or the end of the significant data.
+-         Shifting left now will bring the critical information into the
+-         top bits.  */
+-      clz     pos, diff
+-      lsl     data1, data1, pos
+-      lsl     data2, data2, pos
+-      /* But we need to zero-extend (char is unsigned) the value and then
+-         perform a signed 32-bit subtraction.  */
+-      lsr     data1, data1, #56
+-      sub     result, data1, data2, lsr #56
+-      RET
+-
+-L(mutual_align):
+-      /* Sources are mutually aligned, but are not currently at an
+-         alignment boundary.  Round down the addresses and then mask off
+-         the bytes that precede the start point.  */
+-      bic     src1, src1, #7
+-      bic     src2, src2, #7
+-      add     limit, limit, tmp1      /* Adjust the limit for the extra.  */
+-      lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
+-      ldr     data1, [src1], #8
+-      neg     tmp1, tmp1              /* Bits to alignment -64.  */
+-      ldr     data2, [src2], #8
+-      mov     tmp2, #~0
+-#ifdef __AARCH64EB__
+-      /* Big-endian.  Early bytes are at MSB.  */
+-      lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
+-#else
+-      /* Little-endian.  Early bytes are at LSB.  */
+-      lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
+-#endif
+-      add     limit_wd, limit, #7
+-      orr     data1, data1, tmp2
+-      orr     data2, data2, tmp2
+-      lsr     limit_wd, limit_wd, #3
+-      b       L(start_realigned)
+-
+-L(ret0):
+-      mov     result, #0
+-      RET
+-
+-      .p2align 6
+-L(misaligned8):
+-      sub     limit, limit, #1
+-1:
+-      /* Perhaps we can do better than this.  */
+-      ldrb    data1w, [src1], #1
+-      ldrb    data2w, [src2], #1
+-      subs    limit, limit, #1
+-      ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
+-      b.eq    1b
+-      sub     result, data1, data2
+-      RET
++      cmp     data1, data2
++.Lret_eq:
++      cset    result, ne
++      cneg    result, result, lo
++      ret
++
++      .p2align 4
++      /* Compare up to 8 bytes.  Limit is [-8..-1].  */
++.Lless8:
++      adds    limit, limit, 4
++      b.lo    .Lless4
++      ldr     data1w, [src1], 4
++      ldr     data2w, [src2], 4
++      cmp     data1w, data2w
++      b.ne    .Lreturn
++      sub     limit, limit, 4
++.Lless4:
++      adds    limit, limit, 4
++      beq     .Lret_eq
++.Lbyte_loop:
++      ldrb    data1w, [src1], 1
++      ldrb    data2w, [src2], 1
++      subs    limit, limit, 1
++      ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
++      b.eq    .Lbyte_loop
++      sub     result, data1w, data2w
++      ret
++
+ END (memcmp)
+ #undef bcmp
+ weak_alias (memcmp, bcmp)
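
[Aside, not part of the backported patch: the upstream commit message above describes the new strategy only in prose, so here is a rough C rendering of the same idea for readers who do not follow AArch64 assembly. The names memcmp_sketch, load64 and word_diff are invented for illustration; the sketch assumes a little-endian target (the !__AARCH64EB__ path, where the rev instructions are needed before comparing) and returns -1/0/1 like the cset/cneg sequence rather than a byte difference.]

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Unaligned 8-byte load; on AArch64 this compiles to a single ldr.  */
static uint64_t
load64 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

/* Order two differing words by their first differing byte: byte-swap so
   that byte becomes the most significant one, then compare unsigned.  */
static int
word_diff (uint64_t d1, uint64_t d2)
{
  d1 = __builtin_bswap64 (d1);
  d2 = __builtin_bswap64 (d2);
  return d1 < d2 ? -1 : 1;
}

int
memcmp_sketch (const void *a, const void *b, size_t n)
{
  const unsigned char *s1 = a, *s2 = b;

  /* Small inputs of less than 8 bytes are handled separately (.Lless8).  */
  if (n < 8)
    {
      for (size_t i = 0; i < n; i++)
        if (s1[i] != s2[i])
          return s1[i] - s2[i];
      return 0;
    }

  /* First 8 bytes, using unaligned loads.  */
  uint64_t d1 = load64 (s1), d2 = load64 (s2);
  if (d1 != d2)
    return word_diff (d1, d2);

  /* Align the first input for the main loop; the few bytes re-read after
     rewinding are already known to be equal.  */
  size_t rewind = (uintptr_t) (s1 + 8) & 7;
  const unsigned char *p1 = s1 + 8 - rewind;   /* Now 8-byte aligned.  */
  const unsigned char *p2 = s2 + 8 - rewind;
  size_t left = n - (8 - rewind);

  /* Main loop: at most one unaligned access (via p2) per iteration.  */
  while (left > 8)
    {
      d1 = load64 (p1);
      d2 = load64 (p2);
      if (d1 != d2)
        return word_diff (d1, d2);
      p1 += 8;
      p2 += 8;
      left -= 8;
    }

  /* Last 1-8 bytes: unaligned loads ending exactly at the end of the
     buffers; any overlap with earlier bytes is harmless, since those
     already compared equal (.Llast_bytes).  */
  d1 = load64 (s1 + n - 8);
  d2 = load64 (s2 + n - 8);
  return d1 == d2 ? 0 : word_diff (d1, d2);
}

The large unaligned win reported in the commit message comes from replacing the old byte-at-a-time fallback for non-mutually-aligned inputs with these 8-byte unaligned loads; the overlapping final load is what lets the new code avoid any tail byte loop once at least 8 bytes are available.]
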
diff --git a/debian/patches/series b/debian/patches/series
index 1548798..060efe9 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -43,6 +43,7 @@ arm/local-soname-hack.diff
 arm/local-vfp-sysdeps.diff
 arm/unsubmitted-ldso-multilib.diff
 arm/local-arm-futex.diff
+arm/git-arm64-memcmp.diff
 
 hppa/local-inlining.diff
 hppa/local-elf-make-cflags.diff

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-glibc/glibc.git
