This is an automated email from the git hooks/post-receive script.

adconrad pushed a commit to branch glibc-2.26
in repository glibc.
commit d4f6d3805e95af3f6aaf47d16ed6eac7783391f4
Author: Adam Conrad <adcon...@0c3.net>
Date:   Wed Oct 11 14:08:40 2017 -0600

    debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp
    for AArch64, improving performance from 25% to 500% (LP: #1720832)
---
 debian/changelog                         |   2 +
 debian/patches/arm/git-arm64-memcmp.diff | 232 +++++++++++++++++++++++++++++++
 debian/patches/series                    |   1 +
 3 files changed, 235 insertions(+)

diff --git a/debian/changelog b/debian/changelog
index 1356817..8e797cb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -30,6 +30,8 @@ glibc (2.26-0experimental0) UNRELEASED; urgency=medium
     agree with the sorting we see in Debian, may need another look.
   - debian/patches/any/local-cudacc-float128.diff: Local patch to prevent
     defining __HAVE_FLOAT128 on NVIDIA's CUDA compilers (LP: #1717257)
+  - debian/patches/arm/git-arm64-memcmp.diff: Backport optimized memcmp
+    for AArch64, improving performance from 25% to 500% (LP: #1720832)
   - debian/control.in/libc: Drop ancient Breaks satisfied in oldoldstable.
   - debian/{debhelper.in/libc.preinst,sysdeps/amd64.mk,sysdeps/i386.mk}:
     Bump MIN_KERNEL_SUPPORTED to 3.2 on x86, following upstream's change.
diff --git a/debian/patches/arm/git-arm64-memcmp.diff b/debian/patches/arm/git-arm64-memcmp.diff
new file mode 100644
index 0000000..4b31caf
--- /dev/null
+++ b/debian/patches/arm/git-arm64-memcmp.diff
@@ -0,0 +1,232 @@
+commit 922369032c604b4dcfd535e1bcddd4687e7126a5
+Author: Wilco Dijkstra <wdijk...@arm.com>
+Date:   Thu Aug 10 17:00:38 2017 +0100
+
+    [AArch64] Optimized memcmp.
+
+    This is an optimized memcmp for AArch64.  This is a complete rewrite
+    using a different algorithm.  The previous version split into cases
+    where both inputs were aligned, the inputs were mutually aligned and
+    unaligned using a byte loop.  The new version combines all these cases,
+    while small inputs of less than 8 bytes are handled separately.
+
+    This allows the main code to be sped up using unaligned loads since
+    there are now at least 8 bytes to be compared.  After the first 8 bytes,
+    align the first input.  This ensures each iteration does at most one
+    unaligned access and mutually aligned inputs behave as aligned.
+    After the main loop, process the last 8 bytes using unaligned accesses.
+
+    This improves performance of (mutually) aligned cases by 25% and
+    unaligned by >500% (yes >6 times faster) on large inputs.
+
+	* sysdeps/aarch64/memcmp.S (memcmp):
+	Rewrite of optimized memcmp.
+
+diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
+index 4cfcb89297..b99c081bba 100644
+--- a/sysdeps/aarch64/memcmp.S
++++ b/sysdeps/aarch64/memcmp.S
+@@ -22,132 +22,98 @@
+ 
+ /* Assumptions:
+  *
+- * ARMv8-a, AArch64
++ * ARMv8-a, AArch64, unaligned accesses.
+  */
+ 
+ /* Parameters and result.  */
+ #define src1	x0
+ #define src2	x1
+ #define limit	x2
+-#define result	x0
++#define result	w0
+ 
+ /* Internal variables.  */
+ #define data1	x3
+ #define data1w	w3
+ #define data2	x4
+ #define data2w	w4
+-#define has_nul	x5
+-#define diff	x6
+-#define endloop	x7
+-#define tmp1	x8
+-#define tmp2	x9
+-#define tmp3	x10
+-#define pos	x11
+-#define limit_wd	x12
+-#define mask	x13
++#define tmp1	x5
+ 
+ ENTRY_ALIGN (memcmp, 6)
+ 	DELOUSE (0)
+ 	DELOUSE (1)
+ 	DELOUSE (2)
+-	cbz	limit, L(ret0)
+-	eor	tmp1, src1, src2
+-	tst	tmp1, #7
+-	b.ne	L(misaligned8)
+-	ands	tmp1, src1, #7
+-	b.ne	L(mutual_align)
+-	add	limit_wd, limit, #7
+-	lsr	limit_wd, limit_wd, #3
+-	/* Start of performance-critical section  -- one 64B cache line.  */
+-L(loop_aligned):
+-	ldr	data1, [src1], #8
+-	ldr	data2, [src2], #8
+-L(start_realigned):
+-	subs	limit_wd, limit_wd, #1
+-	eor	diff, data1, data2	/* Non-zero if differences found.  */
+-	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
+-	cbz	endloop, L(loop_aligned)
+-	/* End of performance-critical section  -- one 64B cache line.  */
+-
+-	/* Not reached the limit, must have found a diff.  */
+-	cbnz	limit_wd, L(not_limit)
+-
+-	/* Limit % 8 == 0 => all bytes significant.  */
+-	ands	limit, limit, #7
+-	b.eq	L(not_limit)
+-
+-	lsl	limit, limit, #3	/* Bits -> bytes.  */
+-	mov	mask, #~0
+-#ifdef __AARCH64EB__
+-	lsr	mask, mask, limit
+-#else
+-	lsl	mask, mask, limit
+-#endif
+-	bic	data1, data1, mask
+-	bic	data2, data2, mask
+-
+-	orr	diff, diff, mask
+-L(not_limit):
+ 
+-#ifndef __AARCH64EB__
+-	rev	diff, diff
++	subs	limit, limit, 8
++	b.lo	.Lless8
++
++	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
++	ldr	data1, [src1], 8
++	ldr	data2, [src2], 8
++	and	tmp1, src1, 7
++	add	limit, limit, tmp1
++	cmp	data1, data2
++	bne	.Lreturn
++
++	/* Align src1 and adjust src2 with bytes not yet done.  */
++	sub	src1, src1, tmp1
++	sub	src2, src2, tmp1
++
++	subs	limit, limit, 8
++	b.ls	.Llast_bytes
++
++	/* Loop performing 8 bytes per iteration using aligned src1.
++	   Limit is pre-decremented by 8 and must be larger than zero.
++	   Exit if <= 8 bytes left to do or if the data is not equal.  */
++	.p2align 4
++.Lloop8:
++	ldr	data1, [src1], 8
++	ldr	data2, [src2], 8
++	subs	limit, limit, 8
++	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
++	b.eq	.Lloop8
++
++	cmp	data1, data2
++	bne	.Lreturn
++
++	/* Compare last 1-8 bytes using unaligned access.  */
++.Llast_bytes:
++	ldr	data1, [src1, limit]
++	ldr	data2, [src2, limit]
++
++	/* Compare data bytes and set return value to 0, -1 or 1.  */
++.Lreturn:
++#ifndef __AARCH64EB__
+ 	rev	data1, data1
+ 	rev	data2, data2
+ #endif
+-	/* The MS-non-zero bit of DIFF marks either the first bit
+-	   that is different, or the end of the significant data.
+-	   Shifting left now will bring the critical information into the
+-	   top bits.  */
+-	clz	pos, diff
+-	lsl	data1, data1, pos
+-	lsl	data2, data2, pos
+-	/* But we need to zero-extend (char is unsigned) the value and then
+-	   perform a signed 32-bit subtraction.  */
+-	lsr	data1, data1, #56
+-	sub	result, data1, data2, lsr #56
+-	RET
+-
+-L(mutual_align):
+-	/* Sources are mutually aligned, but are not currently at an
+-	   alignment boundary.  Round down the addresses and then mask off
+-	   the bytes that precede the start point.  */
+-	bic	src1, src1, #7
+-	bic	src2, src2, #7
+-	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
+-	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
+-	ldr	data1, [src1], #8
+-	neg	tmp1, tmp1	/* Bits to alignment -64.  */
+-	ldr	data2, [src2], #8
+-	mov	tmp2, #~0
+-#ifdef __AARCH64EB__
+-	/* Big-endian.  Early bytes are at MSB.  */
+-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+-#else
+-	/* Little-endian.  Early bytes are at LSB.  */
+-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+-#endif
+-	add	limit_wd, limit, #7
+-	orr	data1, data1, tmp2
+-	orr	data2, data2, tmp2
+-	lsr	limit_wd, limit_wd, #3
+-	b	L(start_realigned)
+-
+-L(ret0):
+-	mov	result, #0
+-	RET
+-
+-	.p2align 6
+-L(misaligned8):
+-	sub	limit, limit, #1
+-1:
+-	/* Perhaps we can do better than this.  */
+-	ldrb	data1w, [src1], #1
+-	ldrb	data2w, [src2], #1
+-	subs	limit, limit, #1
+-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+-	b.eq	1b
+-	sub	result, data1, data2
+-	RET
++	cmp	data1, data2
++.Lret_eq:
++	cset	result, ne
++	cneg	result, result, lo
++	ret
++
++	.p2align 4
++	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
++.Lless8:
++	adds	limit, limit, 4
++	b.lo	.Lless4
++	ldr	data1w, [src1], 4
++	ldr	data2w, [src2], 4
++	cmp	data1w, data2w
++	b.ne	.Lreturn
++	sub	limit, limit, 4
++.Lless4:
++	adds	limit, limit, 4
++	beq	.Lret_eq
++.Lbyte_loop:
++	ldrb	data1w, [src1], 1
++	ldrb	data2w, [src2], 1
++	subs	limit, limit, 1
++	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
++	b.eq	.Lbyte_loop
++	sub	result, data1w, data2w
++	ret
++
+ END (memcmp)
+ #undef bcmp
+ weak_alias (memcmp, bcmp)
diff --git a/debian/patches/series b/debian/patches/series
index 1548798..060efe9 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -43,6 +43,7 @@ arm/local-soname-hack.diff
 arm/local-vfp-sysdeps.diff
 arm/unsubmitted-ldso-multilib.diff
 arm/local-arm-futex.diff
+arm/git-arm64-memcmp.diff
 hppa/local-inlining.diff
 hppa/local-elf-make-cflags.diff
-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-glibc/glibc.git
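
[Editor's note: for readers who do not speak AArch64 assembly, here is a rough
C sketch of the strategy the upstream commit message describes.  It is
illustrative only, not the shipped code: memcmp_sketch, load64 and cmp64 are
invented names, memcpy and __builtin_bswap64 stand in for the unaligned ldr
and rev instructions, and the sub-8-byte case uses a plain byte loop where the
assembly compares 4-byte then 1-byte chunks (.Lless8/.Lless4).]

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Unaligned 64-bit load; memcpy typically compiles to a single
   unaligned load, like the ldr instructions in the patch.  */
static uint64_t load64 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, 8);
  return v;
}

/* Decide the result from the first differing byte: byte-reversing on
   little-endian makes a plain numeric compare lexicographic, which is
   what the rev instructions under .Lreturn achieve.  */
static int cmp64 (uint64_t a, uint64_t b)
{
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
  return (a > b) - (a < b);
}

int memcmp_sketch (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;
  uint64_t d1, d2;

  /* Small inputs of less than 8 bytes are handled separately.  */
  if (n < 8)
    {
      for (size_t i = 0; i < n; i++)
        if (p1[i] != p2[i])
          return p1[i] < p2[i] ? -1 : 1;
      return 0;
    }

  /* Check the first 8 bytes with unaligned loads.  */
  d1 = load64 (p1);
  d2 = load64 (p2);
  if (d1 != d2)
    return cmp64 (d1, d2);

  /* Step past them, then round p1 back down to an 8-byte boundary,
     moving p2 and the remaining count back by the same amount; the
     overlapping bytes are already known equal and are simply compared
     again.  */
  size_t back = (uintptr_t) (p1 + 8) & 7;
  p1 += 8 - back;
  p2 += 8 - back;
  n -= 8 - back;

  /* Main loop: p1 is aligned, so each iteration performs at most one
     unaligned access (the p2 load), and mutually aligned inputs
     behave as aligned.  */
  while (n > 8)
    {
      d1 = load64 (p1);
      d2 = load64 (p2);
      if (d1 != d2)
        return cmp64 (d1, d2);
      p1 += 8;
      p2 += 8;
      n -= 8;
    }

  /* Last 1-8 bytes: one unaligned load ending exactly at the limit,
     as .Llast_bytes does with ldr data1, [src1, limit].  */
  d1 = load64 (p1 + n - 8);
  d2 = load64 (p2 + n - 8);
  return d1 == d2 ? 0 : cmp64 (d1, d2);
}

The tail handling is the same trick the assembly uses: instead of a byte loop,
reload the final 8 bytes at an unaligned offset so that up to 7
already-compared bytes are rechecked, which is cheap and branch-free.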