[resending as text/plain] Hi
These patches optimise 64-bit division by removing the use of the __gnu_[u]ldivmod_helper functions, and hence avoiding the redundant calculation of the remainder in those functions.

Bootstrapped, tested and checked for arm-unknown-linux-gnueabihf.

Benchmarked on Chromebook and Raspberry Pi using the attached divbench3.c. Loop1 varies the divisor and loop2 varies the dividend. Times are in seconds.

Chromebook:
before:
loop1 unsigned: 3.474419
loop2 unsigned: 6.564871
loop1 signed: 4.127967
loop2 signed: 6.071490
after:
loop1 unsigned: 2.781364
loop2 unsigned: 6.166478
loop1 signed: 2.800974
loop2 signed: 6.129588

Raspberry Pi:
before:
loop1 unsigned: 28.881753
loop2 unsigned: 19.876385
loop1 signed: 32.074941
loop2 signed: 20.594860
after:
loop1 unsigned: 24.893846
loop2 unsigned: 19.537562
loop1 signed: 25.334509
loop2 signed: 19.615088

Any comments? OK for stage 1?

Patch 1:
2014-02-27 Charles Baylis <charles.bay...@linaro.org>
* config/arm/bpabi.S (__aeabi_uldivmod): Perform division using call to __udivmoddi4.

Patch 2:
2014-02-27 Charles Baylis <charles.bay...@linaro.org>
* config/arm/bpabi.S (__aeabi_ldivmod): Perform signed division via call to __udivmoddi4, fixing up the result for negative operands.
From 35254b813303e7fb40eb8aa0bb749216fd8f96fc Mon Sep 17 00:00:00 2001 From: Charles Baylis <charles.bay...@linaro.org> Date: Tue, 25 Feb 2014 18:34:38 +0000 Subject: [PATCH 1/2] Optimise __aeabi_uldivmod 2014-02-25 Charles Baylis <charles.bay...@linaro.org> * config/arm/bpabi.S (__aeabi_uldivmod): Perform division using call to __udivmoddi4. * config/arm/bpabi.S (__aeabi_uldivmod): Optimise stack pointer manipulation. --- libgcc/config/arm/bpabi.S | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S index 7772301..e020af5 100644 --- a/libgcc/config/arm/bpabi.S +++ b/libgcc/config/arm/bpabi.S @@ -120,6 +120,16 @@ ARM_FUNC_START aeabi_ulcmp #endif .endm +/* we can use STRD/LDRD on v5TE and later, and any Thumb-2 architecture. */ +#if (defined(__ARM_EABI__) \ + && (defined(__thumb2__) \ + || (__ARM_ARCH >= 5 && defined(__TARGET_FEATURE_DSP)))) +#define CAN_USE_LDRD 1 +#else +#define CAN_USE_LDRD 0 +#endif + + #ifdef L_aeabi_ldivmod ARM_FUNC_START aeabi_ldivmod @@ -149,18 +159,23 @@ ARM_FUNC_START aeabi_uldivmod cfi_start __aeabi_uldivmod, LSYM(Lend_aeabi_uldivmod) test_div_by_zero unsigned - sub sp, sp, #8 -#if defined(__thumb2__) - mov ip, sp - push {ip, lr} +#if defined(__thumb2__) && CAN_USE_LDRD + sub ip, sp, #8 + strd ip,lr, [sp, #-16]! #else + sub sp, sp, #8 do_push {sp, lr} #endif 98: cfi_push 98b - __aeabi_uldivmod, 0xe, -0xc, 0x10 - bl SYM(__gnu_uldivmod_helper) __PLT__ + bl SYM(__udivmoddi4) __PLT__ ldr lr, [sp, #4] +#if CAN_USE_LDRD + ldrd r2, r3, [sp, #8] + add sp, sp, #16 +#else add sp, sp, #8 do_pop {r2, r3} +#endif RET cfi_end LSYM(Lend_aeabi_uldivmod) -- 1.8.3.2
From 975d9c624e77ee00476e6866250b0e2e31461fca Mon Sep 17 00:00:00 2001 From: Charles Baylis <charles.bay...@linaro.org> Date: Tue, 25 Feb 2014 16:27:59 +0000 Subject: [PATCH 2/2] Optimise __aeabi_ldivmod 2014-02-25 Charles Baylis <charles.bay...@linaro.org> * config/arm/bpabi.S (__aeabi_ldivmod): Perform signed division using unsigned division via call to __udivmoddi4 and additional logic. --- libgcc/config/arm/bpabi.S | 74 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S index e020af5..8b75a28 100644 --- a/libgcc/config/arm/bpabi.S +++ b/libgcc/config/arm/bpabi.S @@ -136,20 +136,84 @@ ARM_FUNC_START aeabi_ldivmod cfi_start __aeabi_ldivmod, LSYM(Lend_aeabi_ldivmod) test_div_by_zero signed - sub sp, sp, #8 -#if defined(__thumb2__) - mov ip, sp - push {ip, lr} +#if defined(__thumb2__) && CAN_USE_LDRD + sub ip, sp, #8 + strd ip,lr, [sp, #-16]! #else + sub sp, sp, #8 do_push {sp, lr} #endif + cmp xxh, #0 + blt 1f + cmp yyh, #0 + blt 2f + +98: cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10 + bl SYM(__udivmoddi4) __PLT__ + ldr lr, [sp, #4] +#if CAN_USE_LDRD + ldrd r2, r3, [sp, #8] + add sp, sp, #16 +#else + add sp, sp, #8 + do_pop {r2, r3} +#endif + RET +1: /* xxh:xxl is negative */ + rsbs xxl, xxl, #0 + sbc xxh, xxh, xxh, lsl #1 + cmp yyh, #0 + blt 3f +98: cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10 + bl SYM(__udivmoddi4) __PLT__ + ldr lr, [sp, #4] +#if CAN_USE_LDRD + ldrd r2, r3, [sp, #8] + add sp, sp, #16 +#else + add sp, sp, #8 + do_pop {r2, r3} +#endif + rsbs xxl, xxl, #0 + sbc xxh, xxh, xxh, lsl #1 + rsbs yyl, yyl, #0 + sbc yyh, yyh, yyh, lsl #1 + RET + +2: /* only yyh:yyl is negative */ + rsbs yyl, yyl, #0 + sbc yyh, yyh, yyh, lsl #1 98: cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10 - bl SYM(__gnu_ldivmod_helper) __PLT__ + bl SYM(__udivmoddi4) __PLT__ ldr lr, [sp, #4] +#if CAN_USE_LDRD + ldrd r2, r3, [sp, #8] + add sp, sp, #16 +#else add sp, 
sp, #8 do_pop {r2, r3} +#endif + rsbs xxl, xxl, #0 + sbc xxh, xxh, xxh, lsl #1 RET + +3: /* both xxh:xxl and yyh:yyl are negative */ + rsbs yyl, yyl, #0 + sbc yyh, yyh, yyh, lsl #1 cfi_end LSYM(Lend_aeabi_ldivmod) +98: cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10 + bl SYM(__udivmoddi4) __PLT__ + ldr lr, [sp, #4] +#if CAN_USE_LDRD + ldrd r2, r3, [sp, #8] + add sp, sp, #16 +#else + add sp, sp, #8 + do_pop {r2, r3} +#endif + rsbs yyl, yyl, #0 + sbc yyh, yyh, yyh, lsl #1 + RET #endif /* L_aeabi_ldivmod */ -- 1.8.3.2
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>

/* Convert a struct timeval (seconds + microseconds) to seconds as a double. */
double tv_to_s(struct timeval tv)
{
    const double usec_part = ((double)tv.tv_usec) / 1.0e6;

    return tv.tv_sec + usec_part;
}

/* Stride and upper bound of the operand sweep, plus the two start values. */
#define STEP (0x7fffffffffff0000/100000000)
#define END (0x7fffffffffff0001-STEP)
#define START1 (37ll)
#define START2 (3ll)

/*
 * The division helpers under test, declared by hand so they can be called
 * directly.  Only the 64-bit value returned through these prototypes is used.
 */
uint64_t __aeabi_uldivmod(uint64_t, uint64_t);
int64_t __aeabi_ldivmod(int64_t, int64_t);

/* Wall-clock seconds elapsed between two gettimeofday() samples. */
static double seconds_between(struct timeval from, struct timeval to)
{
    return tv_to_s(to) - tv_to_s(from);
}

/*
 * Time four sweeps of the 64-bit division helpers and print the elapsed
 * wall-clock time of each:
 *   loop1 - first operand fixed at END, second operand swept from START2;
 *   loop2 - first operand swept from START1, second operand fixed at 373459;
 * each run once for the unsigned helper and once for the signed helper.
 */
int main(int argc, char **argv)
{
    struct timeval t_begin, t_finish;
    double unsigned_loop1, unsigned_loop2, signed_loop1, signed_loop2;
    /* Results are stored to volatile sinks so they count as used. */
    volatile uint64_t usink;
    volatile int64_t ssink;
    uint64_t u;
    int64_t s;

    (void)argc;
    (void)argv;

    /* Unsigned, second operand varies. */
    gettimeofday (&t_begin, NULL);
    u = START2;
    while (u < END) {
        usink = __aeabi_uldivmod(END, u);
        u += STEP;
    }
    gettimeofday (&t_finish, NULL);
    unsigned_loop1 = seconds_between(t_begin, t_finish);

    /* Unsigned, first operand varies (coarser stride). */
    gettimeofday (&t_begin, NULL);
    u = START1;
    while (u < END) {
        usink = __aeabi_uldivmod(u, 373459);
        u += STEP * 5;
    }
    gettimeofday (&t_finish, NULL);
    unsigned_loop2 = seconds_between(t_begin, t_finish);

    /* Signed, second operand varies. */
    gettimeofday (&t_begin, NULL);
    s = START2;
    while (s < END) {
        ssink = __aeabi_ldivmod(END, s);
        s += STEP;
    }
    gettimeofday (&t_finish, NULL);
    signed_loop1 = seconds_between(t_begin, t_finish);

    /* Signed, first operand varies (coarser stride). */
    gettimeofday (&t_begin, NULL);
    s = START1;
    while (s < END) {
        ssink = __aeabi_ldivmod(s, 373459);
        s += STEP * 5;
    }
    gettimeofday (&t_finish, NULL);
    signed_loop2 = seconds_between(t_begin, t_finish);

    printf ("loop1 unsigned: %12f\n"
            "loop2 unsigned: %12f\n"
            "loop1 signed: %12f\n"
            "loop2 signed: %12f\n",
            unsigned_loop1, unsigned_loop2, signed_loop1, signed_loop2);

    return 0;
}