Three patches herein. If there's a better way to submit patches, please advise; I've never used hg before.
The first patch gives gcc control over ctz/clz. Particularly for armv6t2 and later, which have rbit for use for ctz. The second patch improves multiplication a bit. I'm still playing with addmul_2, but this is a start for addmul_1/mul_1. I couldn't do better than the existing submul_1. Unfortunately the Xscale machines in the gcc build farm are turned off, so I can't test to see if I've regressed on that platform. The third patch tidies up add_n/sub_n, and provides for the carry-in entry points. It's a bit touchy speed testing these. There's no cycle counter available in userspace, and Hz is depressingly low. So I've had to bump the minimum iterations way way up in order to get semi-reliable results. Which causes the speed testing to take quite a long time. Feedback welcome. r~
# HG changeset patch # User Richard Henderson <r...@twiddle.net> # Date 1328087950 -39600 # Branch rth/arm # Node ID f36514a912491d979c763a010a4d7354e10aa033 # Parent 03f2f54ff061cc30d21877d99c98519fcb81d7af arm: Merge add_n and sub_n into aors_n. This gets us mpn_add/sub_nc as well. Minor code rearrangements to achieve this carry-in. Make sure to keep stack 8-byte aligned. diff -r 03f2f54ff061 -r f36514a91249 mpn/arm/add_n.asm --- a/mpn/arm/add_n.asm Wed Feb 01 20:16:06 2012 +1100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ -dnl ARM mpn_add_n -- Add two limb vectors of the same length > 0 and store sum -dnl in a third limb vector. -dnl Contributed by Robert Harley. - -dnl Copyright 1997, 2000, 2001 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. - -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. - -include(`../config.m4') - -C This code runs at 5 cycles/limb. 
- -define(`rp',`r0') -define(`up',`r1') -define(`vp',`r2') -define(`n',`r3') - - -ASM_START() -PROLOGUE(mpn_add_n) - stmfd sp!, { r8, r9, lr } - movs n, n, lsr #1 - bcc L(skip1) - ldr r12, [up], #4 - ldr lr, [vp], #4 - adds r12, r12, lr - str r12, [rp], #4 -L(skip1): - tst n, #1 - beq L(skip2) - ldmia up!, { r8, r9 } - ldmia vp!, { r12, lr } - adcs r8, r8, r12 - adcs r9, r9, lr - stmia rp!, { r8, r9 } -L(skip2): - bics n, n, #1 - beq L(return) - stmfd sp!, { r4, r5, r6, r7 } -L(add_n_loop): - ldmia up!, { r4, r5, r6, r7 } - ldmia vp!, { r8, r9, r12, lr } - adcs r4, r4, r8 - ldr r8, [rp, #12] C cache allocate - adcs r5, r5, r9 - adcs r6, r6, r12 - adcs r7, r7, lr - stmia rp!, { r4, r5, r6, r7 } - sub n, n, #2 - teq n, #0 - bne L(add_n_loop) - ldmfd sp!, { r4, r5, r6, r7 } -L(return): - adc r0, n, #0 - ldmfd sp!, { r8, r9, pc } -EPILOGUE(mpn_add_n) diff -r 03f2f54ff061 -r f36514a91249 mpn/arm/aors_n.asm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mpn/arm/aors_n.asm Wed Feb 01 20:19:10 2012 +1100 @@ -0,0 +1,102 @@ +dnl ARM mpn_add_n/mpn_sub_n -- mpn addition and subtraction. +dnl Contributed by Robert Harley. + +dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C This code runs at 5 cycles/limb. + +define(`rp',`r0') +define(`up',`r1') +define(`vp',`r2') +define(`n',`r3') + +ifdef(`OPERATION_add_n',` + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(ADDSUBC, adcs) + define(INITC, `cmn r0, #0') + define(SETC, `lsrs $1, $1, #1') + define(GENRET, `adc r0, n, n') +') + +ifdef(`OPERATION_sub_n',` + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(ADDSUBC, sbcs) + define(INITC, `cmp r0, r0') + define(SETC, `rsbs $1, $1, #0') + define(GENRET, `sbc r0, r0, r0; and r0, r0, #1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() + +PROLOGUE(func_nc) + ldr ip, [sp, #0] C load carry-in + SETC(ip) C move carry-in to carry flag + b L(ent) +EPILOGUE(func_nc) + +PROLOGUE(func) + INITC +L(ent): + push { r4, r5, ip, lr } + tst n, #1 C n % 2 == 1? + beq L(skip1) + + ldr r4, [up], #4 C handle one limb + ldr r5, [vp], #4 + ADDSUBC r4, r4, r5 + str r4, [rp], #4 + +L(skip1): + tst n, #2 C n % 4 == 2? + beq L(skip2) + + ldmia up!, { r4, r5 } C handle two limbs + ldmia vp!, { ip, lr } + ADDSUBC r4, r4, ip + ADDSUBC r5, r5, lr + stmia rp!, { r4, r5 } + +L(skip2): + bics n, n, #3 C n <= 3? + beq L(return) + + push { r6, r7, r8, r9 } +L(loop): + ldmia up!, { r4, r5, r6, r7 } + ldmia vp!, { r8, r9, ip, lr } + ADDSUBC r4, r4, r8 + ldr r8, [rp, #12] C cache allocate + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, ip + ADDSUBC r7, r7, lr + stmia rp!, { r4, r5, r6, r7 } + sub n, n, #4 C Dec and test == 0 without + tst n, n C clobbering carry flag. + bne L(loop) + + pop { r6, r7, r8, r9 } +L(return): + GENRET + pop { r4, r5, ip, pc } +EPILOGUE(func) diff -r 03f2f54ff061 -r f36514a91249 mpn/arm/sub_n.asm --- a/mpn/arm/sub_n.asm Wed Feb 01 20:16:06 2012 +1100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -dnl ARM mpn_sub_n -- Subtract two limb vectors of the same length > 0 and -dnl store difference in a third limb vector. -dnl Contributed by Robert Harley. 
- -dnl Copyright 1997, 2000, 2001 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. - -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. - -include(`../config.m4') - -C This code runs at 5 cycles/limb. - -define(`rp',`r0') -define(`up',`r1') -define(`vp',`r2') -define(`n',`r3') - - -ASM_START() -PROLOGUE(mpn_sub_n) - stmfd sp!, { r8, r9, lr } - subs r12, r12, r12 - tst n, #1 - beq L(skip1) - ldr r12, [up], #4 - ldr lr, [vp], #4 - subs r12, r12, lr - str r12, [rp], #4 -L(skip1): - tst n, #2 - beq L(skip2) - ldmia up!, { r8, r9 } - ldmia vp!, { r12, lr } - sbcs r8, r8, r12 - sbcs r9, r9, lr - stmia rp!, { r8, r9 } -L(skip2): - bics n, n, #3 - beq L(return) - stmfd sp!, { r4, r5, r6, r7 } -L(sub_n_loop): - ldmia up!, { r4, r5, r6, r7 } - ldmia vp!, { r8, r9, r12, lr } - sbcs r4, r4, r8 - ldr r8, [rp, #12] C cache allocate - sbcs r5, r5, r9 - sbcs r6, r6, r12 - sbcs r7, r7, lr - stmia rp!, { r4, r5, r6, r7 } - sub n, n, #4 - teq n, #0 - bne L(sub_n_loop) - ldmfd sp!, { r4, r5, r6, r7 } -L(return): - sbc r0, r0, r0 - and r0, r0, #1 - ldmfd sp!, { r8, r9, pc } -EPILOGUE(mpn_sub_n) # HG changeset patch # User Richard Henderson <r...@twiddle.net> # Date 1328087766 -39600 # Branch rth/arm # Node ID 03f2f54ff061cc30d21877d99c98519fcb81d7af # Parent d7791bf4dfaf7e6e7597941cbf556029544e90fb arm: Improve addmul_1 
and mul_1 for cortex-a8. Improves cycles/limb from 5.5 to 4.4 for cortex-a8. Uses umaal to add both the carry and (for addmul_1) the accumulation at once; no need to play games with the carry flag. But submul_1 cannot be improved this way. Merely record the current cycles/limb value for cortex-a8 in the file. diff -r d7791bf4dfaf -r 03f2f54ff061 mpn/arm/addmul_1.asm --- a/mpn/arm/addmul_1.asm Wed Feb 01 11:12:08 2012 +1100 +++ b/mpn/arm/addmul_1.asm Wed Feb 01 20:16:06 2012 +1100 @@ -1,7 +1,7 @@ dnl ARM mpn_addmul_1 -- Multiply a limb vector with a limb and add the result dnl to a second limb vector. -dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. +dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -21,87 +21,79 @@ include(`../config.m4') C cycles/limb -C StrongARM: 7.75-9.75 (dependent on vl value) -C XScale: 8-9 (dependent on vl value, estimated) +C StrongARM: ?? +C XScale: ?? +C Cortex-A8 4.4 define(`rp',`r0') define(`up',`r1') define(`n',`r2') define(`vl',`r3') -define(`rl',`r12') -define(`ul',`r6') -define(`r',`lr') - +define(`ul',`r4') ASM_START() PROLOGUE(mpn_addmul_1) - stmfd sp!, { r4-r6, lr } - mov r4, #0 C clear r4 - adds r0, r0, #0 C clear cy - tst n, #1 + push { r4, r5, r6, r7 } + + mov r7, #0 C initialize carry + tst n, #1 C n % 2 == 1? beq L(skip1) + + ldr ul, [up], #4 C handle one limb + ldr r5, [rp, #0] + umaal r5, r7, ul, vl + sub n, n, #1 + str r5, [rp], #4 + +L(skip1): + tst n, #2 C n % 4 == 2? 
+ beq L(skip2) + + ldr ul, [up], #4 C handle two limbs + ldr r5, [rp, #0] + umaal r5, r7, ul, vl ldr ul, [up], #4 - ldr rl, [rp, #0] - umull r5, r4, ul, vl - adds r, rl, r5 - str r, [rp], #4 -L(skip1): - tst n, #2 - beq L(skip2) - ldr ul, [up], #4 - ldr rl, [rp, #0] - mov r5, #0 - umlal r4, r5, ul, vl - ldr ul, [up], #4 - adcs r, rl, r4 - ldr rl, [rp, #4] - mov r4, #0 - umlal r5, r4, ul, vl - str r, [rp], #4 - adcs r, rl, r5 - str r, [rp], #4 + ldr r6, [rp, #4] + umaal r6, r7, ul, vl + sub n, n, #2 + stmia rp!, { r5, r6 } + L(skip2): - bics r, n, #3 + tst n, n C n was <= 3? beq L(return) - ldr ul, [up], #4 - ldr rl, [rp, #0] - mov r5, #0 - umlal r4, r5, ul, vl - b L(in) - + C Loop handling 4 limbs. Note that (at least on cortex-a8) the + C RdLo (first) input to umaal is not required until E2, while the + C RdHi (second) input is required at E1. Given that the carry-out + C (RdHi output) is not available until E5, we want the carry-in to + C use RdLo. This requires rotating which register holds the carry + C throught the loop. 
L(loop): ldr ul, [up], #4 - adcs r, rl, r5 - ldr rl, [rp, #4] - mov r5, #0 - umlal r4, r5, ul, vl - str r, [rp], #4 -L(in): ldr ul, [up], #4 - adcs r, rl, r4 - ldr rl, [rp, #4] - mov r4, #0 - umlal r5, r4, ul, vl - str r, [rp], #4 + ldr r5, [rp, #0] + umaal r5, r7, ul, vl + ldr ul, [up], #4 - adcs r, rl, r5 - ldr rl, [rp, #4] - mov r5, #0 - umlal r4, r5, ul, vl - str r, [rp], #4 + ldr r6, [rp, #4] + umaal r7, r6, ul, vl + str r5, [rp], #4 + ldr ul, [up], #4 - adcs r, rl, r4 - ldr rl, [rp, #4] - mov r4, #0 - umlal r5, r4, ul, vl - str r, [rp], #4 - sub n, n, #4 - bics r, n, #3 + ldr r5, [rp, #4] + umaal r6, r5, ul, vl + str r7, [rp], #4 + + ldr ul, [up], #4 + ldr r7, [rp, #4] + umaal r5, r7, ul, vl + str r6, [rp], #4 + + subs n, n, #4 + str r5, [rp], #4 bne L(loop) - adcs r, rl, r5 - str r, [rp], #4 L(return): - adc r0, r4, #0 - ldmfd sp!, { r4-r6, pc } + mov r0, r7 C return the carry + pop { r4, r5, r6, r7 } + bx lr EPILOGUE(mpn_addmul_1) diff -r d7791bf4dfaf -r 03f2f54ff061 mpn/arm/mul_1.asm --- a/mpn/arm/mul_1.asm Wed Feb 01 11:12:08 2012 +1100 +++ b/mpn/arm/mul_1.asm Wed Feb 01 20:16:06 2012 +1100 @@ -2,7 +2,7 @@ dnl in a second limb vector. dnl Contributed by Robert Harley. -dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. +dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -22,57 +22,79 @@ include(`../config.m4') C cycles/limb -C StrongARM: 6-8 (dependent on vl value) -C XScale: ?-? - -C We should rewrite this along the lines of addmul_1.asm. That should save a -C cycle on StrongARM, and several cycles on XScale. +C StrongARM: ?? +C XScale: ?? +C Cortex-a8: 4.3 define(`rp',`r0') define(`up',`r1') define(`n',`r2') define(`vl',`r3') - +define(`ul',`r4') ASM_START() PROLOGUE(mpn_mul_1) - stmfd sp!, { r8, r9, lr } - ands r12, n, #1 + stmfd sp!, { r4, r5, r6, r7 } + + mov r7, #0 C initializse carry + tst n, #1 C n % 2 == 1? 
beq L(skip1) - ldr lr, [up], #4 - umull r9, r12, lr, vl - str r9, [rp], #4 + + ldr ul, [up], #4 C handle one limb + umull r5, r7, ul, vl + sub n, n, #1 + str r5, [rp], #4 + L(skip1): tst n, #2 beq L(skip2) - mov r8, r12 - ldmia up!, { r12, lr } - mov r9, #0 - umlal r8, r9, r12, vl - mov r12, #0 - umlal r9, r12, lr, vl - stmia rp!, { r8, r9 } + + ldr ul, [up], #4 C handle two limbs + mov r5, #0 + umaal r5, r7, ul, vl + ldr ul, [up], #4 + mov r6, #0 + umaal r6, r7, ul, vl + sub n, n, #2 + stmia rp!, { r5, r6 } + L(skip2): - bics n, n, #3 + tst n, n beq L(return) - stmfd sp!, { r6, r7 } + + C Loop handling 4 limbs. Note that (at least on cortex-a8) the + C RdLo (first) input to umaal is not required until E2, while the + C RdHi (second) input is required at E1. Given that the carry-out + C (RdHi output) is not available until E5, we want the carry-in to + C use RdLo. This requires rotating which register holds the carry + C throught the loop. L(loop): - mov r6, r12 - ldmia up!, { r8, r9, r12, lr } - ldr r7, [rp, #12] C cache allocate + ldr ul, [up], #4 + mov r5, #0 + umaal r5, r7, ul, vl + ldr r6, [rp, #12] C cache allocate + + ldr ul, [up], #4 + mov r6, #0 + umaal r7, r6, ul, vl + str r5, [rp], #4 + + ldr ul, [up], #4 + mov r5, #0 + umaal r6, r5, ul, vl + str r7, [rp], #4 + + ldr ul, [up], #4 mov r7, #0 - umlal r6, r7, r8, vl - mov r8, #0 - umlal r7, r8, r9, vl - mov r9, #0 - umlal r8, r9, r12, vl - mov r12, #0 - umlal r9, r12, lr, vl + umaal r5, r7, ul, vl + str r6, [rp], #4 + subs n, n, #4 - stmia rp!, { r6, r7, r8, r9 } + str r5, [rp], #4 bne L(loop) - ldmfd sp!, { r6, r7 } + L(return): - mov r0, r12 - ldmfd sp!, { r8, r9, pc } + mov r0, r7 C return the carry + ldmfd sp!, { r4, r5, r6, r7 } + bx lr EPILOGUE(mpn_mul_1) diff -r d7791bf4dfaf -r 03f2f54ff061 mpn/arm/submul_1.asm --- a/mpn/arm/submul_1.asm Wed Feb 01 11:12:08 2012 +1100 +++ b/mpn/arm/submul_1.asm Wed Feb 01 20:16:06 2012 +1100 @@ -23,6 +23,7 @@ C cycles/limb C StrongARM: 7.75-9.75 (dependent on vl 
value) C XScale: 8-9 (dependent on vl value, estimated) +C Cortex-a8: 4.85 define(`rp',`r0') define(`up',`r1') # HG changeset patch # User Richard Henderson <r...@twiddle.net> # Date 1328055128 -39600 # Branch rth/arm # Node ID d7791bf4dfaf7e6e7597941cbf556029544e90fb # Parent d7351c2ba64cf47899167fa81999d68c9ddb99ee arm: Use builtins for count_leading/trailing_zeros. diff -r d7351c2ba64c -r d7791bf4dfaf longlong.h --- a/longlong.h Mon Jan 30 23:59:39 2012 +0100 +++ b/longlong.h Wed Feb 01 11:12:08 2012 +1100 @@ -513,12 +513,10 @@ #define UDIV_TIME 200 #endif /* LONGLONG_STANDALONE */ #endif -#if defined (__ARM_ARCH_5__) -/* This actually requires arm 5 */ -#define count_leading_zeros(count, x) \ - __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x)) +/* Let GCC decide how to implement these. */ +#define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count,x) +#define count_trailing_zeros(count, x) count_trailing_zeros_gcc_clz(count,x) #define COUNT_LEADING_ZEROS_0 32 -#endif #endif /* __arm__ */ #if defined (__clipper__) && W_TYPE_SIZE == 32
_______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org http://gmplib.org/mailman/listinfo/gmp-devel