Three patches herein. If there's a better way to submit patches, please advise; I've never used hg before.
The first patch gives gcc control over ctz/clz. Particularly for armv6t2 and later, which have rbit for use for ctz. The second patch improves multiplication a bit. I'm still playing with addmul_2, but this is a start for addmul_1/mul_1. I couldn't do better than the existing submul_1. Unfortunately the Xscale machines in the gcc build farm are turned off, so I can't test to see if I've regressed on that platform. The third patch tidies up add_n/sub_n, and provides for the carry-in entry points. It's a bit touchy speed testing these. There's no cycle counter available in userspace, and Hz is depressingly low. So I've had to bump the minimum iterations way way up in order to get semi-reliable results. Which causes the speed testing to take quite a long time. Feedback welcome. r~
# HG changeset patch # User Richard Henderson <r...@twiddle.net> # Date 1328087950 -39600 # Branch rth/arm # Node ID f36514a912491d979c763a010a4d7354e10aa033 # Parent 03f2f54ff061cc30d21877d99c98519fcb81d7af arm: Merge add_n and sub_n into aors_n. This gets us mpn_add/sub_nc as well. Minor code rearrangements to achieve this carry-in. Make sure to keep stack 8-byte aligned. diff -r 03f2f54ff061 -r f36514a91249 mpn/arm/add_n.asm --- a/mpn/arm/add_n.asm Wed Feb 01 20:16:06 2012 +1100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ -dnl ARM mpn_add_n -- Add two limb vectors of the same length > 0 and store sum -dnl in a third limb vector. -dnl Contributed by Robert Harley. - -dnl Copyright 1997, 2000, 2001 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. - -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. - -include(`../config.m4') - -C This code runs at 5 cycles/limb. 
- -define(`rp',`r0') -define(`up',`r1') -define(`vp',`r2') -define(`n',`r3') - - -ASM_START() -PROLOGUE(mpn_add_n) - stmfd sp!, { r8, r9, lr } - movs n, n, lsr #1 - bcc L(skip1) - ldr r12, [up], #4 - ldr lr, [vp], #4 - adds r12, r12, lr - str r12, [rp], #4 -L(skip1): - tst n, #1 - beq L(skip2) - ldmia up!, { r8, r9 } - ldmia vp!, { r12, lr } - adcs r8, r8, r12 - adcs r9, r9, lr - stmia rp!, { r8, r9 } -L(skip2): - bics n, n, #1 - beq L(return) - stmfd sp!, { r4, r5, r6, r7 } -L(add_n_loop): - ldmia up!, { r4, r5, r6, r7 } - ldmia vp!, { r8, r9, r12, lr } - adcs r4, r4, r8 - ldr r8, [rp, #12] C cache allocate - adcs r5, r5, r9 - adcs r6, r6, r12 - adcs r7, r7, lr - stmia rp!, { r4, r5, r6, r7 } - sub n, n, #2 - teq n, #0 - bne L(add_n_loop) - ldmfd sp!, { r4, r5, r6, r7 } -L(return): - adc r0, n, #0 - ldmfd sp!, { r8, r9, pc } -EPILOGUE(mpn_add_n) diff -r 03f2f54ff061 -r f36514a91249 mpn/arm/aors_n.asm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mpn/arm/aors_n.asm Wed Feb 01 20:19:10 2012 +1100 @@ -0,0 +1,102 @@ +dnl ARM mpn_add_n/mpn_sub_n -- mpn addition and subtraction. +dnl Contributed by Robert Harley. + +dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C This code runs at 5 cycles/limb. + +define(`rp',`r0') +define(`up',`r1') +define(`vp',`r2') +define(`n',`r3') + +ifdef(`OPERATION_add_n',` + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(ADDSUBC, adcs) + define(INITC, `cmn r0, #0') + define(SETC, `lsrs $1, $1, #1') + define(GENRET, `adc r0, n, n') +') + +ifdef(`OPERATION_sub_n',` + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(ADDSUBC, sbcs) + define(INITC, `cmp r0, r0') + define(SETC, `rsbs $1, $1, #0') + define(GENRET, `sbc r0, r0, r0; and r0, r0, #1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() + +PROLOGUE(func_nc) + ldr ip, [sp, #0] C load carry-in + SETC(ip) C move carry-in to carry flag + b L(ent) +EPILOGUE(func_nc) + +PROLOGUE(func) + INITC +L(ent): + push { r4, r5, ip, lr } + tst n, #1 C n % 2 == 1? + beq L(skip1) + + ldr r4, [up], #4 C handle one limb + ldr r5, [vp], #4 + ADDSUBC r4, r4, r5 + str r4, [rp], #4 + +L(skip1): + tst n, #2 C n % 4 == 2? + beq L(skip2) + + ldmia up!, { r4, r5 } C handle two limbs + ldmia vp!, { ip, lr } + ADDSUBC r4, r4, ip + ADDSUBC r5, r5, lr + stmia rp!, { r4, r5 } + +L(skip2): + bics n, n, #3 C n <= 3? + beq L(return) + + push { r6, r7, r8, r9 } +L(loop): + ldmia up!, { r4, r5, r6, r7 } + ldmia vp!, { r8, r9, ip, lr } + ADDSUBC r4, r4, r8 + ldr r8, [rp, #12] C cache allocate + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, ip + ADDSUBC r7, r7, lr + stmia rp!, { r4, r5, r6, r7 } + sub n, n, #4 C Dec and test == 0 without + tst n, n C clobbering carry flag. + bne L(loop) + + pop { r6, r7, r8, r9 } +L(return): + GENRET + pop { r4, r5, ip, pc } +EPILOGUE(func) diff -r 03f2f54ff061 -r f36514a91249 mpn/arm/sub_n.asm --- a/mpn/arm/sub_n.asm Wed Feb 01 20:16:06 2012 +1100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -dnl ARM mpn_sub_n -- Subtract two limb vectors of the same length > 0 and -dnl store difference in a third limb vector. -dnl Contributed by Robert Harley. 
- -dnl Copyright 1997, 2000, 2001 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. - -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published -dnl by the Free Software Foundation; either version 3 of the License, or (at -dnl your option) any later version. - -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. - -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. - -include(`../config.m4') - -C This code runs at 5 cycles/limb. - -define(`rp',`r0') -define(`up',`r1') -define(`vp',`r2') -define(`n',`r3') - - -ASM_START() -PROLOGUE(mpn_sub_n) - stmfd sp!, { r8, r9, lr } - subs r12, r12, r12 - tst n, #1 - beq L(skip1) - ldr r12, [up], #4 - ldr lr, [vp], #4 - subs r12, r12, lr - str r12, [rp], #4 -L(skip1): - tst n, #2 - beq L(skip2) - ldmia up!, { r8, r9 } - ldmia vp!, { r12, lr } - sbcs r8, r8, r12 - sbcs r9, r9, lr - stmia rp!, { r8, r9 } -L(skip2): - bics n, n, #3 - beq L(return) - stmfd sp!, { r4, r5, r6, r7 } -L(sub_n_loop): - ldmia up!, { r4, r5, r6, r7 } - ldmia vp!, { r8, r9, r12, lr } - sbcs r4, r4, r8 - ldr r8, [rp, #12] C cache allocate - sbcs r5, r5, r9 - sbcs r6, r6, r12 - sbcs r7, r7, lr - stmia rp!, { r4, r5, r6, r7 } - sub n, n, #4 - teq n, #0 - bne L(sub_n_loop) - ldmfd sp!, { r4, r5, r6, r7 } -L(return): - sbc r0, r0, r0 - and r0, r0, #1 - ldmfd sp!, { r8, r9, pc } -EPILOGUE(mpn_sub_n) # HG changeset patch # User Richard Henderson <r...@twiddle.net> # Date 1328087766 -39600 # Branch rth/arm # Node ID 03f2f54ff061cc30d21877d99c98519fcb81d7af # Parent d7791bf4dfaf7e6e7597941cbf556029544e90fb arm: Improve addmul_1 
and mul_1 for cortex-a8. Improves cycles/limb from 5.5 to 4.4 for cortex-a8. Uses umaal to add both the carry and (for addmul_1) the accumulation at once; no need to play games with the carry flag. But submul_1 cannot be improved this way. Merely record the current cycles/limb value for cortex-a8 in the file. diff -r d7791bf4dfaf -r 03f2f54ff061 mpn/arm/addmul_1.asm --- a/mpn/arm/addmul_1.asm Wed Feb 01 11:12:08 2012 +1100 +++ b/mpn/arm/addmul_1.asm Wed Feb 01 20:16:06 2012 +1100 @@ -1,7 +1,7 @@ dnl ARM mpn_addmul_1 -- Multiply a limb vector with a limb and add the result dnl to a second limb vector. -dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. +dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -21,87 +21,79 @@ include(`../config.m4') C cycles/limb -C StrongARM: 7.75-9.75 (dependent on vl value) -C XScale: 8-9 (dependent on vl value, estimated) +C StrongARM: ?? +C XScale: ?? +C Cortex-A8 4.4 define(`rp',`r0') define(`up',`r1') define(`n',`r2') define(`vl',`r3') -define(`rl',`r12') -define(`ul',`r6') -define(`r',`lr') - +define(`ul',`r4') ASM_START() PROLOGUE(mpn_addmul_1) - stmfd sp!, { r4-r6, lr } - mov r4, #0 C clear r4 - adds r0, r0, #0 C clear cy - tst n, #1 + push { r4, r5, r6, r7 } + + mov r7, #0 C initialize carry + tst n, #1 C n % 2 == 1? beq L(skip1) + + ldr ul, [up], #4 C handle one limb + ldr r5, [rp, #0] + umaal r5, r7, ul, vl + sub n, n, #1 + str r5, [rp], #4 + +L(skip1): + tst n, #2 C n % 4 == 2? 
+ beq L(skip2) + + ldr ul, [up], #4 C handle two limbs + ldr r5, [rp, #0] + umaal r5, r7, ul, vl ldr ul, [up], #4 - ldr rl, [rp, #0] - umull r5, r4, ul, vl - adds r, rl, r5 - str r, [rp], #4 -L(skip1): - tst n, #2 - beq L(skip2) - ldr ul, [up], #4 - ldr rl, [rp, #0] - mov r5, #0 - umlal r4, r5, ul, vl - ldr ul, [up], #4 - adcs r, rl, r4 - ldr rl, [rp, #4] - mov r4, #0 - umlal r5, r4, ul, vl - str r, [rp], #4 - adcs r, rl, r5 - str r, [rp], #4 + ldr r6, [rp, #4] + umaal r6, r7, ul, vl + sub n, n, #2 + stmia rp!, { r5, r6 } + L(skip2): - bics r, n, #3 + tst n, n C n was <= 3? beq L(return) - ldr ul, [up], #4 - ldr rl, [rp, #0] - mov r5, #0 - umlal r4, r5, ul, vl - b L(in) - + C Loop handling 4 limbs. Note that (at least on cortex-a8) the + C RdLo (first) input to umaal is not required until E2, while the + C RdHi (second) input is required at E1. Given that the carry-out + C (RdHi output) is not available until E5, we want the carry-in to + C use RdLo. This requires rotating which register holds the carry + C throught the loop. 
L(loop): ldr ul, [up], #4 - adcs r, rl, r5 - ldr rl, [rp, #4] - mov r5, #0 - umlal r4, r5, ul, vl - str r, [rp], #4 -L(in): ldr ul, [up], #4 - adcs r, rl, r4 - ldr rl, [rp, #4] - mov r4, #0 - umlal r5, r4, ul, vl - str r, [rp], #4 + ldr r5, [rp, #0] + umaal r5, r7, ul, vl + ldr ul, [up], #4 - adcs r, rl, r5 - ldr rl, [rp, #4] - mov r5, #0 - umlal r4, r5, ul, vl - str r, [rp], #4 + ldr r6, [rp, #4] + umaal r7, r6, ul, vl + str r5, [rp], #4 + ldr ul, [up], #4 - adcs r, rl, r4 - ldr rl, [rp, #4] - mov r4, #0 - umlal r5, r4, ul, vl - str r, [rp], #4 - sub n, n, #4 - bics r, n, #3 + ldr r5, [rp, #4] + umaal r6, r5, ul, vl + str r7, [rp], #4 + + ldr ul, [up], #4 + ldr r7, [rp, #4] + umaal r5, r7, ul, vl + str r6, [rp], #4 + + subs n, n, #4 + str r5, [rp], #4 bne L(loop) - adcs r, rl, r5 - str r, [rp], #4 L(return): - adc r0, r4, #0 - ldmfd sp!, { r4-r6, pc } + mov r0, r7 C return the carry + pop { r4, r5, r6, r7 } + bx lr EPILOGUE(mpn_addmul_1) diff -r d7791bf4dfaf -r 03f2f54ff061 mpn/arm/mul_1.asm --- a/mpn/arm/mul_1.asm Wed Feb 01 11:12:08 2012 +1100 +++ b/mpn/arm/mul_1.asm Wed Feb 01 20:16:06 2012 +1100 @@ -2,7 +2,7 @@ dnl in a second limb vector. dnl Contributed by Robert Harley. -dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. +dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -22,57 +22,79 @@ include(`../config.m4') C cycles/limb -C StrongARM: 6-8 (dependent on vl value) -C XScale: ?-? - -C We should rewrite this along the lines of addmul_1.asm. That should save a -C cycle on StrongARM, and several cycles on XScale. +C StrongARM: ?? +C XScale: ?? +C Cortex-a8: 4.3 define(`rp',`r0') define(`up',`r1') define(`n',`r2') define(`vl',`r3') - +define(`ul',`r4') ASM_START() PROLOGUE(mpn_mul_1) - stmfd sp!, { r8, r9, lr } - ands r12, n, #1 + stmfd sp!, { r4, r5, r6, r7 } + + mov r7, #0 C initializse carry + tst n, #1 C n % 2 == 1? 
beq L(skip1) - ldr lr, [up], #4 - umull r9, r12, lr, vl - str r9, [rp], #4 + + ldr ul, [up], #4 C handle one limb + umull r5, r7, ul, vl + sub n, n, #1 + str r5, [rp], #4 + L(skip1): tst n, #2 beq L(skip2) - mov r8, r12 - ldmia up!, { r12, lr } - mov r9, #0 - umlal r8, r9, r12, vl - mov r12, #0 - umlal r9, r12, lr, vl - stmia rp!, { r8, r9 } + + ldr ul, [up], #4 C handle two limbs + mov r5, #0 + umaal r5, r7, ul, vl + ldr ul, [up], #4 + mov r6, #0 + umaal r6, r7, ul, vl + sub n, n, #2 + stmia rp!, { r5, r6 } + L(skip2): - bics n, n, #3 + tst n, n beq L(return) - stmfd sp!, { r6, r7 } + + C Loop handling 4 limbs. Note that (at least on cortex-a8) the + C RdLo (first) input to umaal is not required until E2, while the + C RdHi (second) input is required at E1. Given that the carry-out + C (RdHi output) is not available until E5, we want the carry-in to + C use RdLo. This requires rotating which register holds the carry + C throught the loop. L(loop): - mov r6, r12 - ldmia up!, { r8, r9, r12, lr } - ldr r7, [rp, #12] C cache allocate + ldr ul, [up], #4 + mov r5, #0 + umaal r5, r7, ul, vl + ldr r6, [rp, #12] C cache allocate + + ldr ul, [up], #4 + mov r6, #0 + umaal r7, r6, ul, vl + str r5, [rp], #4 + + ldr ul, [up], #4 + mov r5, #0 + umaal r6, r5, ul, vl + str r7, [rp], #4 + + ldr ul, [up], #4 mov r7, #0 - umlal r6, r7, r8, vl - mov r8, #0 - umlal r7, r8, r9, vl - mov r9, #0 - umlal r8, r9, r12, vl - mov r12, #0 - umlal r9, r12, lr, vl + umaal r5, r7, ul, vl + str r6, [rp], #4 + subs n, n, #4 - stmia rp!, { r6, r7, r8, r9 } + str r5, [rp], #4 bne L(loop) - ldmfd sp!, { r6, r7 } + L(return): - mov r0, r12 - ldmfd sp!, { r8, r9, pc } + mov r0, r7 C return the carry + ldmfd sp!, { r4, r5, r6, r7 } + bx lr EPILOGUE(mpn_mul_1) diff -r d7791bf4dfaf -r 03f2f54ff061 mpn/arm/submul_1.asm --- a/mpn/arm/submul_1.asm Wed Feb 01 11:12:08 2012 +1100 +++ b/mpn/arm/submul_1.asm Wed Feb 01 20:16:06 2012 +1100 @@ -23,6 +23,7 @@ C cycles/limb C StrongARM: 7.75-9.75 (dependent on vl 
value) C XScale: 8-9 (dependent on vl value, estimated) +C Cortex-a8: 4.85 define(`rp',`r0') define(`up',`r1') # HG changeset patch # User Richard Henderson <r...@twiddle.net> # Date 1328055128 -39600 # Branch rth/arm # Node ID d7791bf4dfaf7e6e7597941cbf556029544e90fb # Parent d7351c2ba64cf47899167fa81999d68c9ddb99ee arm: Use builtins for count_leading/trailing_zeros. diff -r d7351c2ba64c -r d7791bf4dfaf longlong.h --- a/longlong.h Mon Jan 30 23:59:39 2012 +0100 +++ b/longlong.h Wed Feb 01 11:12:08 2012 +1100 @@ -513,12 +513,10 @@ #define UDIV_TIME 200 #endif /* LONGLONG_STANDALONE */ #endif -#if defined (__ARM_ARCH_5__) -/* This actually requires arm 5 */ -#define count_leading_zeros(count, x) \ - __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x)) +/* Let GCC decide how to implement these. */ +#define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count,x) +#define count_trailing_zeros(count, x) count_trailing_zeros_gcc_clz(count,x) #define COUNT_LEADING_ZEROS_0 32 -#endif #endif /* __arm__ */ #if defined (__clipper__) && W_TYPE_SIZE == 32
_______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org http://gmplib.org/mailman/listinfo/gmp-devel