Building on the copyi that tege committed the other day, use neon for the logical operations too.

I did both a 128-bit aligned version,

$ ./speed-128 -p 1000000000 -C -s 10,50,100,500,1000,5000,10000 mpn_and_n mpn_nand_n
clock_gettime is 1.000ns accurate
overhead 6.00 cycles, precision 1000000000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
            mpn_and_n    mpn_nand_n
10            #1.7987        1.8986
50            #0.9393        1.0692
100           #1.2491        1.3890
500           #0.8154        0.9753
1000          #0.7786        0.9435
5000          #1.4955        1.5765
10000         #1.6532        1.7415

and a 256-bit aligned version, just to see if having a higher ratio of operation insns to memory insns would help,

$ ./speed-256 -p 1000000000 -C -s 10,50,100,500,1000,5000,10000 mpn_and_n mpn_nand_n
clock_gettime is 1.000ns accurate
overhead 6.00 cycles, precision 1000000000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
            mpn_and_n    mpn_nand_n
10            #1.5989        1.6988
50            #1.0992        1.1592
100           #1.0393        1.0593
500           #1.0373        1.0413
1000          #1.0303        1.0313
5000          #1.5914        1.6003
10000          1.6824       #1.6768

It's a bit curious how the latter is less "jaggy", but slightly slower.


r~
dnl  ARM Neon mpn_and_n, et al., 128-bit aligned variant.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            and cyc/l          nand cyc/l
C StrongARM      ?                  ?
C XScale         ?                  ?
C Cortex-A8      ?                  ?
C Cortex-A9      ?                  ?
C Cortex-A15     0.78               0.94

dnl  Argument registers, in the order the arguments are passed.
define(`rp', `r0')
define(`up', `r1')
define(`vp', `r2')
define(`n',  `r3')

dnl  POSTOP expands to nothing unless the selected operation overrides it
dnl  below.  The three negated-result operations override it with a vmvn of
dnl  its first argument onto itself.
define(`POSTOP')

dnl  Each OPERATION_xxx symbol selects the entry point name (func) and the
dnl  three-operand Neon instruction (LOGOP) used for that entry point.

dnl  and family: vand, vbic, and vand followed by vmvn.
ifdef(`OPERATION_and_n',`
  define(`LOGOP',   `vand       $1, $2, $3')
  define(`func',    `mpn_and_n')')
ifdef(`OPERATION_andn_n',`
  define(`LOGOP',   `vbic       $1, $2, $3')
  define(`func',    `mpn_andn_n')')
ifdef(`OPERATION_nand_n',`
  define(`LOGOP',   `vand       $1, $2, $3')
  define(`POSTOP',  `vmvn       $1, $1')
  define(`func',    `mpn_nand_n')')

dnl  ior family: vorr, vorn, and vorr followed by vmvn.
ifdef(`OPERATION_ior_n',`
  define(`LOGOP',   `vorr       $1, $2, $3')
  define(`func',    `mpn_ior_n')')
ifdef(`OPERATION_iorn_n',`
  define(`LOGOP',   `vorn       $1, $2, $3')
  define(`func',    `mpn_iorn_n')')
ifdef(`OPERATION_nior_n',`
  define(`LOGOP',   `vorr       $1, $2, $3')
  define(`POSTOP',  `vmvn       $1, $1')
  define(`func',    `mpn_nior_n')')

dnl  xor family: veor, and veor followed by vmvn.
ifdef(`OPERATION_xor_n',`
  define(`LOGOP',   `veor       $1, $2, $3')
  define(`func',    `mpn_xor_n')')
ifdef(`OPERATION_xnor_n',`
  define(`LOGOP',   `veor       $1, $2, $3')
  define(`POSTOP',  `vmvn       $1, $1')
  define(`func',    `mpn_xnor_n')')

C List of entry points this file can provide.  Keep the whole list on one
C line; it was split by mail line wrapping.  NOTE(review): configure is
C believed to scan for this list line by line - confirm.
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
        .fpu    neon

C func (rp, up, vp, n)
C
C For each of the n limbs, apply LOGOP to the limbs at up and vp, then POSTOP
C to the result, and store it at rp.  All memory accesses are Neon vld1.32 and
C vst1.32, one 32-bit element per limb.
C
C   rp  destination pointer, advanced by every store
C   up  first source pointer, advanced by every load
C   vp  second source pointer, advanced by every load
C   n   limbs still to be handled; biased by -8 once the pipeline is primed
C
C Vector registers used: q0-q3 (d0 and d1 are the halves of q0).
C Alignment hints are given on stores through rp only, never on the loads.

PROLOGUE(func)
        cmp             n, #7
        ble             L(bc)

C Handle 1 and/or 2 leading limbs until rp is 128-bit aligned.  We get here
C only with n >= 8, so at least 5 limbs remain when L(al2) is reached.
        tst             rp, #4
        beq             L(al1)
        vld1.32         {d0[0]}, [up]!
        vld1.32         {d1[0]}, [vp]!
        sub             n, n, #1
        LOGOP(          d0, d0, d1)
        POSTOP(         d0)
        vst1.32         {d0[0]}, [rp]!
L(al1): tst             rp, #8
        beq             L(al2)
        vld1.32         {d0}, [up]!
        vld1.32         {d1}, [vp]!
        sub             n, n, #2
        LOGOP(          d0, d0, d1)
        POSTOP(         d0)
        vst1.32         {d0}, [rp:64]!

C Prime the pipeline with 4 limbs from each source in q2 and q3.  The
C subtraction accounts for those 4 limbs plus the 8 limbs one loop iteration
C would load, so a negative result means the loop must be skipped.
L(al2): vld1.32         {q2}, [up]!
        vld1.32         {q3}, [vp]!
        subs            n, n, #12
        blt             L(end)

C Main loop, 8 limbs per iteration.  On entry q2 and q3 hold 4 loaded but
C unprocessed limbs.  Each iteration stores those together with 4 limbs
C loaded into q0 and q1, and leaves 4 new limbs pending in q2 and q3.  The
C flags from the subs are consumed by the bge at the bottom; the instructions
C in between are all Neon loads, stores and logical operations.
        ALIGN(16)
L(top): vld1.32         {q0}, [up]!
        LOGOP(          q2, q2, q3)
        vld1.32         {q1}, [vp]!
        POSTOP(         q2)
        subs            n, n, #8
        vst1.32         {q2}, [rp:128]!
        vld1.32         {q2}, [up]!
        LOGOP(          q0, q0, q1)
        vld1.32         {q3}, [vp]!
        POSTOP(         q0)
        vst1.32         {q0}, [rp:128]!
        bge             L(top)

C Drain the pipeline: finish the 4 limbs still pending in q2 and q3.
L(end): LOGOP(          q2, q2, q3)
        POSTOP(         q2)
        vst1.32         {q2}, [rp:128]!

C Handle the last 0-7 limbs, selected by the low three bits of n.  After the
C loop n equals the number of remaining limbs minus 8, so those bits are
C still right.  Note that rp is aligned after the loop, but not when we
C arrive here straight from the entry test, hence no alignment hints below.
L(bc):  tst             n, #4
        beq             L(tl1)
        vld1.32         {q0}, [up]!
        vld1.32         {q1}, [vp]!
        LOGOP(          q0, q0, q1)
        POSTOP(         q0)
        vst1.32         {q0}, [rp]!
L(tl1): tst             n, #2
        beq             L(tl2)
        vld1.32         {d0}, [up]!
        vld1.32         {d1}, [vp]!
        LOGOP(          d0, d0, d1)
        POSTOP(         d0)
        vst1.32         {d0}, [rp]!
L(tl2): tst             n, #1
        beq             L(tl3)
        vld1.32         {d0[0]}, [up]!
        vld1.32         {d1[0]}, [vp]!
        LOGOP(          d0, d0, d1)
        POSTOP(         d0)
        vst1.32         {d0[0]}, [rp]!
L(tl3): bx              lr
EPILOGUE()
dnl  ARM Neon mpn_and_n, et al., 256-bit aligned variant.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            and cyc/l          nand cyc/l
C StrongARM      ?                  ?
C XScale         ?                  ?
C Cortex-A8      ?                  ?
C Cortex-A9      ?                  ?
C Cortex-A15     1                  1

dnl  Argument registers, in the order the arguments are passed.
define(`rp', `r0')
define(`up', `r1')
define(`vp', `r2')
define(`n',  `r3')

dnl  POSTOP expands to nothing unless the selected operation overrides it
dnl  below.  The three negated-result operations override it with a vmvn of
dnl  its first argument onto itself.
define(`POSTOP')

dnl  Each OPERATION_xxx symbol selects the entry point name (func) and the
dnl  three-operand Neon instruction (LOGOP) used for that entry point.

dnl  and family: vand, vbic, and vand followed by vmvn.
ifdef(`OPERATION_and_n',`
  define(`LOGOP',   `vand       $1, $2, $3')
  define(`func',    `mpn_and_n')')
ifdef(`OPERATION_andn_n',`
  define(`LOGOP',   `vbic       $1, $2, $3')
  define(`func',    `mpn_andn_n')')
ifdef(`OPERATION_nand_n',`
  define(`LOGOP',   `vand       $1, $2, $3')
  define(`POSTOP',  `vmvn       $1, $1')
  define(`func',    `mpn_nand_n')')

dnl  ior family: vorr, vorn, and vorr followed by vmvn.
ifdef(`OPERATION_ior_n',`
  define(`LOGOP',   `vorr       $1, $2, $3')
  define(`func',    `mpn_ior_n')')
ifdef(`OPERATION_iorn_n',`
  define(`LOGOP',   `vorn       $1, $2, $3')
  define(`func',    `mpn_iorn_n')')
ifdef(`OPERATION_nior_n',`
  define(`LOGOP',   `vorr       $1, $2, $3')
  define(`POSTOP',  `vmvn       $1, $1')
  define(`func',    `mpn_nior_n')')

dnl  xor family: veor, and veor followed by vmvn.
ifdef(`OPERATION_xor_n',`
  define(`LOGOP',   `veor       $1, $2, $3')
  define(`func',    `mpn_xor_n')')
ifdef(`OPERATION_xnor_n',`
  define(`LOGOP',   `veor       $1, $2, $3')
  define(`POSTOP',  `vmvn       $1, $1')
  define(`func',    `mpn_xnor_n')')

C List of entry points this file can provide.  Keep the whole list on one
C line; it was split by mail line wrapping.  NOTE(review): configure is
C believed to scan for this list line by line - confirm.
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
        .fpu    neon

C func (rp, up, vp, n)
C
C For each of the n limbs, apply LOGOP to the limbs at up and vp, then POSTOP
C to the result, and store it at rp.  All memory accesses are Neon vld1.32 and
C vst1.32, one 32-bit element per limb.
C
C   rp  destination pointer, advanced by every store
C   up  first source pointer, advanced by every load
C   vp  second source pointer, advanced by every load
C   n   limbs still to be handled; biased by -16 once the pipeline is primed
C
C Vector registers used: q0-q3 and q8-q11 (d0 and d1 are the halves of q0).
C Alignment hints are given on stores through rp only, never on the loads.

PROLOGUE(func)
        cmp             n, #15
        ble             L(bc)

C Handle 1, 2 and/or 4 leading limbs until rp is 256-bit aligned.  We get
C here only with n >= 16, so at least 9 limbs remain when L(al3) is reached.
        tst             rp, #4
        beq             L(al1)
        vld1.32         {d0[0]}, [up]!
        vld1.32         {d1[0]}, [vp]!
        sub             n, n, #1
        LOGOP(          d0, d0, d1)
        POSTOP(         d0)
        vst1.32         {d0[0]}, [rp]!
L(al1): tst             rp, #8
        beq             L(al2)
        vld1.32         {d0}, [up]!
        vld1.32         {d1}, [vp]!
        sub             n, n, #2
        LOGOP(          d0, d0, d1)
        POSTOP(         d0)
        vst1.32         {d0}, [rp:64]!
L(al2): tst             rp, #16
        beq             L(al3)
        vld1.32         {q0}, [up]!
        vld1.32         {q1}, [vp]!
        sub             n, n, #4
        LOGOP(          q0, q0, q1)
        POSTOP(         q0)
        vst1.32         {q0}, [rp:128]!

C Prime the pipeline with 8 limbs from each source in q8-q9 and q10-q11.  The
C subtraction accounts for those 8 limbs plus the 16 limbs one loop iteration
C would load, so a negative result means the loop must be skipped.
L(al3): vld1.32         {q8-q9}, [up]!
        vld1.32         {q10-q11}, [vp]!
        subs            n, n, #8+16
        blt             L(end)

C Main loop, 16 limbs per iteration.  On entry q8-q11 hold 8 loaded but
C unprocessed limbs from each source.  Each iteration stores those together
C with 8 limbs loaded into q0-q3, and leaves 8 new limbs pending in q8-q11.
C The flags from the subs are consumed by the bge at the bottom; the
C instructions in between are all Neon loads, stores and logical operations.
        ALIGN(16)
L(top): vld1.32         {q0-q1}, [up]!
        LOGOP(          q8, q8, q10)
        vld1.32         {q2-q3}, [vp]!
        LOGOP(          q9, q9, q11)
        POSTOP(         q8)
        POSTOP(         q9)
        vst1.32         {q8-q9}, [rp:256]!
        subs            n, n, #16
        vld1.32         {q8-q9}, [up]!
        LOGOP(          q0, q0, q2)
        vld1.32         {q10-q11}, [vp]!
        LOGOP(          q1, q1, q3)
        POSTOP(         q0)
        POSTOP(         q1)
        vst1.32         {q0-q1}, [rp:256]!
        bge             L(top)

C Drain the pipeline: finish the 8 limbs still pending in q8-q11.
L(end): LOGOP(          q8, q8, q10)
        LOGOP(          q9, q9, q11)
        POSTOP(         q8)
        POSTOP(         q9)
        vst1.32         {q8-q9}, [rp:256]!

C Handle the last 0-15 limbs, selected by the low four bits of n.  After the
C loop n equals the number of remaining limbs minus 16, so those bits are
C still right.  Note that rp is aligned after the loop, but not when we
C arrive here straight from the entry test, hence no alignment hints below.
L(bc):  tst             n, #8
        beq             L(tl0)
        vld1.32         {q0-q1}, [up]!
        vld1.32         {q2-q3}, [vp]!
        LOGOP(          q0, q0, q2)
        LOGOP(          q1, q1, q3)
        POSTOP(         q0)
        POSTOP(         q1)
        vst1.32         {q0-q1}, [rp]!
L(tl0): tst             n, #4
        beq             L(tl1)
        vld1.32         {q0}, [up]!
        vld1.32         {q1}, [vp]!
        LOGOP(          q0, q0, q1)
        POSTOP(         q0)
        vst1.32         {q0}, [rp]!
L(tl1): tst             n, #2
        beq             L(tl2)
        vld1.32         {d0}, [up]!
        vld1.32         {d1}, [vp]!
        LOGOP(          d0, d0, d1)
        POSTOP(         d0)
        vst1.32         {d0}, [rp]!
L(tl2): tst             n, #1
        beq             L(tl3)
        vld1.32         {d0[0]}, [up]!
        vld1.32         {d1[0]}, [vp]!
        LOGOP(          d0, d0, d1)
        POSTOP(         d0)
        vst1.32         {d0[0]}, [rp]!
L(tl3): bx              lr
EPILOGUE()
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
http://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to