On 02/26/2013 05:14 AM, Niels Möller wrote:
> Untried tricks: One could try to use vuzp to separate high and low
> parts of the products. Then only the low parts need shifting around.
> I guess I'll try that with addmul_4 first, to see if it makes for any
> improvement. One could maybe use vaddw, to delay adding in one of the
> carry limbs, reducing the recurrency to only vuzp, vaddw (but if the
> recurrency isn't the bottleneck, that won't help).
For discussion.  FWIW, this comes in at 4.0 c/l.  That's slower than the
numbers from the previously posted row-wise addmul_4, but I'd unrolled
that one 4 times, so it's not quite a fair comparison.  That said...

From Niels' message down-thread:

> 2 vmlal
> 2 vext
> 2 vpaddl
> 2 ld1 (scalar)
> 1 st1 (scalar)
> 1 subs
> 1 bne
> ---
> 11

Here:

  3 vadd
  2 vmull
  2 vpadal
  2 ld1
  1 vpaddl
  1 vuzp
  1 vext
  1 st1
  1 subs
  1 bne
  ---
 15

It requires 4 more insns per loop.  That really should not be surprising,
since we're producing a single folded carry term per loop, rather than
having the carry distributed across the elements of a vector.

The fact that the carry is consolidated means that we cannot use vmlal,
because the carry may be as large as 35 bits.  Which means that we must
wait until the product has been decomposed into high and low halves
before we can add the carry in without losing data.
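The 35-bit bound on the folded carry can be sanity-checked with a quick
script (my own sketch, not part of the patch): per column, the loop
computes sum = cin + addend + four product low halves, then
cout = hi(sum) + four product high halves.  Iterating the worst case to a
fixed point shows the carry tops out at exactly 2^34:

```python
M = 2**32 - 1                  # largest 32-bit limb
HI = (M * M) >> 32             # largest high half of a 32x32 product
LO = M                         # largest low half

cout = 0
for _ in range(64):
    s = cout + M + 4 * LO      # cin + addend limb + four low halves
    cout = (s >> 32) + 4 * HI  # hi(sum) + four high halves

print(cout, cout.bit_length())  # 17179869184 (= 2**34), so 35 bits
```

So the carry does not fit in 32 bits, which is what rules out folding it
into a vmlal accumulator before the product is split.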
dnl  ARM neon mpn_addmul_4.
dnl
dnl  Copyright 2013 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arch	armv6t2
	.arm

C cycles/limb

define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
define(`vp', `r3')

define(`Qu03', `q0')	C v4si of four multiplicand limbs
define(`Du01', `d0')	C v2si aliases
define(`Du23', `d1')
define(`Dv10', `d2')	C v2si of two multiplier limbs
define(`Dv32', `d3')	C ... 2nd

define(`Da0',  `d4')	C one zero-extended addend limb
define(`Dsum', `d5')	C Three names for the same register, used in the
define(`Dcin', `d5')	C ... appropriate context for clarity regarding
define(`Dcout',`d5')	C ... the role it's currently playing in the round.

define(`Qu4', `q3')	C one multiplicand limb, viewed inside v4si
define(`Du4', `d6')	C ... or v2si.

define(`Qp01', `q8')	C v2di of product
define(`Dp0', `d16')	C v1di (scalar) aliases of Qp01
define(`Dp1', `d17')
define(`Qp23', `q9')	C ... 2nd
define(`Dp2', `d18')
define(`Dp3', `d19')

define(`Qzero', `q15')	C The zero constant.
define(`Dzero', `d31')

ASM_START()
PROLOGUE(mpn_addmul_4)
	vld1.32		{Dv10, Dv32}, [vp]	@ load v0-v3
	mov		vp, rp			@ read from vp, write to rp
	vld1.32		{Du01, Du23}, [up]!	@ load u0-u3
	vld1.32		{Da0}, [vp]!		@ load a0-a1
	vmov.i32	Qzero, #0

	@ Column 0: only consider the U0*V0 product.
	vmull.u32	Qp01, Du01, Dv10	@ v2di{ u0*v0, ignored }
	vext.32		Dv32, Dv32, Dv32, #1	@ arrange v0-v3 as v3-v0
	vext.32		Dv10, Dv10, Dv10, #1
	vaddw.u32	Qp01, Qp01, Da0		@ v2di{ u0*v0+a0, ignored }
	vshr.u64	Dcout, Dp0, #32		@ cout = hi(p0)
	vshr.u64	Da0, Da0, #32		@ zero-extend a1
	vst1.32		{Dp0[0]}, [rp]!		@ output lo(p0)

	@ Column 1: only consider the U1*V0 and U0*V1 products.
	vmull.u32	Qp01, Du01, Dv10	@ v2di{ u0*v1, u1*v0 }
	vadd.i64	Dsum, Dcin, Da0		@ sum = cin + a1
	vld1.32		{Da0[0]}, [vp]!		@ load (zero-extended) a2

	@ Reorganize the high and low halves of the 64-bit products:
	@   Dp0 = v1di{ u0*v1 } = v2si{ a, b }
	@   Dp1 = v1di{ u1*v0 } = v2si{ x, y }
	vuzp.32		Dp0, Dp1
	@   Dp0 = v2si{ a, x } = { lo(u0*v1), lo(u1*v0) }
	@   Dp1 = v2si{ y, b } = { hi(u1*v0), hi(u0*v1) }
	vpadal.u32	Dsum, Dp0		@ sum = cin + a1 + a + x
	vst1.32		{Dsum[0]}, [rp]!	@ output lo(sum)
	vshr.u64	Dcout, Dsum, #32	@ cout = hi(sum)
	vpadal.u32	Dcout, Dp1		@ cout = hi(sum) + y + b

	@ Column 2: We have 3 products, but just that is hard to manage,
	@ so we're going to drop into the main loop with a zero placed to
	@ moot the multiplication with v3.
	vshr.u64	Du4, Du23, #32		@ v2si{ u3, 0 }
	vext.32		Qu03, Qzero, Qu03, #3	@ v4si{ 0, u0, u1, u2 }

	@ We've done two rounds so far.  Since we load data for round N+1
	@ inside the loop, make sure we exit the loop one round early.
	sub		n, n, #2+1

	ALIGN(16)
L(loop):
	@ 4 x 1 multiplies, all producing results in the same column.
	vmull.u32	Qp01, Du01, Dv32
	vadd.i64	Dsum, Dcin, Da0		@ sum = cin + a0
	vld1.32		{Da0[0]}, [vp]!		@ load next A limb
	vmull.u32	Qp23, Du23, Dv10
	vext.32		Qu03, Qu03, Qu4, #1	@ { u4, u3, u2, u1 }
	vld1.32		{Du4[0]}, [up]!		@ load next U limb

	@ Reorganize the high and low halves of the 64-bit products:
	@   Qp01 = v2di{ u0*v3, u1*v2 } = v4si{ a, b, c, d }
	@   Qp23 = v2di{ u2*v1, u3*v0 } = v4si{ w, x, y, z }
	vuzp.32		Qp01, Qp23
	@   Qp01 = v4si{ x, z, b, d } = v4si{ lo(u2*v1), ..., lo(u1*v2) }
	@   Qp23 = v4si{ w, y, a, c } = v4si{ hi(u2*v1), ..., hi(u1*v2) }

	@ Independently sum the high and low portions.
	vpadal.u32	Dsum, Dp0		@ sum += x + z
	vpaddl.u32	Qp23, Qp23		@ v2di{ w+y, a+c }
	vpadal.u32	Dsum, Dp1		@ sum += b + d
	vadd.i64	Dp2, Dp2, Dp3		@ p2 = w+y+a+c
	vst1.i32	{Dsum[0]}, [rp]!	@ output lo(sum)
	vshr.u64	Dcout, Dsum, #32	@ cout = hi(sum)
	subs		n, n, #1
	vadd.i64	Dcout, Dp2		@ cout = hi(sum) + w+y+a+c
	bne		L(loop)

	@ Column N: Four products, but no more data to prefetch.
	vmull.u32	Qp01, Du01, Dv32
	vadd.i64	Dsum, Dcin, Da0		@ sum = cin + a0
	vmull.u32	Qp23, Du23, Dv10
	vext.32		Qu03, Qu03, Qzero, #1	@ { 0, u3, u2, u1 }
	vuzp.32		Qp01, Qp23		@ separate hi's and lo's
	vpadal.u32	Dsum, Dp0		@ sum += x + z
	vpaddl.u32	Qp23, Qp23		@ v2di{ w+y, a+c }
	vpadal.u32	Dsum, Dp1		@ sum += b + d
	vadd.i64	Dp2, Dp2, Dp3		@ p2 = w+y+a+c
	vst1.i32	{Dsum[0]}, [rp]!	@ output lo(sum)
	vshr.u64	Dcout, Dsum, #32	@ cout = hi(sum)
	vadd.i64	Dcout, Dp2		@ cout = hi(sum) + w+y+a+c

	@ Column N+1: Three products.  We shifted in a zero so that
	@ we could continue to use the four product pattern.
	vmull.u32	Qp01, Du01, Dv32
	vmull.u32	Qp23, Du23, Dv10
	vext.32		Du23, Du01, Du23, #1	@ rebuild { u2, u3 }
	@   Qp01 = v2di{ u0*v3, u1*v2 } = v4si{ a, b, c, d }
	@   Qp23 = v2di{ u2*v1, 0 }     = v4si{ w, x, 0, 0 }
	vuzp.32		Qp01, Qp23
	@   Qp01 = { x, 0, b, d }, Qp23 = { w, 0, a, c }
	vpadal.u32	Dsum, Dp0		@ sum += x + z
	vpaddl.u32	Qp23, Qp23		@ v2di{ w, a+c }
	vpadal.u32	Dsum, Dp1		@ sum += b + d
	vadd.i64	Dp2, Dp2, Dp3		@ p2 = w+a+c
	vst1.i32	{Dsum[0]}, [rp]!	@ output lo(sum)
	vshr.u64	Dcout, Dsum, #32	@ cout = hi(sum)
	vadd.i64	Dcout, Dp2		@ cout = hi(sum) + w+a+c

	@ Column N+2: only consider the U3*V2 and U2*V3 products.
	vmull.u32	Qp01, Du23, Dv32
	vuzp.32		Dp0, Dp1		@ separate hi's and lo's
	vpadal.u32	Dsum, Dp0		@ sum = cin + a + x
	vpaddl.u32	Dp1, Dp1		@ p1 = b + y
	vst1.32		{Dsum[0]}, [rp]!	@ output lo(sum)
	vsra.u64	Dp1, Dsum, #32		@ p1 is cout this round

	@ Column N+3: only consider the U3*V3 product.
	vmlal.u32	Qp01, Du23, Dv32[0]	@ v2di{ ignored, u3*v3+cin }
	vst1.32		{Dp1[0]}, [rp]!		@ output lo(p1)
	vmov.32		r0, Dp1[1]		@ return hi(p1)
	bx		lr

EPILOGUE()
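For reference, the column-wise accumulation above can be modeled with a
few lines of Python (my own sketch; the limb counts follow the usual mpn
conventions, which is an assumption on my part, not something stated in
the patch).  Each output column folds the carry into a single term,
exactly as the loop does:

```python
def addmul4_columns(a, u, v, base=2**32):
    """Column-wise a + u*v for limb vectors, least-significant limb first:
    u has n limbs, v has 4, a has n+3.  Returns (n+3 output limbs, top limb)."""
    n = len(u)
    out, carry = [], 0
    for col in range(n + 3):
        s = carry + a[col]                 # cin + addend limb
        for i in range(4):                 # up to four products per column
            j = col - i
            if 0 <= j < n:
                s += u[j] * v[i]
        out.append(s % base)               # emit lo(sum)
        carry = s // base                  # folded carry, up to 35 bits
    return out, carry

# Check against ordinary big-integer arithmetic.
def val(limbs, base=2**32):
    return sum(d * base**i for i, d in enumerate(limbs))

u, v = [5, 6, 7, 8, 9], [1, 2, 3, 4]
a = [10] * (len(u) + 3)
out, top = addmul4_columns(a, u, v)
assert val(out + [top]) == val(a) + val(u) * val(v)
```

The inner loop over `i` is what the asm does with the two vmull + vuzp +
vpadal groups; `carry` plays the role of Dcin/Dcout.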
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
http://gmplib.org/mailman/listinfo/gmp-devel