ni...@lysator.liu.se (Niels Möller) writes:

>> dnl  AMD64 mpn_addsubmul_1msb0, R = Au - Bv, u,v < 2^63.
>
> This comment is obviously wrong ;-)
>
> But that function could be implemented by adding two "not %rdx" in the
> right places of the loop, plus small adjustment just before and after
> the loop.
>
> Since
>
>  Au - Bv = Au + (2^{64 n} - 1 - B) v - 2^{64 n} v + v
>
> So complement B on the fly, set initial carry limb to v, and subtract v from
> the return value. (Same trick as in arm/v7a/cora15/submul_1).
>
> Should definitely be worth a try, before trying some completely
> different loop.

Tried now; the implementation below appears to work fine.

But it is considerably slower. Those extra not instructions appear to cost one
cycle per limb on my machine: addmul_1 at 2 c/l, addaddmul_1msb0 at 3
c/l, and addsubmul_1msb0 at 4 c/l. I didn't expect that much.

Regards,
/Niels


---------8<----------

dnl  AMD64 mpn_addsubmul_1msb0, R = Au - Bv, u,v < 2^63.

dnl  Contributed to the GNU project by Niels Möller and Torbjörn Granlund.

dnl  Copyright 2021 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS
C Symbolic names for the six arguments, in the registers where the code
C below expects them.
define(`rp',    `%rdi')         C destination limbs, written in the loop
define(`ap',    `%rsi')         C source limbs multiplied by u0
define(`bp_param', `%rdx')      C source limbs multiplied by v0, as passed in
define(`n',     `%rcx')         C limb count, negated and used as an index
define(`u0',    `%r8')          C multiplier for the ap limbs
define(`v0',    `%r9')          C multiplier for the bp limbs

C The loop loads every multiplicand into %rdx before each mulx, so the bp
C pointer is moved out of %rdx into a register that is saved and restored.
define(`bp', `%rbx')

C Scratch registers used inside the loop.
define(`c0', `%rax')    C carry limb, and return value
define(`l0', `%r10')    C low limb of the ap product
define(`l1', `%r11')    C low limb of the bp product, then the result limb
define(`hi', `%rbp')    C high limb of the ap product

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_addsubmul_1msb0)
C Computes R = A u0 - B v0 over n limbs, storing R at rp, and returns in c0
C the final carry limb minus v0. Per the file header, u0 and v0 must both
C be below 2^63. The entry code does not handle a limb count of zero.
C
C The subtraction is turned into an addition through the identity
C   A u - B v = A u + (2^{64 n} - 1 - B) v - 2^{64 n} v + v
C so each B limb is complemented on the fly, the carry limb starts out as
C v0, and v0 is subtracted from the carry limb at the end.
C
C Two carry chains run in parallel. OF, driven by adox, carries from the
C low product sum into the next carry limb. CF, driven by adc, carries
C between consecutive result limbs.
        FUNC_ENTRY(4)
C On DOS64 builds, u0 and v0 are loaded from the stack.
IFDOS(` mov     56(%rsp), %r8   ')
IFDOS(` mov     64(%rsp), %r9   ')

        push    %rbx
        push    %rbp

C Point all three operands just past their last limb, then index them with
C a negative count that runs up to zero.
        lea     (ap,n,8), ap
        lea     (bp_param,n,8), bp
        lea     (rp,n,8), rp
        neg     n

        mov     v0, c0                  C Initial carry limb is v0
        test    $1, R32(n)              C Also clears CF and OF
        jnz     L(mid)                  C Odd count, do a single limb first

C Main loop, two limbs per iteration.
        ALIGN(16)
L(top): mov     (ap,n,8), %rdx
        mulx(   u0, l0, hi)             C hi:l0 = a limb times u0
        mov     (bp,n,8), %rdx
        not     %rdx                    C Complement the b limb
        adox(   c0, l0)                 C l0 += carry limb + OF
        mulx(   v0, l1, c0)             C c0:l1 = complemented b limb times v0
        adox(   hi, c0)                 C c0 += hi + OF, the next carry limb
        adc     l0, l1                  C l1 += l0 + CF
        mov     l1, (rp,n,8)
        inc     n                       C Clears OF (since n != 2^63 - 1)
L(mid): mov     (ap,n,8), %rdx
        mulx(   u0, l0, hi)             C hi:l0 = a limb times u0
        mov     (bp,n,8), %rdx
        not     %rdx                    C Complement the b limb
        adox(   c0, l0)                 C l0 += carry limb + OF
        mulx(   v0, l1, c0)             C c0:l1 = complemented b limb times v0
        adox(   hi, c0)                 C c0 += hi + OF, the next carry limb
        adc     l0, l1                  C l1 += l0 + CF
        mov     l1, (rp,n,8)
        inc     n                       C Leaves CF alone, sets ZF when done
        jnz     L(top)

L(end): adc     $0, c0                  C Fold the last CF into the carry limb
        sub     v0, c0                  C Subtract v0, per the identity above
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to