Torbjörn Granlund <t...@gmplib.org> writes:

> ni...@lysator.liu.se (Niels Möller) writes:
>
>   Same as in the current (from 2013) version. Delaying the write is a bit
>   tricky, since we already use all registers. But it would be better to
>   update the quotient limbs in memory only in the unlikely
>   carry-propagation case. I figure adc to memory is no worse than explicit
>   load, adc, store (or adc from memory, store)?
>
> I did not realise that the register pressure was so bad.  Perhaps trying
> to decrease that would be most helpful.  Sometimes, when values tend to
> naturally migrate, some unrolling can reduce register pressure.

It turned out letting the Q1 register live between iterations wasn't
that difficult.

> If requiring mulx helps, I would for now forget about mul.  All relevant
> CPUs have mulx.

I now have a mulx version working. It's much neater, and slightly
faster. I think it can be sped up a bit more with better scheduling
(some of my intermediate less neat versions were faster than this), but
not really sure how. Could also switch to adox/adcx to get a little more
scheduling freedom.

L(loop):
        mulx(   B2, P0, P1)     C {p1, p0} <-- u1 B2
        mulx(   DINV, T0, T1)   C {t1, t0} <-- u1 dinv

        C {q2, q0} <-- u1 + q0
        xor     Q2, Q2
        add     %rdx, Q0
        adc     $0, Q2

        C {u2, u1, u0} <-- {u0, up[n-1]} + { p1, p0 }
        mov     U0, %rdx
        mov     -8(UP, UN, 8), U0
        add     P0, U0
        adc     P1, %rdx                C carry represents U2

        C u1 <-- u1 - u2 d
        lea     (%rdx, D), P0
        cmovc   P0, %rdx

        C {q2, q0} <-- {q2, q0} + {q1, t1} + u2
        adc     T1, Q0
        adc     Q1, Q2
        mov     Q2, 8(QP, UN, 8)
        jc      L(q_incr)
L(q_incr_done):
        mov     Q0, Q1
        mov     T0, Q0
        dec     UN
        jnz     L(loop)

The most critical recurrency is the one via %rdx, used for
U1: That's mulx, add, adc, lea, cmov. In addition, each iteration
depends on the previous one via U0, Q0 and Q1.

Complete file attached, if you want to try it out.

Regards,
/Niels

dnl  x86-64 mpn_div_qr_1n_pi1
dnl  -- Divide an mpn number by a normalized single-limb number,
dnl     using a single-limb inverse.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013, 2021 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT Parameters
define(`QP', `%rdi')
define(`UP', `%rsi')
define(`UN_INPUT', `%rdx')
define(`U1_INPUT', `%rcx')
define(`D', `%r8')
define(`DINV', `%r9')

C Invariants
define(`B2', `%rbp')

C Variables. Note that U1 is register %rdx, which is special for mulx.
define(`UN', `%rbx')
define(`P0', `%r10')
define(`T1', `%r12')
define(`U0', `%r11')
define(`Q0', `%r13')
define(`Q1', `%r14')
define(`Q2', `%r15')
define(`T0', `%rax')
define(`P1', `%rcx')            C Overlaps U1_INPUT

ABI_SUPPORT(STD64)

        ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_div_qr_1n_pi1)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8   ')
IFDOS(` mov     64(%rsp), %r9   ')
        dec     UN_INPUT
        jnz     L(first)

        C Just a single 2/1 division.
        C Use scratch registers only
        lea     1(U1_INPUT), %r10
        mov     U1_INPUT, %rdx
        mulx(   DINV, %rax, %r11)
        mov     (UP), %rcx
        add     %rcx, %rax
        adc     %r10, %r11
        mov     %r11, %r10
        imul    D, %r11
        sub     %r11, %rcx
        cmp     %rcx, %rax

        lea     (%rcx, D), %rax C For return value
        cmovnc  %rcx, %rax
        sbb     $0, %r10
        cmp     D, %rax
        jc      L(single_div_done)
        sub     D, %rax
        add     $1, %r10
L(single_div_done):
        mov     %r10, (QP)
        FUNC_EXIT()
        ret
L(first):
        C FIXME: Could delay some of these until we enter the loop.
        push    %r15
        push    %r14
        push    %r13
        push    %r12
        push    %rbx
        push    %rbp

        neg     D
        mov     D, B2
        imul    DINV, B2

        mov     UN_INPUT, UN
        mov     U1_INPUT, %rdx

        mulx(   DINV, T0, P1)
        mov     T0, Q0
        mov     %rdx, Q1
        add     P1, Q1

        mulx(   B2, T0, P1)
        mov     -8(UP, UN, 8), U0
        mov     (UP, UN, 8), %rdx
        add     T0, U0
        adc     P1, %rdx
        lea     (%rdx, D), P0
        cmovc   P0, %rdx
        adc     $0, Q1
        mov     Q1, (QP, UN, 8)

        dec     UN
        jz      L(final)

        C Algorithm based on DIV_QR_1N_METHOD == 3

        C For the u update:
        C        +-------+
        C        |u1 * B2|
        C        +---+---+
        C      + |u0 |u-1|
        C        +---+---+
        C      - | d |     (conditional on carry)
        C     ---+---+---+
        C        |u1 | u0|
        C        +---+---+

        C For the q update:
        C           +-------+
        C           |u1 * v |
        C           +---+---+
        C           | u1|
        C           +---+
        C           | 1 |  (conditional on {u1, u0} carry)
        C       +---+---+
        C    +  | q1| q0|
        C   +---+---+---+---+
        C   | q3| q2| q1| q0|
        C   +---+---+---+---+

        ALIGN(16)
L(loop):
        mulx(   B2, P0, P1)     C {p1, p0} <-- u1 B2
        mulx(   DINV, T0, T1)   C {t1, t0} <-- u1 dinv

        C {q2, q0} <-- u1 + q0
        xor     Q2, Q2
        add     %rdx, Q0
        adc     $0, Q2

        C {u2, u1, u0} <-- {u0, up[n-1]} + { p1, p0 }
        mov     U0, %rdx
        mov     -8(UP, UN, 8), U0
        add     P0, U0
        adc     P1, %rdx                C carry represents U2

        C u1 <-- u1 - u2 d
        lea     (%rdx, D), P0
        cmovc   P0, %rdx

        C {q2, q0} <-- {q2, q0} + {q1, t1} + u2
        adc     T1, Q0
        adc     Q1, Q2
        mov     Q2, 8(QP, UN, 8)
        jc      L(q_incr)
L(q_incr_done):
        mov     Q0, Q1
        mov     T0, Q0
        dec     UN
        jnz     L(loop)

L(final):
        neg     D
        xor     Q2, Q2

        mov     %rdx, P0
        sub     D, %rdx
        cmovc   P0, %rdx
        sbb     $-1, Q2

        lea     1(%rdx), P0

        mulx(   DINV, T0, P1)
        add     U0, T0
        adc     P0, P1
        mov     P1, P0
        imul    D, P1
        sub     P1, U0
        cmp     U0, T0
        lea     (U0, D), %rax   C Clobbers T0
        cmovnc  U0, %rax
        sbb     $0, P0
        cmp     D, %rax
        jc      L(div_done)
        sub     D, %rax
        add     $1, P0
L(div_done):
        add     P0, Q0
        mov     Q0, (QP)
        adc     Q2, Q1
        mov     Q1, 8(QP)
        jnc     L(done)

L(final_q_incr):
        addq    $1, 16(QP)
        lea     8(QP), QP
        jc      L(final_q_incr)
L(done):
        pop     %rbp
        pop     %rbx
        pop     %r12
        pop     %r13
        pop     %r14
        pop     %r15
        FUNC_EXIT()
        ret

L(q_incr):
        C Q1 is not live, so use it for indexing
        lea     16(QP, UN, 8), P0
L(q_incr_loop):
        addq    $1, (P0)
        jnc     L(q_incr_done)
        lea     8(P0), P0
        jmp     L(q_incr_loop)
EPILOGUE()

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to