Torbjörn Granlund <t...@gmplib.org> writes:

> And, when there is asm, the scary performance ratio between C and asm
> which would suggest that C-to-C comparisons of tight GMP loops might not
> be terribly relevant.  :-)

I've got a x86_64 variant of method 3 to work. Attached below. It's a
bit clumsy, with lots of moves. It's tempting to write a variant using
mulx and maybe adcx/adox too.

On my laptop, it's half a cycle per limb faster than current
implementation, 7.3 c/l vs 7.7. 

Regards,
/Niels

dnl  x86-64 mpn_div_qr_1n_pi1
dnl  -- Divide an mpn number by a normalized single-limb number,
dnl     using a single-limb inverse.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013, 2021 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT Parameters
define(`QP', `%rdi')
define(`UP', `%rsi')
define(`UN_INPUT', `%rdx')
define(`U1', `%rcx')
define(`D', `%r8')
define(`DINV', `%r9')

C Invariants
define(`B2', `%rbp')

C Variables
define(`UN', `%rbx')
define(`T', `%r10')
define(`U0', `%r11')
define(`U2', `%r12')
define(`Q0', `%r13')
define(`Q1', `%r14')
define(`Q2', `%r15')

ABI_SUPPORT(STD64)

        ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_div_qr_1n_pi1)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8   ')
IFDOS(` mov     64(%rsp), %r9   ')
        dec     UN_INPUT
        jnz     L(first)

        C Just a single 2/1 division.
        C T, U0 are allocated in scratch registers
        lea     1(U1), T
        mov     U1, %rax
        mul     DINV
        mov     (UP), U0
        add     U0, %rax
        adc     T, %rdx
        mov     %rdx, T
        imul    D, %rdx
        sub     %rdx, U0
        cmp     U0, %rax
        lea     (U0, D), %rax
        cmovnc  U0, %rax
        sbb     $0, T
        cmp     D, %rax
        jc      L(single_div_done)
        sub     D, %rax
        add     $1, T
L(single_div_done):
        mov     T, (QP)
        FUNC_EXIT()
        ret
L(first):
        C FIXME: Could delay some of these until we enter the loop.
        push    %r15
        push    %r14
        push    %r13
        push    %r12
        push    %rbx
        push    %rbp

        neg     D
        mov     D, B2
        imul    DINV, B2

        mov     UN_INPUT, UN

        mov     DINV, %rax
        mul     U1
        mov     %rax, Q0
        mov     U1, Q1
        add     %rdx, Q1
        
        mov     B2, %rax
        mul     U1
        mov     -8(UP, UN, 8), U0
        mov     (UP, UN, 8), U1
        add     %rax, U0
        adc     %rdx, U1
        lea     (U1, D), T
        cmovc   T, U1
        adc     $0, Q1
        mov     Q1, (QP, UN, 8)

        dec     UN
        C mov   U1, %rax
        jz      L(final)

        ALIGN(16)
L(loop):
        mov     B2, %rax
        mul     U1              C {p1, p0} <-- u1 B2

        C {q2, q1} <-- u1 + q0 
        xor     Q2, Q2
        mov     Q0, Q1
        add     U1, Q1
        adc     $0, Q2

        C {u2, u1, u0} <-- {u0, up[n-1]} + { p1, p0 } 
        mov     -8(UP, UN, 8), T
        add     %rax, T
        mov     U1, %rax
        mov     U0, U1
        mov     T, U0
        adc     %rdx, U1
        sbb     U2, U2

        mul     DINV            C {t1, t0} <-- u1 dinv
        C u1 <-- u1 - u2 d
        mov     D, T
        and     U2, T
        add     T, U1

        C {q2, q1} <-- {q2, q1} + (t1 + u2)
        sub     U2, %rdx        C No overflow possible
        add     %rdx, Q1
        mov     Q1, (QP, UN, 8)
        adc     Q2, 8(QP, UN, 8)
        jc      L(q_incr)
L(q_incr_done):
        dec     UN
        mov     %rax, Q0
        jnz     L(loop)

L(final):
        neg     D
        xor     Q1, Q1

        mov     U1, %rax
        sub     D, %rax
        cmovc   U1, %rax
        sbb     $-1, Q1

        lea     1(%rax), T
        mul     DINV
        add     U0, %rax
        adc     T, %rdx
        mov     %rdx, T
        imul    D, %rdx
        sub     %rdx, U0
        cmp     U0, %rax
        lea     (U0, D), %rax
        cmovnc  U0, %rax
        sbb     $0, T
        cmp     D, %rax
        jc      L(div_done)
        sub     D, %rax
        add     $1, T
L(div_done):
        add     T, Q0
        mov     Q0, (QP)
        adc     Q1, 8(QP)
        jnc     L(done)

L(final_q_incr):
        addq    $1, 16(QP)
        lea     8(QP), QP
        jc      L(final_q_incr)
L(done):
        pop     %rbp
        pop     %rbx
        pop     %r12
        pop     %r13
        pop     %r14
        pop     %r15
        FUNC_EXIT()
        ret

L(q_incr):
        C Q1 is not live, so use it for indexing
        lea     16(QP, UN, 8), Q1
L(q_incr_loop):
        addq    $1, (Q1)
        jnc     L(q_incr_done)
        lea     8(Q1), Q1
        jmp     L(q_incr_loop)
EPILOGUE()

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to