ni...@lysator.liu.se (Niels Möller) writes:

> Torbjorn Granlund <t...@gmplib.org> writes:
>
>> Basically qp = up won't work, but qp = up + k for any positive k will?
>> Does the C code share that property?
[...]
>> I think it would be good to fix that, since it is surely a common usage
>> scenario.
>
> I agree.

I've checked in some bugfixes. Now it seems to work according to try,
and without the no-overlap patch which it turned out I never pushed into
the repo.

> If/when the code is reorganized to delay the q stores (to avoid the "adc
> Q2,8(QP, UN, 8)" instruction),

Here's a first crude version, which seems to work. I could eliminate
other uses of T, and then use that register to the previous Q1 limb
around

I had to change the logic

        lea     (B2md, U0), T
        ...
        and     B2, U2
        add     U2, U0
        cmovnc  U0, T
        adc     U1, Q1
        
Here, B2md holds B^2 (mod d) - d, and U2 is initially a mask. To use U2
as temporary (it dies at the add), I had to change it to

        and     B2, U2
        add     U2, U0
        lea     (B2md, U0), U2
        cmovnc  U0, U2
        adc     U1, Q1

which gives the same result, provided that B2md is adjusted to just hold
-d. This sequence has less friendly dependencies. Maybe it would be
better to stick some of the loop-invariant constants in memory instead.

It's getting late, so I'll not do any benchmarking right now.

Regards,
/Niels

dnl  x86-64 mpn_div_qr_1n_pi1
dnl  -- Divide an mpn number by a normalized single-limb number,
dnl     using a single-limb inverse.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C               c/l
C AMD K8,K9     13
C AMD K10       13
C AMD bull      16.5
C AMD pile      15
C AMD steam      ?
C AMD bobcat    16
C AMD jaguar     ?
C Intel P4      47      poor
C Intel core    19.25
C Intel NHM     18
C Intel SBR     15      poor
C Intel IBR     13
C Intel HWL     11.7
C Intel BWL      ?
C Intel atom    52      very poor
C VIA nano      19


C INPUT Parameters
define(`QP', `%rdi')
define(`UP', `%rsi')
define(`UN_INPUT', `%rdx')
define(`U1', `%rcx')    C Also in %rax
define(`D', `%r8')
define(`DINV', `%r9')

C Invariants
define(`B2', `%rbp')
define(`B2md', `%rbx')

C Variables
define(`UN', `%r8')     C Overlaps D input
define(`T', `%r10')
define(`U0', `%r11')
define(`U2', `%r12')
define(`Q0', `%r13')
define(`Q1', `%r14')
define(`Q2', `%r15')

ABI_SUPPORT(STD64)

        ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_div_qr_1n_pi1)
        FUNC_ENTRY(6)
IFDOS(` mov     56(%rsp), %r8   ')
IFDOS(` mov     64(%rsp), %r9   ')
        dec     UN_INPUT
        jnz     L(first)

        C Just a single 2/1 division.
        C T, U0 are allocated in scratch registers
        lea     1(U1), T
        mov     U1, %rax
        mul     DINV
        mov     (UP), U0
        add     U0, %rax
        adc     T, %rdx
        mov     %rdx, T
        imul    D, %rdx
        sub     %rdx, U0
        cmp     U0, %rax
        lea     (U0, D), %rax
        cmovnc  U0, %rax
        sbb     $0, T
        cmp     D, %rax
        jc      L(single_div_done)
        sub     D, %rax
        add     $1, T
L(single_div_done):
        mov     T, (QP)
        FUNC_EXIT
        ret
L(first):
        C FIXME: Could delay some of these until we enter the loop.
        push    %r15
        push    %r14
        push    %r13
        push    %r12
        push    %rbx
        push    %rbp

        mov     D, B2
        imul    DINV, B2
        neg     B2
C       mov     B2, B2md
C       sub     D, B2md
        mov     D, B2md
        neg     B2md

        C D not needed until final reduction
        push    D
        mov     UN_INPUT, UN    C Clobbers D

        mov     DINV, %rax
        mul     U1
        mov     %rax, Q0
        add     U1, %rdx
        mov     %rdx, Q1

        mov     B2, %rax
        mul     U1
        mov     -8(UP, UN, 8), U0
        mov     (UP, UN, 8), U1
        C mov   Q1, (QP, UN, 8)
        add     %rax, U0
        adc     %rdx, U1
        sbb     U2, U2
        dec     UN
        mov     U1, %rax
        jz      L(final)

        ALIGN(16)

        C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
        C At entry, %rax holds an extra copy of U1
L(loop):
        C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
        C Remains to add in B (U1 + c)
        mov     Q1, T           C Save input value
        mov     DINV, Q1
        mov     U2, Q2
        and     U2, Q1
        neg     Q2
        mul     DINV
        add     %rdx, Q1
        adc     $0, Q2
        add     Q0, Q1
        mov     %rax, Q0
        mov     B2, %rax
        adc     $0, Q2

        C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
        mul     U1
        and     B2, U2
        add     U2, U0
        C Could do an earlier U0 + (B2 - d),
        C if we only had a spare register
        lea     (B2md, U0), U2
        cmovnc  U0, U2

        C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
        adc     U1, Q1
        mov     -8(UP, UN, 8), U0
        adc     T, Q2
        mov     Q2, 8(QP, UN, 8)
        jc      L(q_incr)
L(q_incr_done):
        add     %rax, U0
        mov     U2, %rax
        adc     %rdx, %rax
        C mov   Q1, (QP, UN, 8)
        sbb     U2, U2
        dec     UN
        mov     %rax, U1
        jnz     L(loop)

L(final):
        pop     D

        mov     Q1, T   C Save input value
        mov     U2, Q1
        and     D, U2
        sub     U2, %rax
        neg     Q1

        mov     %rax, U1
        sub     D, %rax
        cmovc   U1, %rax
        sbb     $-1, Q1

        lea     1(%rax), U2
        mul     DINV
        add     U0, %rax
        adc     U2, %rdx
        mov     %rdx, U2
        imul    D, %rdx
        sub     %rdx, U0
        cmp     U0, %rax
        lea     (U0, D), %rax
        cmovnc  U0, %rax
        sbb     $0, U2
        cmp     D, %rax
        jc      L(div_done)
        sub     D, %rax
        add     $1, U2
L(div_done):
        add     U2, Q0
        mov     Q0, (QP)
        adc     T, Q1
        mov     Q1, 8(QP)
        jnc     L(done)
L(final_q_incr):
        addq    $1, 16(QP)
        lea     8(QP), QP
        jc      L(final_q_incr)

L(done):
        pop     %rbp
        pop     %rbx
        pop     %r12
        pop     %r13
        pop     %r14
        pop     %r15
        FUNC_EXIT
        ret

L(q_incr):
        C U1 is not live, so use it for indexing
        lea     16(QP, UN, 8), U1
L(q_incr_loop):
        addq    $1, (U1)
        jnc     L(q_incr_done)
        lea     8(U1), U1
        jmp     L(q_incr_loop)
EPILOGUE()

> Regards,
> /Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
http://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to