It turned out the code was a bit slower on k8.

This patch changes that.  With it applied, things take 11 c/l on both
pipelines.  This is also a 2 c/l improvement for piledriver.

I have not tested that this is correct.  If you like the patch, please
consider putting the result in the k8 subdir.

diff -r e9a5ec7f4003 mpn/x86_64/k10/div_qr_1n_pi1.asm
--- a/mpn/x86_64/k10/div_qr_1n_pi1.asm  Tue Oct 22 10:16:16 2013 +0200
+++ b/mpn/x86_64/k10/div_qr_1n_pi1.asm  Tue Oct 22 14:19:48 2013 +0200
@@ -117,15 +117,16 @@
        dec     UN
        mov     U1, %rax
        jz      L(final)
+       mov     $0, R32(Q1)
        
        ALIGN(16)
 
-       C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
-       C At entry, %rax holds an extra copy of U1, and carry holds an extra copy of U2.
+       C Loop is 28 instructions, 30 K8/K10 decoder slots, should run in 10
+       C cycles.  At entry, %rax holds an extra copy of U1, and carry holds
+       C an extra copy of U2.
 L(loop):
        C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
        C Remains to add in B (U1 + c)
-       mov     $0, Q1
        cmovc   DINV, Q1
        mov     U2, Q2
        neg     Q2
@@ -147,13 +148,14 @@
        C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
        adc     U1, Q1
        mov     -8(UP, UN, 8), U0
-       adc     Q2,8(QP, UN, 8)
+       adc     Q2, 8(QP, UN, 8)
        jc      L(q_incr)
 L(q_incr_done):
        add     %rax, U0
        mov     T, %rax
        adc     %rdx, %rax
        mov     Q1, (QP, UN, 8)
+       mov     $0, R32(Q1)
        sbb     U2, U2
        dec     UN
        mov     %rax, U1 

-- 
Torbjörn
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
http://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to