Hi

Our current addmul_1 for core2/penryn runs at 4.5/4.6 c/l. Attached is a 
version which runs at the optimal 4.0 c/l; it is a 4-way unrolled, delay-by-2, 
variable-swap method. I haven't put it in trunk yet as I think I can do 
better. I have a 2-way unrolled, delay-by-2, variable-swap method whose inner 
loop also ran at 4.0 c/l, but when I added the proper feed-in and wind-down 
code those good timings disappeared, and despite some fiddling I haven't 
managed to get them back :( . However, there are a few problems with the 
above code, mainly that it is deeply pipelined and therefore runs slower than 
our existing code for small sizes, which makes it of little use for 
mul_basecase.
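
For reference, the operation itself is just {dst,n} += {src,n}*multiplier with 
the final carry limb returned. A plain C version (purely a reference sketch, 
assuming 64-bit limbs and a compiler with unsigned __int128; the names are 
mine, not MPIR's) looks like

typedef unsigned long long limb_t;            /* assuming 64-bit limbs */

/* reference: dst[0..n-1] += src[0..n-1] * v, return the carry limb */
limb_t ref_addmul_1(limb_t *dst, const limb_t *src, long n, limb_t v)
{
    limb_t carry = 0;
    for (long i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)src[i] * v + dst[i] + carry;
        dst[i] = (limb_t)t;                   /* low limb goes back to memory */
        carry  = (limb_t)(t >> 64);           /* high limb feeds the next limb */
    }
    return carry;
}

and everything below is about scheduling exactly this dependency chain.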

I'll explain what I mean by delay by 2. Consider the usual AMD method:

#first limb
mov $0,r2        # clear the register that will collect the high product
mov (src),ax
mul cx           # dx:ax = src[0]*multiplier
add r0,(dst)     # add the running sum into dst
adc ax,r1        # fold in this limb's low product...
adc dx,r2        # ...and high product, right after the mul
#second limb
mov $0,r0
mov 1(src),ax
mul cx           # dx:ax = src[1]*multiplier
add r1,1(dst)
adc ax,r2
adc dx,r0
# third limb.....

Here we add the result of the mul (ax,dx) in the same piece of code that 
processes that limb, and we rely on the out-of-order scheduler to take care of 
it. This works fine on the AMD chips, where the latency of mul is 5c, but on 
the Intel chips the latency is 8c, which is just too far ahead for the 
schedulers. So what we can do is do the adding for 0(src) in the piece of code 
that multiplies 1(src), and as the mul instruction always uses ax,dx we must 
copy them to two temp registers ax1,dx1 to get this:

#first limb
mov $0,r2
mov (src),ax
mul cx           # dx:ax = src[0]*multiplier
add r0,-1(dst)   # memory add for the previous limb
adc ax3,r1       # ax3/dx3 hold the product saved by the
adc dx3,r2       # previous limb's code
mov ax,ax1       # save this limb's product for the
mov dx,dx1       # next limb's code
#second limb
mov $0,r0
mov 1(src),ax
mul cx
add r1,0(dst)    # memory add for 0(src)'s limb
adc ax1,r2       # using the product saved above
adc dx1,r0
mov ax,ax2
mov dx,dx2
# third limb
mov $0,r1
mov 2(src),ax
mul cx
add r2,1(dst)
adc ax2,r0
adc dx2,r1
mov ax,ax3
mov dx,dx3

This is a delay by 1, i.e. the adding for 0(src) is done in the next limb's 
code; a delay by 2 postpones it by one more limb. This leads to a long 
pipeline with quite long and wasteful feed-in/wind-down code. I'm going to try 
a delay by 1, as so far I've only tried delay by 2 (the current code is delay 
by 2); this should reduce the feed-in/wind-down code and hopefully be easier 
to deal with.

The variable swap part: in the AMD code we have a minimum unroll of 3 (without 
introducing extra copy instructions), and the same holds for the delay by 1 
code above. By swapping the dst/src of the add instruction we can get a 
minimum of a 2-way unroll, i.e.:

#first limb
mov (src),ax
mul cx           # dx:ax = src[0]*multiplier
add ax1,-1(dst)  # saved low product added straight into memory
adc dx1,ax2      # saved high product folded into the other saved pair
adc $0,dx2       # carry on into that pair's high word
mov ax,ax1       # the just-consumed pair is reused to hold
mov dx,dx1       # this limb's product
#second limb
mov 1(src),ax
mul cx
add ax2,0(dst)   # the roles of the two pairs have swapped
adc dx2,ax1
adc $0,dx1
mov ax,ax2
mov dx,dx2

This also saves on the number of registers used, but it does constrain the 
re-ordering possibilities for the schedulers; again I will have to try it out 
to see what is best. The mov instructions also seem to be best when 
implemented as lea's. On the AMD chips I could understand this, as they use 
the AGU units as opposed to the ALU units, but the Intel chips don't do this 
as far as I can tell.

Jason


dnl  mpn_addmul_1

dnl  Copyright 2010 The Code Cavern

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

ASM_START()
PROLOGUE(mpn_addmul_1)
#mov (%rsi),%rax   #this speeds up small n , but slows down main loop????
# rdi = dst, rsi = src, rdx = n, rcx = multiplier
cmp $2,%rdx
jb one
jz two
# bias the pointers so that -16(%rsi,%rbx,8) is src[0] when rbx = 6-n
lea 16-48(%rsi,%rdx,8),%rsi
lea 16-48(%rdi,%rdx,8),%rdi
push %r13
push %r12
push %rbx
mov $6,%rbx
sub %rdx,%rbx           # rbx = 6-n, counted up towards zero in the loop
# feed-in: compute the first two products and load the first two dst limbs;
# nothing is stored until the pipeline is full
        mov -16(%rsi,%rbx,8),%rax
        mul %rcx
        lea (%rax),%r8          # lea used as a plain register copy
        mov 8-16(%rsi,%rbx,8),%rax
        lea (%rdx),%r9
        mul %rcx
        lea (%rax),%r11
        mov 16-16(%rsi,%rbx,8),%rax
        mov -16(%rdi,%rbx,8),%r10
        mov 8-16(%rdi,%rbx,8),%r13
        lea (%rdx),%r12
cmp $0,%rbx
jge skiplp              # n <= 6: go straight to the wind-down cases
ALIGN(16)
# main loop: 4-way unrolled, delay by 2; each product is added into dst
# two limbs after its mul
lp:
        mul %rcx
        add %r8,%r10
        lea (%rax),%r8
        mov 24-16(%rsi,%rbx,8),%rax
        adc %r9,%r11
        mov %r10,-16(%rdi,%rbx,8)
        adc $0,%r12
        lea (%rdx),%r9
        mul %rcx
        add %r11,%r13
        lea (%rax),%r11
        mov 32-16(%rsi,%rbx,8),%rax
        adc %r12,%r8
        mov %r13,8-16(%rdi,%rbx,8)
        adc $0,%r9
        lea (%rdx),%r12
        mov 16-16(%rdi,%rbx,8),%r10
        mul %rcx
        add %r8,%r10
        lea (%rax),%r8
        mov 40-16(%rsi,%rbx,8),%rax
        adc %r9,%r11
        mov %r10,16-16(%rdi,%rbx,8)
        mov 24-16(%rdi,%rbx,8),%r13
        adc $0,%r12
        lea (%rdx),%r9
        mul %rcx
        add %r11,%r13
        lea (%rax),%r11
        adc %r12,%r8
        mov %r13,24-16(%rdi,%rbx,8)
        mov 48-16(%rsi,%rbx,8),%rax
        mov 32-16(%rdi,%rbx,8),%r10
        mov 40-16(%rdi,%rbx,8),%r13
        adc $0,%r9
        lea (%rdx),%r12
        add $4,%rbx
        jnc lp
ALIGN(16)
skiplp:
# wind-down: dispatch on rbx, which is in 0..3 here
mul %rcx
cmp $2,%rbx
ja case0        #// rbx=3
je case1        #// rbx=2
jp case2        #// rbx=1 (the parity of rbx-2 separates 1 from 0)
case3:          #// rbx=0
# 3 more loads
        add %r8,%r10
        lea (%rax),%r8
        mov 8(%rsi),%rax
        adc %r9,%r11
        mov %r10,-16(%rdi)
        adc $0,%r12
        lea (%rdx),%r9
        mul %rcx
        add %r11,%r13
        lea (%rax),%r11
        mov 16(%rsi),%rax
        adc %r12,%r8
        mov %r13,-8(%rdi)
        adc $0,%r9
        lea (%rdx),%r12
        mov (%rdi),%r10
        mul %rcx
        mov 8(%rdi),%r13
case1:  add %r8,%r10
        lea (%rax),%r8
        mov 24(%rsi),%rax
        adc %r9,%r11
        mov %r10,(%rdi)
        adc $0,%r12
        lea (%rdx),%r9
        mul %rcx
        add %r11,%r13
        lea (%rax),%r11
        adc %r12,%r8
        mov %r13,8(%rdi)
        mov 16(%rdi),%r10
        mov 24(%rdi),%r13
        adc $0,%r9
        lea (%rdx),%r12
        add %r8,%r10
        adc %r9,%r11
        mov %r10,16(%rdi)
        adc $0,%r12
        add %r11,%r13
        adc $0,%r12
        mov %r13,24(%rdi)
        mov %r12,%rax
        pop %rbx
        pop %r12
        pop %r13
        ret
case2:
# 2 more loads                  //rbx=1
        add %r8,%r10
        lea (%rax),%r8
        mov 16(%rsi),%rax
        adc %r9,%r11
        mov %r10,-8(%rdi)
        adc $0,%r12
        lea (%rdx),%r9
        mul %rcx
        add %r11,%r13
        lea (%rax),%r11
        mov 24(%rsi),%rax
        adc %r12,%r8
        mov %r13,(%rdi)
        adc $0,%r9
        lea (%rdx),%r12
        mov 8(%rdi),%r10
        mul %rcx
        mov 16(%rdi),%r13
case0:  add %r8,%r10
        lea (%rax),%r8
        adc %r9,%r11
        mov %r10,8(%rdi)
        adc $0,%r12
        lea (%rdx),%r9
        add %r11,%r13
        adc %r12,%r8
        mov %r13,16(%rdi)
        mov 24(%rdi),%r10
        adc $0,%r9
        add %r8,%r10
        adc $0,%r9
        mov %r10,24(%rdi)
        mov %r9,%rax
        pop %rbx
        pop %r12
        pop %r13
        ret
one:
        mov (%rsi),%rax
        mul %rcx
        add %rax,(%rdi)
        adc $0,%rdx
        mov %rdx,%rax
        ret
two:    
        mov (%rsi),%rax
        mul %rcx
        mov %rax,%r8
        mov 8(%rsi),%rax
        mov %rdx,%r9
        mov $0,%r10d
        mul %rcx
        add %r8,(%rdi)
        adc %rax,%r9
        adc %rdx,%r10
        add %r9,8(%rdi)
        adc $0,%r10
        mov %r10,%rax
        ret
EPILOGUE()
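
A quick way to check that feed-in/wind-down changes haven't broken anything is 
to compare against a limb-by-limb C reference for every small size. A rough 
harness along these lines (assuming 64-bit limbs and the usual mpn_addmul_1 
prototype from mpir.h) would do:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpir.h>                              /* mpn_addmul_1, mp_limb_t */

int main(void)
{
    for (long n = 1; n <= 20; n++) {
        mp_limb_t src[20], got[20], want[20], v, cg, cw = 0;

        for (long i = 0; i < n; i++) {
            src[i] = ((mp_limb_t)rand() << 32) ^ (mp_limb_t)rand();
            got[i] = ((mp_limb_t)rand() << 32) ^ (mp_limb_t)rand();
        }
        v = ((mp_limb_t)rand() << 32) ^ (mp_limb_t)rand();
        memcpy(want, got, n * sizeof(mp_limb_t));

        cg = mpn_addmul_1(got, src, n, v);      /* routine under test */

        for (long i = 0; i < n; i++) {          /* limb-by-limb reference */
            unsigned __int128 t = (unsigned __int128)src[i] * v + want[i] + cw;
            want[i] = (mp_limb_t)t;
            cw      = (mp_limb_t)(t >> 64);
        }

        if (cg != cw || memcmp(got, want, n * sizeof(mp_limb_t)) != 0)
            printf("mismatch at n=%ld\n", n);
    }
    return 0;
}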
