On Friday 28 January 2011 13:43:56 Jason wrote:
> On Friday 28 January 2011 11:02:07 Jason wrote:
> > On Friday 28 January 2011 10:55:06 jason wrote:
> > > Hi
> > > 
> > > In trunk is a new AMD addmul_1; this runs at the same speed as the
> > > old one, but is smaller. The old code was 450 bytes and the new code
> > > is 407 bytes. I've not tested it on a K10 yet as skynet is down, but
> > > from what I think I know of the AMD chips it must run at the same
> > > speed. The Windows conversion is only worth doing if the alignments/
> > > spacing are placed carefully, i.e. the loop starts/ends on a 16-byte
> > > boundary, and jmp destinations are close enough, as determined by
> > > testing :)
> > > More to follow.
> > > 
> > > Jason
> > 
> > Note: The old addmul_1 also had an alternate entry point for inclsh_n. I
> > don't know why we did this: if the fastest inclsh really is addmul_1 then
> > we should use a macro, and if not (e.g. on Core2) then we should use an
> > alternate entry point (or a new fn).
> > Note: The 450 bytes count above did not include the inclsh_n part
> > 
> > Jason
> 
> Attached is an AMD 4-way addmul_1. The inner loop is the same, but instead
> of four cases to handle the "leftovers" we jump into the loop; this saves
> quite a bit of code, it's 278 bytes. The asymptotic speed is the same but
> the overheads are a bit higher. I have not put this in trunk.
> 
> Jason

I should have also said: I expect I can quite easily shave some cycles off it,
and some space.

Attached are 3 variants of an AMD addmul_1 with a 7-way unroll. This runs at
17/7 = 2.428 c/l (the 4-way is 2.5 c/l), a 2.9% improvement. For the reasons
below I don't regard this as practical, so you will notice that no attempt has
been made to optimize it or clean it up.
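(Checking the figure: 2.5 - 17/7 = 1/14 c/l saved per limb, and
(1/14)/2.5 = 1/35, which is about 2.9%.)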

k8_addmul_1_7way.asm handles the leftovers in the usual way, with 7 separate
cases; the problem is code size.

k8_addmul_1_7way_jmpepi.asm uses a small 7-entry jump table to branch to the 7
cases (as opposed to the above, which uses a string of cmp's and Jcc's). Code
size is still a problem, and the jump table should be in a separate segment.

k8_addmul_1_7way_jmpin.asm handles the leftovers by jumping into the middle of
the loop. This saves a lot of space, but we need to calculate size%7. That is
much easier than a general division: we could do a Hensel div (i.e. 10 cycles
max), or some shifting if we assume the data fits in L1 cache and limit the
size to 4096.  For now I've just done a standard slow division, and the
feed-in cases are poor.
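
One cheap way to get size%7 without the div, sketched in C (rem7 is a made-up
name, nothing in MPIR): 613566757 is ceil(2^32/7), and the reciprocal multiply
is exact for any n below 2^32/3, which covers any limb count we care about.

    #include <stdint.h>

    /* n mod 7 without a hardware div: q = floor(n/7) by a reciprocal
       multiply, then subtract.  Exact for all n < 2^32/3. */
    static uint64_t rem7(uint64_t n)
    {
        uint64_t q = (n * 613566757u) >> 32;   /* floor(n / 7) */
        return n - 7 * q;                      /* n mod 7      */
    }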

The inflexibility of the code sequence limits the scheduler and the pick
hardware, so some tricks had to be used to help the chip out :)

It may be possible to improve this speed (if the tricks are good enough) by
going to a larger unroll: 10-way (2.4 c/l) is possible, and 16-way (2.375 c/l)
is the next step, but there are better ways.
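(That is 24 cycles per 10-limb block and 38 cycles per 16-limb block, versus
17 per 7 limbs here.)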

Jason

-- 
You received this message because you are subscribed to the Google Groups 
"mpir-devel" group.
To post to this group, send email to mpir-devel@googlegroups.com.
To unsubscribe from this group, send email to 
mpir-devel+unsubscr...@googlegroups.com.
For more options, visit this group at 
http://groups.google.com/group/mpir-devel?hl=en.
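dnl  Attachment: k8_addmul_1_7way.asm (leftovers handled by a chain of cmp/jcc cases)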

dnl  mpn_addmul_1

dnl  Copyright 2011 The Code Cavern

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

ASM_START()
PROLOGUE(mpn_addmul_1)
# doesn't like xor's in the loop ?????????? on K10 cuda
# swap reg's around to save mov
# use red zone
# space out loop and feedin to be multiple of 16 bytes
mov (%rsi),%rax
cmp $1,%rdx
je one	
push %rbx
push %rbp
push %r12
push %r13
push %r14	
mov %rcx,%rbp
xor %rbx,%rbx
mov $8,%rcx
lea -64(%rsi,%rdx,8),%rsi
lea -64(%rdi,%rdx,8),%rdi
sub %rdx,%rcx
xor %r10,%r10
xor %r11,%r11
mov (%rsi,%rcx,8),%rax
mul %rbp
mov %rax,%r8
mov %rdx,%r9
mov 8(%rsi,%rcx,8),%rax
cmp $0,%rcx
jge skiplp
.align 16
lp:	mov $0,%r12
	mul %rbp
	add %r8,(%rdi,%rcx,8)
	adc %rax,%r9
	adc %rdx,%r10
	mov 16(%rsi,%rcx,8),%rax
	mul %rbp
	add %r9,8(%rdi,%rcx,8)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 24(%rsi,%rcx,8),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,16(%rdi,%rcx,8)
	adc %rax,%r11
	adc %rdx,%r12
	mov 32(%rsi,%rcx,8),%rax
	mul %rbp
	add %r11,24(%rdi,%rcx,8)
	mov $0,%r9
	adc %rax,%r12
	mov $0,%r10
	mov 40(%rsi,%rcx,8),%rax
	adc %rdx,%r13
	mov $0,%r11
	mul %rbp
	add %r12,32(%rdi,%rcx,8)
	adc %rax,%r13
	adc %rdx,%r14
	mov 48(%rsi,%rcx,8),%rax
	mul %rbp
	add %r13,40(%rdi,%rcx,8)
	adc %rax,%r14
	mov 56(%rsi,%rcx,8),%rax
	adc %rdx,%r8
	mul %rbp
	add %r14,48(%rdi,%rcx,8)
	adc %rax,%r8
	adc %rdx,%r9
	add $7,%rcx
	mov 8(%rsi,%rcx,8),%rax
	jnc lp
#.align 16
skiplp:
	mov $0,%r12
	mul %rbp
	add %r8,(%rdi,%rcx,8)
	adc %rax,%r9
	adc %rdx,%r10
cmp $5,%rcx
ja case0
jz case1
cmp $3,%rcx
ja case2
jz case3
cmp $1,%rcx
ja case4
jz case5
case6:	mov 16(%rsi),%rax
	mul %rbp
	add %r9,8(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 24(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,16(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	mov 32(%rsi),%rax
	mul %rbp
	add %r11,24(%rdi)
	mov $0,%r9
	adc %rax,%r12
	mov $0,%r10
	mov 40(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r11
	mul %rbp
	add %r12,32(%rdi)
	adc %rax,%r13
	adc %rdx,%r14
	mov 48(%rsi),%rax
	mul %rbp
	add %r13,40(%rdi)
	adc %rax,%r14
	mov 56(%rsi),%rax
	adc %rdx,%r8
	mul %rbp
	add %r14,48(%rdi)
	adc %rax,%r8
	adc %rdx,%r9
	add %r8,56(%rdi)
	adc $0,%r9
	mov %r9,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case5:	mov 24(%rsi),%rax
	mul %rbp
	add %r9,16(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 32(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,24(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	mov 40(%rsi),%rax
	mul %rbp
	add %r11,32(%rdi)
	mov $0,%r9
	adc %rax,%r12
	mov $0,%r10
	mov 48(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r11
	mul %rbp
	add %r12,40(%rdi)
	adc %rax,%r13
	adc %rdx,%r14
	mov 56(%rsi),%rax
	mul %rbp
	add %r13,48(%rdi)
	adc %rax,%r14
	adc %rdx,%r8
	add %r14,56(%rdi)
	adc $0,%r8
	mov %r8,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case4:	mov 32(%rsi),%rax
	mul %rbp
	add %r9,24(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 40(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,32(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	mov 48(%rsi),%rax
	mul %rbp
	add %r11,40(%rdi)
	mov $0,%r9
	adc %rax,%r12
	mov $0,%r10
	mov 56(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r11
	mul %rbp
	add %r12,48(%rdi)
	adc %rax,%r13
	adc %rdx,%r14
	add %r13,56(%rdi)
	adc $0,%r14
	mov %r14,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case3:	mov 40(%rsi),%rax
	mul %rbp
	add %r9,32(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 48(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,40(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	mov 56(%rsi),%rax
	mul %rbp
	add %r11,48(%rdi)
	adc %rax,%r12
	adc %rdx,%r13
	add %r12,56(%rdi)
	adc $0,%r13
	mov %r13,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case2:	mov 48(%rsi),%rax
	mul %rbp
	add %r9,40(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 56(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,48(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	add %r11,56(%rdi)
	adc $0,%r12
	mov %r12,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case1:	mov 56(%rsi),%rax
	mul %rbp
	add %r9,48(%rdi)
	adc %rax,%r10
	adc %rdx,%r11
	add %r10,56(%rdi)
	adc $0,%r11
	mov %r11,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case0:	add %r9,56(%rdi)
	adc $0,%r10
	mov %r10,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
one:	mul %rcx
	add %rax,(%rdi)
	adc $0,%rdx
	mov %rdx,%rax
	ret
EPILOGUE()
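
dnl  Attachment: k8_addmul_1_7way_jmpepi.asm (leftovers dispatched through a 7-entry jump table)
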
dnl  mpn_addmul_1

dnl  Copyright 2011 The Code Cavern

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

ASM_START()
PROLOGUE(mpn_addmul_1)
# doesn't like xor's in the loop ?????????? on K10 cuda
# swap reg's around to save mov
# use red zone
# space out loop and feedin to be multiple of 16 bytes
mov (%rsi),%rax
cmp $1,%rdx
je one	
push %rbx
push %rbp
push %r12
push %r13
push %r14	
mov %rcx,%rbp
xor %rbx,%rbx
mov $8,%rcx
lea -64(%rsi,%rdx,8),%rsi
lea -64(%rdi,%rdx,8),%rdi
sub %rdx,%rcx
xor %r10,%r10
xor %r11,%r11
mov (%rsi,%rcx,8),%rax
mul %rbp
mov %rax,%r8
mov %rdx,%r9
mov 8(%rsi,%rcx,8),%rax
cmp $0,%rcx
jge skiplp
.align 16
lp:	mov $0,%r12
	mul %rbp
	add %r8,(%rdi,%rcx,8)
	adc %rax,%r9
	adc %rdx,%r10
	mov 16(%rsi,%rcx,8),%rax
	mul %rbp
	add %r9,8(%rdi,%rcx,8)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 24(%rsi,%rcx,8),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,16(%rdi,%rcx,8)
	adc %rax,%r11
	adc %rdx,%r12
	mov 32(%rsi,%rcx,8),%rax
	mul %rbp
	add %r11,24(%rdi,%rcx,8)
	mov $0,%r9
	adc %rax,%r12
	mov $0,%r10
	mov 40(%rsi,%rcx,8),%rax
	adc %rdx,%r13
	mov $0,%r11
	mul %rbp
	add %r12,32(%rdi,%rcx,8)
	adc %rax,%r13
	adc %rdx,%r14
	mov 48(%rsi,%rcx,8),%rax
	mul %rbp
	add %r13,40(%rdi,%rcx,8)
	adc %rax,%r14
	mov 56(%rsi,%rcx,8),%rax
	adc %rdx,%r8
	mul %rbp
	add %r14,48(%rdi,%rcx,8)
	adc %rax,%r8
	adc %rdx,%r9
	add $7,%rcx
	mov 8(%rsi,%rcx,8),%rax
	jnc lp
#.align 16
skiplp:
	mov $0,%r12
	mul %rbp
	add %r8,(%rdi,%rcx,8)
	adc %rax,%r9
	adc %rdx,%r10
# dispatch on the leftover count in rcx: the table holds caseN-table
# offsets, so the table base plus the entry gives the target address
lea table(%rip),%rax
add (%rax,%rcx,8),%rax
jmp *%rax
case6:	mov 16(%rsi),%rax
	mul %rbp
	add %r9,8(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 24(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,16(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	mov 32(%rsi),%rax
	mul %rbp
	add %r11,24(%rdi)
	mov $0,%r9
	adc %rax,%r12
	mov $0,%r10
	mov 40(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r11
	mul %rbp
	add %r12,32(%rdi)
	adc %rax,%r13
	adc %rdx,%r14
	mov 48(%rsi),%rax
	mul %rbp
	add %r13,40(%rdi)
	adc %rax,%r14
	mov 56(%rsi),%rax
	adc %rdx,%r8
	mul %rbp
	add %r14,48(%rdi)
	adc %rax,%r8
	adc %rdx,%r9
	add %r8,56(%rdi)
	adc $0,%r9
	mov %r9,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case5:	mov 24(%rsi),%rax
	mul %rbp
	add %r9,16(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 32(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,24(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	mov 40(%rsi),%rax
	mul %rbp
	add %r11,32(%rdi)
	mov $0,%r9
	adc %rax,%r12
	mov $0,%r10
	mov 48(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r11
	mul %rbp
	add %r12,40(%rdi)
	adc %rax,%r13
	adc %rdx,%r14
	mov 56(%rsi),%rax
	mul %rbp
	add %r13,48(%rdi)
	adc %rax,%r14
	adc %rdx,%r8
	add %r14,56(%rdi)
	adc $0,%r8
	mov %r8,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case4:	mov 32(%rsi),%rax
	mul %rbp
	add %r9,24(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 40(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,32(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	mov 48(%rsi),%rax
	mul %rbp
	add %r11,40(%rdi)
	mov $0,%r9
	adc %rax,%r12
	mov $0,%r10
	mov 56(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r11
	mul %rbp
	add %r12,48(%rdi)
	adc %rax,%r13
	adc %rdx,%r14
	add %r13,56(%rdi)
	adc $0,%r14
	mov %r14,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case3:	mov 40(%rsi),%rax
	mul %rbp
	add %r9,32(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 48(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,40(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	mov 56(%rsi),%rax
	mul %rbp
	add %r11,48(%rdi)
	adc %rax,%r12
	adc %rdx,%r13
	add %r12,56(%rdi)
	adc $0,%r13
	mov %r13,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case2:	mov 48(%rsi),%rax
	mul %rbp
	add %r9,40(%rdi)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 56(%rsi),%rax
	mov $0,%r8
	adc %rdx,%r11
	mul %rbp
	add %r10,48(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
	add %r11,56(%rdi)
	adc $0,%r12
	mov %r12,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case1:	mov 56(%rsi),%rax
	mul %rbp
	add %r9,48(%rdi)
	adc %rax,%r10
	adc %rdx,%r11
	add %r10,56(%rdi)
	adc $0,%r11
	mov %r11,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
case0:	add %r9,56(%rdi)
	adc $0,%r10
	mov %r10,%rax
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	ret
.align 16
one:	mul %rcx
	add %rax,(%rdi)
	adc $0,%rdx
	mov %rdx,%rax
	ret
table:
.quad case6-table	# rcx=0
.quad case5-table	# rcx=1
.quad case4-table
.quad case3-table
.quad case2-table
.quad case1-table
.quad case0-table
EPILOGUE()
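
dnl  Attachment: k8_addmul_1_7way_jmpin.asm (leftovers handled by jumping into the loop)
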
dnl  mpn_addmul_1

dnl  Copyright 2011 The Code Cavern

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

ASM_START()
PROLOGUE(mpn_addmul_1)
# doesn't like xor's in the loop ?????????? on K10 cuda
# swap reg's around to save mov
# use red zone
# space out loop and feedin to be multiple of 16 bytes
# simplified feedin code
# replace div instruction with shift/add or 2x mul
mov (%rsi),%rax
cmp $2,%rdx
je two
jl one	
push %rbx
push %rbp
push %r12
push %r13
push %r14	
mov %rcx,%rbp
xor %rbx,%rbx

sub $3,%rdx

mov %rdx,%r14
# put the remainder (size-3) mod 7 in r8 (standard slow div for now)
mov %rdx,%rax
mov $0,%rdx
mov $7,%r9
div %r9
mov %rdx,%r8
mov %r14,%rdx


mov %r8,%rcx
sub $7,%rcx
sub %rdx,%rcx
lea 8(%rsi,%rdx,8),%rsi
lea 8(%rdi,%rdx,8),%rdi

# dispatch on the remainder in r8: the table holds caseN-table
# offsets, so the table base plus the entry gives the feed-in target
lea table(%rip),%rax
add (%rax,%r8,8),%rax
jmp *%rax

case3:	mov $0,%r8
	mov $0,%r9
	mov $0,%r10
	mov $0,%r11
	mov $0,%r12
	mov $0,%r13
	mov $0,%r14
	mov 48(%rsi,%rcx,8),%rax	# first load 
	mul %rbp
	mov %rax,%r14
	mov %rdx,%r8
	mov 56(%rsi,%rcx,8),%rax
	jmp in3
case4:	mov $0,%r8
	mov $0,%r9
	mov $0,%r10
	mov $0,%r11
	mov $0,%r12
	mov $0,%r13
	mov $0,%r14
	mov 40(%rsi,%rcx,8),%rax
	mul %rbp
	mov %rax,%r13
	mov %rdx,%r14
	mov 48(%rsi,%rcx,8),%rax
	jmp in4
case5:	mov $0,%r8
	mov $0,%r9
	mov $0,%r10
	mov $0,%r11
	mov $0,%r12
	mov $0,%r13
	mov $0,%r14
	mov 32(%rsi,%rcx,8),%rax
	mul %rbp
	mov %rax,%r12
	mov %rdx,%r13
	mov 40(%rsi,%rcx,8),%rax
	jmp in5
case6:	mov $0,%r8
	mov $0,%r9
	mov $0,%r10
	mov $0,%r11
	mov $0,%r12
	mov $0,%r13
	mov $0,%r14
	mov 24(%rsi,%rcx,8),%rax
	mul %rbp
	mov %rax,%r11
	mov %rdx,%r12
	mov 32(%rsi,%rcx,8),%rax
	jmp in6
case7:	mov $0,%r8
	mov $0,%r9
	mov $0,%r10
	mov $0,%r11
	mov $0,%r12
	mov $0,%r13
	mov $0,%r14
	mov 16(%rsi,%rcx,8),%rax
	mul %rbp
	mov %rax,%r10
	mov %rdx,%r11
	mov 24(%rsi,%rcx,8),%rax
	jmp in7
case8:	
	#mov $0,%r8
	#mov $0,%r9
	#mov $0,%r10
	mov $0,%r11
	mov $0,%r12
	#mov $0,%r13
	#mov $0,%r14
	mov 8(%rsi,%rcx,8),%rax
	mul %rbp
	mov %rax,%r9
	mov %rdx,%r10
	mov 16(%rsi,%rcx,8),%rax
	jmp in8
case9:	
	#mov $0,%r8
	#mov $0,%r9
	mov $0,%r10
	mov $0,%r11
	#mov $0,%r12
	#mov $0,%r13
	#mov $0,%r14
	mov (%rsi,%rcx,8),%rax
	mul %rbp
	mov %rax,%r8
	mov %rdx,%r9
	mov 8(%rsi,%rcx,8),%rax
	jmp in9
.align 16
# would be nice if these jump-in points were on 16-byte boundaries
lp:
in9:	mov $0,%r12
	mul %rbp
	add %r8,(%rdi,%rcx,8)
	adc %rax,%r9
	adc %rdx,%r10
	mov 16(%rsi,%rcx,8),%rax
in8:	mul %rbp
	add %r9,8(%rdi,%rcx,8)
	lea (%rbx,%rbx,2),%r13
	adc %rax,%r10
	mov $0,%r14
	mov 24(%rsi,%rcx,8),%rax
	mov $0,%r8
	adc %rdx,%r11
in7:	mul %rbp
	add %r10,16(%rdi,%rcx,8)
	adc %rax,%r11
	adc %rdx,%r12
	mov 32(%rsi,%rcx,8),%rax
in6:	mul %rbp
	add %r11,24(%rdi,%rcx,8)
	mov $0,%r9
	adc %rax,%r12
	mov $0,%r10
	mov 40(%rsi,%rcx,8),%rax
	adc %rdx,%r13
	mov $0,%r11
in5:	mul %rbp
	add %r12,32(%rdi,%rcx,8)
	adc %rax,%r13
	adc %rdx,%r14
	mov 48(%rsi,%rcx,8),%rax
in4:	mul %rbp
	add %r13,40(%rdi,%rcx,8)
	adc %rax,%r14
	mov 56(%rsi,%rcx,8),%rax
	adc %rdx,%r8
in3:	mul %rbp
	add %r14,48(%rdi,%rcx,8)
	adc %rax,%r8
	adc %rdx,%r9
	add $7,%rcx
	mov 8(%rsi,%rcx,8),%rax		# last load
	jnc lp
mov $0,%r12	# remove this if it does not upset the pick hardware
mul %rbp
add %r8,(%rdi,%rcx,8)
adc %rax,%r9
adc %rdx,%r10
add %r9,8(%rdi,%rcx,8)
adc $0,%r10
mov %r10,%rax
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
ret
.align 16
one:	mul %rcx
	add %rax,(%rdi)
	adc $0,%rdx
	mov %rdx,%rax
	ret
.align 16
two:	mul %rcx
	add %rax,(%rdi)
	mov $0,%r8
	adc %rdx,%r8
	mov 8(%rsi),%rax
	mul %rcx
	add %r8,%rax
	adc $0,%rdx
	add %rax,8(%rdi)
	adc $0,%rdx
	mov %rdx,%rax
	ret	
table:
.quad case3-table       # r8 = (size-3) mod 7 = 0
.quad case4-table       # r8 = 1
.quad case5-table
.quad case6-table
.quad case7-table
.quad case8-table
.quad case9-table
EPILOGUE()
