On Friday 28 January 2011 11:02:07 Jason wrote:
> On Friday 28 January 2011 10:55:06 jason wrote:
> > Hi
> > 
> > In trunk is a new AMD addmul_1. It runs at the same speed as the
> > old one, but is smaller: the old code was 450 bytes and the new code
> > is 407 bytes. I've not tested it on a K10 yet as skynet is down, but
> > from what I think I know of the AMD chips it must run at the same
> > speed. The Windows conversion is only worth doing if the alignments
> > and spacing are placed carefully, i.e. the loop starts/ends on a
> > 16-byte boundary and the jmp destinations are close enough, as
> > determined by testing :)
> > More to follow.
> > 
> > Jason
> 
> Note: The old addmul_1 also had an alternate entry point for inclsh_n. I
> don't know why we did this: if the fastest inclsh is really addmul_1 then
> we should use a macro, and if not (i.e. Core 2) then we should use an
> alternate entry point (or a new function).
> Note: The 450-byte count above did not include the inclsh_n part.
> 
> Jason

Attached is an AMD 4-way addmul_1. The inner loop is the same, but instead of
four cases to handle the "leftovers" we jump into the loop. This saves quite
a bit of code: it is 278 bytes. The asymptotic speed is the same, but the
overheads are a bit higher. I have not put this in trunk.

Jason

-- 
You received this message because you are subscribed to the Google Groups 
"mpir-devel" group.
To post to this group, send email to mpir-devel@googlegroups.com.
To unsubscribe from this group, send email to 
mpir-devel+unsubscr...@googlegroups.com.
For more options, visit this group at 
http://groups.google.com/group/mpir-devel?hl=en.

dnl  mpn_addmul_1

dnl  Copyright 2011 The Code Cavern

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

ASM_START()
# mpn_addmul_1 -- x86-64, AT&T syntax.
#
# Arguments as read by the code below:
#   rdi = destination limb pointer (each limb is read, added to, written back)
#   rsi = source limb pointer
#   rdx = limb count n
#   rcx = multiplier limb
# Result:
#   dst[i] += src[i] * rcx for i = 0 .. n-1 with carry propagation;
#   the carry out of the top limb is returned in rax.
#
# NOTE(review): these register roles match the System V AMD64 argument
# order for the usual GMP/MPIR mpn_addmul_1 prototype -- confirm against
# the library headers.
# NOTE(review): n >= 1 is assumed; any n < 2 (signed) takes the one-limb
# path, which always touches src[0] and dst[0].
#
# Scratch registers: rax, rdx, r8, r9, r10, r11.  r12 is also used on the
# n >= 3 path and is saved with push and restored with pop.
#
# Structure: n == 1 and n == 2 have straight-line paths.  For n >= 3 a
# four-way unrolled loop is entered at one of four points chosen by
# (n+1) mod 4, so no separate leftover code is needed.  The last two limbs
# are always finished by the tail after the loop.
PROLOGUE(mpn_addmul_1)
mov (%rsi),%rax		# rax = src[0], needed by every path
cmp $2,%rdx
jl one			# n < 2 (signed compare)
je two			# n == 2
# n >= 3 from here on
push %r12
inc %rdx		# rdx = n+1
mov %rdx,%r8
and $3,%r8		# r8 = (n+1) mod 4, selects the loop entry point
lea -8-16(%rsi,%rdx,8),%rsi	# rsi = src + 8*(n-2)
lea -8-16(%rdi,%rdx,8),%rdi	# rdi = dst + 8*(n-2)
mov %r8,%r11
sub %rdx,%r11		# r11 = r8 - (n+1): negative multiple of 4, counts up to 0
mul %rcx		# rdx:rax = src[0] * rcx
cmp $2,%r8		# flags from this compare drive the three branches below
ja case6		# r8 == 3
je case5		# r8 == 2
jp case4		# r8 == 1: low byte of 1-2 is 0xFF (even parity, PF set);
			# r8 == 0 gives 0xFE (odd parity) and falls through
# r8 == 0: pending low limb in r12, pending carry in r8, r9 cleared
case3:	xor %r9,%r9
	mov %rax,%r12
	mov %rdx,%r8
	jmp in3
# r8 == 3: pending low limb in r8, pending carry in r9.
# CF is clear after the xor, so the jnc that follows in6 branches to lp.
case6:	xor %r12,%r12
	mov %rax,%r8
	mov %rdx,%r9
	jmp in6
# r8 == 2: pending low limb in r9, pending carry in r10
case5:	mov %rax,%r9
	mov %rdx,%r10
	jmp in5
# r8 == 1: pending low limb in r10, pending carry in r12
case4:	mov %rax,%r10
	mov %rdx,%r12
	jmp in4
# n == 1 (or n < 1, see the note at the top): dst[0] += src[0] * rcx,
# return the carry.  No push has been done on this path.
one:	mul %rcx
	add %rax,(%rdi)
	adc $0,%rdx
	mov %rdx,%rax
	ret
# Main loop, four limbs per pass.  Each stage adds the pending low limb to
# memory and folds the new product rdx:rax plus CF into the next two
# registers; the register roles rotate through r8, r9, r10, r12.
ALIGN(16)
lp:	xor %r10,%r10
	mul %rcx
	add %r8,(%rdi,%r11,8)
	adc %rax,%r9
	adc %rdx,%r10
in5:	mov 16(%rsi,%r11,8),%rax
	mul %rcx
	add %r9,8(%rdi,%r11,8)
	adc %rax,%r10
	mov $0,%r12d		# mov leaves flags alone, so CF from the adc above
				# still reaches the adc below
	adc %rdx,%r12
in4:	mov 24(%rsi,%r11,8),%rax
	xor %r8,%r8
	xor %r9,%r9
	mul %rcx
	add %r10,16(%rdi,%r11,8)
	adc %rax,%r12
	adc %rdx,%r8
in3:	mov 32(%rsi,%r11,8),%rax
 	mul %rcx
	add %r12,24(%rdi,%r11,8)
	adc %rax,%r8
	adc %rdx,%r9
	add $4,%r11		# sets CF exactly when r11 wraps to 0
in6:	mov 8(%rsi,%r11,8),%rax	# load next source limb; mov keeps CF for the jnc
	jnc lp
	# r11==0 here use for the adc $0,... below to save space
	# or put the pop here (on K10 only) and use r11 instead of r10
	# Tail: rax = src[n-1], r8 = pending low limb for dst[n-2], r9 = pending carry
	xor %r10,%r10
	mul %rcx
	add %r8,(%rdi)		# dst[n-2]
	adc %rax,%r9
	adc %rdx,%r10
	#mov 16(%rsi,%r11,8),%rax
	#mul %rcx
	add %r9,8(%rdi)		# dst[n-1]
	adc $0,%r10		# r10 = carry out of the top limb
	#mov $0,%r12d
	#adc $0,%r12
	#mov 24(%rsi,%r11,8),%rax
	pop %r12
	#xor %r8,%r8
	#xor %r9,%r9
	#mul %rcx
	mov %r10,%rax
	ret
# n == 2: two products handled in straight-line code, no push needed.
two:	mul %rcx
        add %rax,(%rdi)
        mov $0,%r8d             # mov keeps CF from the add for the adc below
        adc %rdx,%r8            # r8 = high limb of first product plus carry
        mov 8(%rsi),%rax
        mul %rcx
        add %r8,%rax
        adc $0,%rdx
        add %rax,8(%rdi)
        adc $0,%rdx  
        mov %rdx,%rax           # return the carry out of dst[1]
        ret     
EPILOGUE()

Reply via email to