On Friday 28 January 2011 14:15:28 Jason wrote:
> On Friday 28 January 2011 13:43:56 Jason wrote:
> > On Friday 28 January 2011 11:02:07 Jason wrote:
> > > On Friday 28 January 2011 10:55:06 jason wrote:
> > > > Hi
> > > >
> > > > In trunk is a new AMD addmul_1. This runs at the same speed as the
> > > > old one but is smaller: the old code was 450 bytes and the new code
> > > > is 407 bytes. I've not tested it on a K10 yet as skynet is down, but
> > > > from what I think I know of the AMD chips it must run at the same
> > > > speed. The Windows conversion is only worth doing if the alignments/
> > > > spacing are placed carefully, i.e. loops start/end on a 16-byte
> > > > boundary and jmp destinations are close enough, as determined by
> > > > testing :)
> > > > More to follow.
> > > >
> > > > Jason
> > >
> > > Note: the old addmul_1 also had an alternate entry point for inclsh_n.
> > > I don't know why we did this; if the fastest inclsh really is addmul_1
> > > then we should use a macro, and if not (e.g. on Core 2) then we should
> > > use an alternate entry point (or a new function).
> > > Note: the 450-byte count above did not include the inclsh_n part.
> > >
> > > Jason
> >
> > Attached is an AMD 4-way addmul_1. The inner loop is the same, but
> > instead of four cases to handle the "leftovers" we jump into the loop;
> > this saves quite a bit of code, bringing it down to 278 bytes. The
> > asymptotic speed is the same but the overheads are a bit higher. I have
> > not put this in trunk.
> >
> > Jason
>
> I should have also said: I expect I can quite easily shave some cycles
> off it, and some space.
>
> Attached are 3 variants of an AMD addmul_1 7-way unroll. This runs at
> 17/7 = 2.428 c/l (4-way is 2.5 c/l), a 2.9% improvement. For the reasons
> below I don't regard this as practical, so you will notice that no
> attempt has been made to optimize it or clean it up.
>
> k8_addmul_1_7way.asm handles the leftovers in the usual way, with 7
> cases; the problem is code size.
>
> k8_addmul_1_7way_jmpepi.asm uses a small 7-entry jump table to branch to
> the 7 cases (as opposed to the above, which uses a string of cmp's and
> Jcc's); code size is still a problem, and the jump table should be in a
> separate segment.
>
> k8_addmul_1_7way_jmpin.asm jumps into the middle of the loop to handle
> the leftovers. This saves a lot of space, but we need to calculate
> size % 7. That is much easier than a general division (we could do a
> Hensel div, i.e. 10 cycles max, or some shifting), and if we assume L1
> cache then we can limit the size to 4096. I've just done a standard slow
> division for now, and the feed-in cases are poor.
>
> The inflexibility of the code sequence limits the scheduler and the pick
> hardware, so some tricks had to be used to help the chip out :)
>
> It may be possible to improve this speed (if the tricks are good enough)
> by going to a larger unroll: 10-way (2.4 c/l) is possible, and 16-way
> (2.375 c/l) is next, but there are better ways.
>
> Jason
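To make the jmpin idea above concrete, here is a small C sketch of the same
two tricks: jumping into the middle of an unrolled loop to absorb the
leftovers (a Duff's-device switch), and getting size % 7 with a multiply and
shift instead of a div instruction (the role a Hensel div would play in asm).
This is only an illustration under the "sizes fit in L1, so n < 4096"
assumption; mod7_small and add_n_7way are made-up names, not anything in the
attached files.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* n % 7 without a div: 9363/2^16 is just above 1/7, and
       floor(n*9363 >> 16) == floor(n/7) for all n < 4096 (and somewhat
       beyond), so the remainder is one mul, one shift and one subtract. */
    unsigned mod7_small(unsigned n)
    {
        assert(n < 4096);               /* the L1-cache size limit above */
        unsigned q = (n * 9363u) >> 16; /* q = n / 7 */
        return n - 7u * q;              /* n % 7 */
    }

    /* dst[i] += src[i], 7-way unrolled; the switch jumps into the middle
       of the loop so the n % 7 leftover limbs need no separate cleanup. */
    void add_n_7way(uint64_t *dst, const uint64_t *src, size_t n)
    {
        size_t i = 0;
        switch (mod7_small((unsigned)n)) {
        case 0: while (i < n) {
                    dst[i] += src[i]; i++;
        case 6:     dst[i] += src[i]; i++;
        case 5:     dst[i] += src[i]; i++;
        case 4:     dst[i] += src[i]; i++;
        case 3:     dst[i] += src[i]; i++;
        case 2:     dst[i] += src[i]; i++;
        case 1:     dst[i] += src[i]; i++;
                }
        }
    }

After the jump-in, each pass of the while body retires 7 limbs, and since
n - (n % 7) is a multiple of 7 the condition is only tested on full passes,
which is exactly the shape of the asm below.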
Hi

Attached is an AMD addmul_1, infinity-way unrolled, which runs at 2.333 c/l;
asymptotically this is faster than our current addmul_2 (which runs at
2.375 c/l). This is really proof-of-concept code at the moment, as many
things still need to be done. It's meant for mul_basecase etc., where the
sizes are limited: if we keep to mul's of at most 32x32 limbs, it takes
23 bytes of code per limb, plus overhead, plus tables (currently 16 bytes
per limb; we can certainly get this down to 9 or 5 bytes). I've included our
standard addmul_1 in it for large sizes so I can test it properly.

Mul basecase is very sensitive to overheads, so this may not be an
improvement. I'll write a basecase on this current code, and if it seems
promising I'll do it properly (reduce code size, reduce tables, check the
speed for all alignments and jump-in points, etc.).

Jason
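For anyone wanting to test the attachment: the function it implements is the
usual mpn_addmul_1 semantics, {rp,n} += {up,n} * v with the carry limb
returned. A plain C reference to diff against (ref_addmul_1 is my name for
it; it uses the GCC/Clang unsigned __int128 extension for the 64x64->128
multiply):

    #include <stddef.h>
    #include <stdint.h>

    /* Reference for checking the asm: add {up,n} * v into {rp,n} and
       return the most significant (carry) limb. */
    uint64_t ref_addmul_1(uint64_t *rp, const uint64_t *up, size_t n,
                          uint64_t v)
    {
        uint64_t carry = 0;
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 t = (unsigned __int128)up[i] * v
                                + rp[i] + carry;       /* 128-bit sum */
            rp[i] = (uint64_t)t;          /* low limb back to the dest */
            carry = (uint64_t)(t >> 64);  /* high limb feeds the next step */
        }
        return carry;                     /* what the asm returns in %rax */
    }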
dnl  mpn_addmul_1

dnl  Copyright 2011 The Code Cavern

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

#// bp  multiplier
#// rsi src
#// rdi dst
#// r8,r9,r10,r11,r12,r13 rotating temps
#// ax,dx temps
#// bx for jumps into case's
#// cx size
#// r14 for jumps into "loop"
#// r15 zero	# can we make this a load to save a reg ?? ie mov -8(%rsp),%reg
#// let's fake up an addmul_1

include(`../config.m4')

ASM_START()
PROLOGUE(mpn_addmul_1)
	cmp $37,%rdx
	jg oldshit	# sizes above 37 fall back to the standard code below
	push %r15
	push %r14
	push %r13
	push %r12
	push %rbx
	push %rbp
	mov %rcx,%rbp
	mov %rdx,%rcx
	mov (%rsi),%rax
	mul %rbp
	xor %r15,%r15
	lea casetable(%rip),%rbx
	add (%rbx,%rcx,8),%rbx
	lea intable(%rip),%r14
	add (%r14,%rcx,8),%r14
	lea -112(%rdi,%rcx,8),%rdi
	lea -112(%rsi,%rcx,8),%rsi
	jmp *%rbx
case8:	mov %rax,%r8
	mov %rdx,%r9
	jmp *%r14	#jmp in8,14,20,26
case7:	xor %r12,%r12
	mov %rax,%r9
	xor %r10,%r10	# and clear carry note: rdx added in loop
	jmp *%r14	#jmp in7,13,19,25
case6:	xor %r12,%r12
	mov %rax,%r10
	mov %rdx,%r11
	jmp *%r14	#jmp in6,12,18,24
case5:	mov %rax,%r11
	mov %rdx,%r12
	jmp *%r14	#jmp in5,11,17,23
case4:	xor %r9,%r9
	mov %rax,%r12
	xor %r13,%r13	# and clear carry note rdx added in loop
	jmp *%r14	#jmp in4,10,16,22
case3:	xor %r9,%r9
	mov %rax,%r13
	mov %rdx,%r8
	jmp *%r14	#jmp in3,9,15,21,27

#// replace mov with xor to save a few bytes
.set REP, -4
in37:	mov 16+48*REP(%rsi),%rax
	adc %rdx,%r10
	mov $0,%r11d
	mul %rbp
	add %r9,8+48*REP(%rdi)
	adc %rax,%r10
	adc %rdx,%r11
in36:	mov 24+48*REP(%rsi),%rax
	mul %rbp
	add %r10,16+48*REP(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
in35:	mov 32+48*REP(%rsi),%rax
	mul %rbp
	add %r11,24+48*REP(%rdi)
	lea (%r15,%r15,2),%r13	# r13 = 3*0 = 0 without touching the carry flag
	adc %rax,%r12
	mov $0,%r9d
in34:	mov 40+48*REP(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r8d
	mul %rbp
	add %r12,32+48*REP(%rdi)
	adc %rax,%r13
	adc %rdx,%r8
in33:	mov 48+48*REP(%rsi),%rax
	mul %rbp
	add %r13,40+48*REP(%rdi)
	adc %rax,%r8
	adc %rdx,%r9
in32:	mov 56+48*REP(%rsi),%rax
.set REP, -3
	mul %rbp
	add %r8,0+48*REP(%rdi)
	lea (%r15,%r15,2),%r10
	adc %rax,%r9
	mov $0,%r12d
in31:	mov 16+48*REP(%rsi),%rax
	adc %rdx,%r10
	mov $0,%r11d
	mul %rbp
	add %r9,8+48*REP(%rdi)
	adc %rax,%r10
	adc %rdx,%r11
in30:	mov 24+48*REP(%rsi),%rax
	mul %rbp
	add %r10,16+48*REP(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
in29:	mov 32+48*REP(%rsi),%rax
	mul %rbp
	add %r11,24+48*REP(%rdi)
	lea (%r15,%r15,2),%r13
	adc %rax,%r12
	mov $0,%r9d
in28:	mov 40+48*REP(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r8d
	mul %rbp
	add %r12,32+48*REP(%rdi)
	adc %rax,%r13
	adc %rdx,%r8
in27:	mov 48+48*REP(%rsi),%rax
	mul %rbp
	add %r13,40+48*REP(%rdi)
	adc %rax,%r8
	adc %rdx,%r9
in26:	mov 56+48*REP(%rsi),%rax
.set REP, -2
	mul %rbp
	add %r8,0+48*REP(%rdi)
	lea (%r15,%r15,2),%r10
	adc %rax,%r9
	mov $0,%r12d
in25:	mov 16+48*REP(%rsi),%rax
	adc %rdx,%r10
	mov $0,%r11d
	mul %rbp
	add %r9,8+48*REP(%rdi)
	adc %rax,%r10
	adc %rdx,%r11
in24:	mov 24+48*REP(%rsi),%rax
	mul %rbp
	add %r10,16+48*REP(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
in23:	mov 32+48*REP(%rsi),%rax
	mul %rbp
	add %r11,24+48*REP(%rdi)
	lea (%r15,%r15,2),%r13
	adc %rax,%r12
	mov $0,%r9d
in22:	mov 40+48*REP(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r8d
	mul %rbp
	add %r12,32+48*REP(%rdi)
	adc %rax,%r13
	adc %rdx,%r8
in21:	mov 48+48*REP(%rsi),%rax
	mul %rbp
	add %r13,40+48*REP(%rdi)
	adc %rax,%r8
	adc %rdx,%r9
in20:	mov 56+48*REP(%rsi),%rax
.set REP, -1
	mul %rbp
	add %r8,0+48*REP(%rdi)
	lea (%r15,%r15,2),%r10
	adc %rax,%r9
	mov $0,%r12d
in19:	mov 16+48*REP(%rsi),%rax
	adc %rdx,%r10
	mov $0,%r11d
	mul %rbp
	add %r9,8+48*REP(%rdi)
	adc %rax,%r10
	adc %rdx,%r11
in18:	mov 24+48*REP(%rsi),%rax
	mul %rbp
	add %r10,16+48*REP(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
in17:	mov 32+48*REP(%rsi),%rax
	mul %rbp
	add %r11,24+48*REP(%rdi)
	lea (%r15,%r15,2),%r13
	adc %rax,%r12
	mov $0,%r9d
in16:	mov 40+48*REP(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r8d
	mul %rbp
	add %r12,32+48*REP(%rdi)
	adc %rax,%r13
	adc %rdx,%r8
in15:	mov 48+48*REP(%rsi),%rax
	mul %rbp
	add %r13,40+48*REP(%rdi)
	adc %rax,%r8
	adc %rdx,%r9
in14:	mov 56+48*REP(%rsi),%rax
.set REP, 0
	mul %rbp
	add %r8,0+48*REP(%rdi)
	lea (%r15,%r15,2),%r10
	adc %rax,%r9
	mov $0,%r12d
in13:	mov 16+48*REP(%rsi),%rax
	adc %rdx,%r10
	mov $0,%r11d
	mul %rbp
	add %r9,8+48*REP(%rdi)
	adc %rax,%r10
	adc %rdx,%r11
in12:	mov 24+48*REP(%rsi),%rax
	mul %rbp
	add %r10,16+48*REP(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
in11:	mov 32+48*REP(%rsi),%rax
	mul %rbp
	add %r11,24+48*REP(%rdi)
	lea (%r15,%r15,2),%r13
	adc %rax,%r12
	mov $0,%r9d
in10:	mov 40+48*REP(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r8d
	mul %rbp
	add %r12,32+48*REP(%rdi)
	adc %rax,%r13
	adc %rdx,%r8
in9:	mov 48+48*REP(%rsi),%rax
	mul %rbp
	add %r13,40+48*REP(%rdi)
	adc %rax,%r8
	adc %rdx,%r9
in8:	mov 56+48*REP(%rsi),%rax
.set REP, 1
	mul %rbp
	add %r8,0+48*REP(%rdi)
	lea (%r15,%r15,2),%r10
	adc %rax,%r9
	mov $0,%r12d
in7:	mov 16+48*REP(%rsi),%rax
	adc %rdx,%r10
	mov $0,%r11d
	mul %rbp
	add %r9,8+48*REP(%rdi)
	adc %rax,%r10
	adc %rdx,%r11
in6:	mov 24+48*REP(%rsi),%rax
	mul %rbp
	add %r10,16+48*REP(%rdi)
	adc %rax,%r11
	adc %rdx,%r12
in5:	mov 32+48*REP(%rsi),%rax
	mul %rbp
	add %r11,24+48*REP(%rdi)
	lea (%r15,%r15,2),%r13
	adc %rax,%r12
	mov $0,%r9d
in4:	mov 40+48*REP(%rsi),%rax
	adc %rdx,%r13
	mov $0,%r8d
	mul %rbp
	add %r12,32+48*REP(%rdi)
	adc %rax,%r13
	adc %rdx,%r8
in3:	mov 48+48*REP(%rsi),%rax
	mul %rbp
	add %r13,40+48*REP(%rdi)
	adc %rax,%r8
	adc %rdx,%r9
in2:	mov 56+48*REP(%rsi),%rax	#last src read
.set REP, 2
#// this is the wind-down code only
	mul %rbp
	add %r8,0+48*REP(%rdi)
	lea (%r15,%r15,2),%r10
	adc %rax,%r9
#	mov $0,%r12d
#	mov 16+48*REP(%rsi),%rax
in1:	adc %rdx,%r10
#	mov $0,%r11d
#	mul %rbp
	add %r9,8+48*REP(%rdi)	# last dst read -128 to 127 is biggest before using extra bytes
	adc $0,%r10	# full-width adc: the top limb needs all 64 bits
#	adc $0,%r11d
#	mov 24+48*REP(%rsi),%rax
#	mul %rbp
	mov %r10,%rax	#16+48*REP(%rdi) # store top digit
in0:	pop %rbp
	pop %rbx
	pop %r12
	pop %r13
	pop %r14
	pop %r15
	ret

.align 8
intable:
	.quad in0-intable	#size=rcx=0
	.quad in1-intable	#size=rcx=1
	.quad in2-intable	#size=rcx=2
	.quad in3-intable	#size=rcx=3
	.quad in4-intable	#size=rcx=4
	.quad in5-intable	#size=rcx=5
	.quad in6-intable
	.quad in7-intable
	.quad in8-intable
	.quad in9-intable
	.quad in10-intable
	.quad in11-intable
	.quad in12-intable
	.quad in13-intable
	.quad in14-intable
	.quad in15-intable
	.quad in16-intable
	.quad in17-intable
	.quad in18-intable
	.quad in19-intable
	.quad in20-intable
	.quad in21-intable
	.quad in22-intable
	.quad in23-intable
	.quad in24-intable
	.quad in25-intable
	.quad in26-intable
	.quad in27-intable
	.quad in28-intable
	.quad in29-intable
	.quad in30-intable
	.quad in31-intable
	.quad in32-intable
	.quad in33-intable
	.quad in34-intable
	.quad in35-intable
	.quad in36-intable
	.quad in37-intable
casetable:
	.quad case6-casetable	# rcx=0
	.quad case7-casetable	# rcx=1
	.quad case8-casetable	# rcx=2
	.quad case3-casetable	# rcx=3
	.quad case4-casetable	# rcx=4
	.quad case5-casetable	# rcx=5
	.quad case6-casetable	# rcx=6
	.quad case7-casetable	# rcx=7
	.quad case8-casetable	# rcx=8
	.quad case3-casetable	# rcx=9
	.quad case4-casetable	# rcx=10
	.quad case5-casetable
	.quad case6-casetable
	.quad case7-casetable
	.quad case8-casetable
	.quad case3-casetable
	.quad case4-casetable
	.quad case5-casetable
	.quad case6-casetable
	.quad case7-casetable
	.quad case8-casetable	# rcx=20
	.quad case3-casetable
	.quad case4-casetable
	.quad case5-casetable
	.quad case6-casetable
	.quad case7-casetable
	.quad case8-casetable
	.quad case3-casetable
	.quad case4-casetable
	.quad case5-casetable
	.quad case6-casetable	# rcx=30
	.quad case7-casetable
	.quad case8-casetable
	.quad case3-casetable
	.quad case4-casetable
	.quad case5-casetable
	.quad case6-casetable
	.quad case7-casetable	# rcx=37

ALIGN(64)
oldshit:
	mov (%rsi),%rax
	cmp $1,%rdx
	je one
	mov $5,%r11
	lea -40(%rsi,%rdx,8),%rsi
	lea -40(%rdi,%rdx,8),%rdi
	sub %rdx,%r11
	mul %rcx
	.byte 0x26	# ES segment prefix used as a one-byte padding/alignment trick
	mov %rax,%r8
	.byte 0x26
	mov 8(%rsi,%r11,8),%rax
	.byte 0x26
	mov %rdx,%r9
	.byte 0x26
	cmp $0,%r11
	.byte 0x26
	mov %r12,-8(%rsp)
	.byte 0x26
	jge skiploop
#ALIGN(16)
loop:
	xor %r10,%r10
	mul %rcx
	add %r8,(%rdi,%r11,8)
	adc %rax,%r9
	adc %rdx,%r10
	mov 16(%rsi,%r11,8),%rax
	mul %rcx
	add %r9,8(%rdi,%r11,8)
	adc %rax,%r10
	mov $0,%r12d
	adc %rdx,%r12
	mov 24(%rsi,%r11,8),%rax
	xor %r8,%r8
	xor %r9,%r9
	mul %rcx
	add %r10,16(%rdi,%r11,8)
	adc %rax,%r12
	adc %rdx,%r8
	mov 32(%rsi,%r11,8),%rax
	mul %rcx
	add %r12,24(%rdi,%r11,8)
	adc %rax,%r8
	adc %rdx,%r9
	add $4,%r11
	mov 8(%rsi,%r11,8),%rax
	jnc loop
#ALIGN(16)
skiploop:
	xor %r10,%r10
	mul %rcx
	add %r8,(%rdi,%r11,8)
	adc %rax,%r9
	adc %rdx,%r10
	cmp $2,%r11
	jz next2
	ja next3
	jp next1	# parity of r11-2 picks out r11==1; r11==0 falls through
next0:	#r11=0
	mov 16(%rsi),%rax
	mul %rcx
	add %r9,8(%rdi)
	adc %rax,%r10
	mov %r11,%r12	# could remove but may screw timings
	adc %rdx,%r12
	mov 24(%rsi),%rax
	xor %r8,%r8
	mul %rcx
	add %r10,16(%rdi)
	adc %rax,%r12
	adc %rdx,%r8
	mov 32(%rsi),%rax
	mul %rcx
	add %r12,24(%rdi)
	mov -8(%rsp),%r12
	adc %rax,%r8
	adc $0,%rdx
	add %r8,32(%rdi)
	adc $0,%rdx
	mov %rdx,%rax
	ret
.align 16
next1:	#r11=1
	mov 24(%rsi),%rax
	mul %rcx
	add %r9,16(%rdi)
	adc %rax,%r10
	mov $0,%r8d
	adc %rdx,%r8
	mov 32(%rsi),%rax
	mul %rcx
	add %r10,24(%rdi)
	adc %rax,%r8
	adc $0,%rdx
	add %r8,32(%rdi)
	mov -8(%rsp),%r12
	adc $0,%rdx
	mov %rdx,%rax
	ret
.align 16
next2:	#r11=2
	mov 32(%rsi),%rax
	mul %rcx
	add %r9,24(%rdi)
	adc %rax,%r10
	mov $0,%r12d
	adc %rdx,%r12
	add %r10,32(%rdi)
	adc $0,%r12
	mov %r12,%rax
	mov -8(%rsp),%r12
	ret
.align 16
next3:	#r11=3
	mov -8(%rsp),%r12
	add %r9,32(%rdi)
	adc $0,%r10
	mov %r10,%rax
	ret
.align 16
one:
	mul %rcx
	add %rax,(%rdi)
	adc $0,%rdx
	mov %rdx,%rax
	ret
EPILOGUE()
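On getting the tables down from 16 bytes per limb: intable and casetable
each spend a .quad (8 bytes) per size, but all the offsets are small, so
4-byte or even 1-byte entries would do; a 4-byte in-offset plus a 1-byte
case entry is presumably where the 5-bytes-per-limb figure comes from. The
shape of the trick, sketched in C with the GCC/Clang labels-as-values
extension (run_steps and the s* labels are invented for the example; label
differences in a static initializer are sanctioned by the GCC manual):

    /* Dispatch through a narrow offset table rather than 8-byte absolute
       entries; in asm this is .int or .byte entries instead of .quad. */
    int run_steps(int k)           /* k plays the role of the size index */
    {
        static const int off[4] = { &&s0 - &&s0, &&s1 - &&s0,
                                    &&s2 - &&s0, &&s3 - &&s0 };
        int acc = 0;
        goto *(&&s0 + off[k]);     /* one indirect jump, like jmp *%r14 */
    s0: acc += 1;                  /* fall-through chain, like in3..in0 */
    s1: acc += 2;
    s2: acc += 4;
    s3: return acc;
    }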