Torbjörn Granlund <t...@gmplib.org> writes:

> Your version is faster than my versions (where I tested them).
>
> I made some minor changes to your code.

Nice!

I made a few additional tweaks, and tried to get it in the right place.
I'm attaching a patch that adds the file under coreibwl, as suggested, with
an include_mpn from zen. I took out the undefine hack (which I had added
because my first attempt at using the mulx macros failed), and copied the
Windows support code from the old implementation. The complete patch is
attached; does it look good enough? It now gives approximately a 30% speedup
compared to mul_1 + addmul_1 on my machine. I'd prefer to have a look at
addsubmul_1msb0 next, before trying to optimize this loop further.

diff -r 3dee30523768 ChangeLog
--- a/ChangeLog	Fri Oct 08 16:05:05 2021 +0200
+++ b/ChangeLog	Fri Oct 08 16:41:04 2021 +0200
@@ -1,5 +1,8 @@
 2021-10-08  Niels Möller  <ni...@lysator.liu.se>
 
+	* mpn/x86_64/coreibwl/addaddmul_1msb0.asm: New assembly
+	implementation, using mulx and adox instructions.
+
 	* tests/mpn/t-addaddmul.c: Unit test for mpn_addaddmul_1msb0.
 
 2021-10-07  Niels Möller  <ni...@lysator.liu.se>
diff -r 3dee30523768 mpn/x86_64/coreibwl/addaddmul_1msb0.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreibwl/addaddmul_1msb0.asm	Fri Oct 08 16:41:04 2021 +0200
@@ -0,0 +1,99 @@
+dnl  AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.
+
+dnl  Contributed to the GNU project by Niels Möller and Torbjörn Granlund.
+
+dnl  Copyright 2021 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')	C destination limbs
+define(`ap',	`%rsi')	C source operand A
+define(`bp_param', `%rdx')	C source operand B as passed in
+define(`n',	`%rcx')	C limb count
+define(`u0',	`%r8')	C multiplier applied to A
+define(`v0',	`%r9')	C multiplier applied to B
+C B is addressed through bp, since the loop reloads rdx before every mulx.
+define(`bp', `%rbx')
+C Working registers for the loop.
+define(`c0', `%rax')	C carry limb, and return value
+define(`l0', `%r10')	C low product limb of the A term, plus incoming carry limb
+define(`l1', `%r11')	C low product limb of the B term, becomes the result limb
+define(`hi', `%rbp')	C high product limb of the A term
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_addaddmul_1msb0)
+        FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+C Windows (DOS64) support: the IFDOS lines above fetch u0 and v0 from the stack.
+	push	%rbx			C callee-saved, used as bp
+	push	%rbp			C callee-saved, used as hi
+C Point ap, bp and rp just past their n limbs; n is then negated and counts up to 0.
+	lea	(ap,n,8), ap
+	lea	(bp_param,n,8), bp
+	lea	(rp,n,8), rp
+	neg	n
+C Zero carry limb, clean flags.  The loop is unrolled 2-way; odd n enters at L(mid).
+	xor	R32(c0), R32(c0)	C Also clears CF and OF
+	test	$1, R32(n)		C low bit set means an odd limb count
+	jnz	L(mid)
+C Per limb: OF chain adds the carry limb and high halves, CF chain adds the low halves.
+	ALIGN(16)
+L(top):	mov	(ap,n,8), %rdx
+	mulx(	u0, l0, hi)		C hi:l0 = A limb * u0
+	mov	(bp,n,8), %rdx
+	adox(	c0, l0)			C l0 += carry limb, carry-out goes to OF
+	mulx(	v0, l1, c0)		C c0:l1 = B limb * v0
+	adox(	hi, c0)			C c0 += hi + OF; fits in a limb since u0, v0 < 2^63
+	adc	l0, l1			C l1 += l0 + CF; CF chains to the next limb
+	mov	l1, (rp,n,8)
+	inc	n			C Clears OF (since n != 2^63 - 1)
+L(mid):	mov	(ap,n,8), %rdx
+	mulx(	u0, l0, hi)		C second copy of the same limb step
+	mov	(bp,n,8), %rdx
+	adox(	c0, l0)
+	mulx(	v0, l1, c0)
+	adox(	hi, c0)
+	adc	l0, l1
+	mov	l1, (rp,n,8)
+	inc	n			C sets ZF at the last limb; leaves CF intact
+	jnz	L(top)
+C Fold the last CF into the carry limb, which is returned in c0.
+L(end):	adc	$0, c0
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff -r 3dee30523768 mpn/x86_64/zen/addaddmul_1msb0.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/zen/addaddmul_1msb0.asm	Fri Oct 08 16:41:04 2021 +0200
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_addaddmul_1msb0.
+
+dnl  Copyright 2021 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+C No code of its own here: the implementation is the coreibwl file included below.
+MULFUNC_PROLOGUE(mpn_addaddmul_1msb0)
+include_mpn(`x86_64/coreibwl/addaddmul_1msb0.asm')
> dnl  AMD64 mpn_addsubmul_1msb0, R = Au - Bv, u,v < 2^63.

This comment is obviously wrong ;-)

But that function could be implemented by adding two "not %rdx" in the
right places of the loop, plus small adjustment just before and after
the loop.

This works because of the identity

 Au - Bv = Au + (2^{64 n} - 1 - B) v - 2^{64 n} v + v

So complement B on the fly, set the initial carry limb to v, and subtract v
from the return value. (Same trick as in arm/v7a/cora15/submul_1).

Should definitely be worth a try, before trying some completely
different loop.

Regards,
/Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to