Add an implementation based on 64x64-bit multiplications (mlgr) and
128-bit subtractions in vector registers (vsbiq/vsbcbiq). Unroll the
loop by 2 and use three parallel borrow chains to reduce dependencies
within each iteration: a scalar borrow limb plus two borrow bits kept
in vector registers.
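
The operation implemented is the usual mpn_submul_1 contract: subtract
{s1p, n} * s2 from {rp, n} and return the most significant limb of the
product plus the borrow-out from the subtraction.  As a portable C
sketch of that contract (assuming 64-bit limbs; unsigned __int128 only
models the 64x64 -> 128 product that mlgr produces, and the function
name is purely illustrative):

  #include <gmp.h>   /* mp_limb_t, mp_size_t */

  mp_limb_t
  ref_submul_1 (mp_limb_t *rp, const mp_limb_t *s1p, mp_size_t n,
                mp_limb_t s2)
  {
    mp_limb_t borrow = 0;             /* borrow limb between limbs */
    for (mp_size_t i = 0; i < n; i++)
      {
        unsigned __int128 p = (unsigned __int128) s1p[i] * s2;
        mp_limb_t t = rp[i] - (mp_limb_t) p;
        mp_limb_t b1 = t > rp[i];     /* borrow from the low word */
        mp_limb_t u = t - borrow;
        mp_limb_t b2 = u > t;         /* borrow from the borrow limb */
        rp[i] = u;
        borrow = (mp_limb_t) (p >> 64) + b1 + b2;  /* fits in a limb */
      }
    return borrow;
  }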
---
 mpn/s390_64/z13/submul_1.asm | 115 +++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 mpn/s390_64/z13/submul_1.asm
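
As a reviewing aid, below is a rough C model of the unrolled loop and
of the three borrow chains: the scalar borrow limb plus the two borrow
bits that the asm keeps in borrow_vec1/borrow_vec2 (vsbiq yields each
128-bit difference, vsbcbiq the matching borrow-out).  It is only a
sketch of the scheme with illustrative names, assuming 64-bit limbs
and even n; the actual code first peels one limb when n is odd.

  #include <gmp.h>
  typedef unsigned __int128 u128;

  mp_limb_t
  model_submul_1 (mp_limb_t *rp, const mp_limb_t *s1p, mp_size_t n,
                  mp_limb_t s2)
  {
    mp_limb_t borrow = 0;             /* scalar borrow limb          */
    mp_limb_t b1 = 0, b2 = 0;         /* borrow bits (vector chains) */
    for (mp_size_t i = 0; i < n; i += 2)
      {
        u128 p0 = (u128) s1p[i] * s2;                  /* mlgr */
        u128 p1 = (u128) s1p[i + 1] * s2;              /* mlgr */
        u128 r = ((u128) rp[i + 1] << 64) | rp[i];     /* vl + vpdi */
        u128 q = ((u128) (mp_limb_t) p1 << 64) | borrow;  /* vlvgp */

        u128 d0 = r - p0;                      /* chain 1: vsbiq   */
        mp_limb_t nb1 = (r < p0) | (d0 < b1);  /* chain 1: vsbcbiq */
        u128 t = d0 - b1;

        u128 d1 = t - q;                       /* chain 2: vsbiq   */
        mp_limb_t nb2 = (t < q) | (d1 < b2);   /* chain 2: vsbcbiq */
        u128 u = d1 - b2;

        rp[i] = (mp_limb_t) u;                 /* vpdi + vst */
        rp[i + 1] = (mp_limb_t) (u >> 64);
        borrow = (mp_limb_t) (p1 >> 64);  /* p1_high -> next borrow limb */
        b1 = nb1;
        b2 = nb2;
      }
    return borrow + b1 + b2;   /* fold the three chains, as in L(done) */
  }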

diff --git a/mpn/s390_64/z13/submul_1.asm b/mpn/s390_64/z13/submul_1.asm
new file mode 100644
index 000000000..d7761f0e3
--- /dev/null
+++ b/mpn/s390_64/z13/submul_1.asm
@@ -0,0 +1,115 @@
+dnl  S/390-64 mpn_submul_1
+
+dnl  Copyright 2021 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C INPUT PARAMETERS
+define(`rp',   `%r2')
+define(`s1p',  `%r3')
+define(`n',    `%r4')
+define(`s2',   `%r5')
+
+define(`borrow',       `%r0')
+define(`p0_low',       `%r9')
+define(`p0_high',      `%r8')
+define(`p1_low',       `%r11')
+define(`p1_high',      `%r10')
+define(`idx',          `%r1')
+
+define(`rp_vec',       `%v20')
+define(`p0_vec',       `%v6')
+define(`p1_vec',       `%v22')
+define(`borrow_vec1',          `%v23')
+define(`borrow_vec2',          `%v4')
+
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+       .align 16
+       stmg    %r8,%r11,64(%r15)
+
+       lghi    %r10,0
+       lghi    borrow,0
+       vrepig  borrow_vec1,1
+       vlr     borrow_vec2,borrow_vec1
+
+       tmll    n,1
+       je      L(even)
+
+       lg      p0_low,0(s1p)
+       lg      %r11,0(rp)
+       mlgr    p0_high,s2
+       lghi    %r10,1
+
+       slgr    %r11,p0_low
+       locghi  borrow,1,0xC
+       agr     borrow,p0_high
+
+       stg     %r11,0(rp)
+
+       clgrjhe %r10,n,L(done)
+
+L(even):
+       sllg    idx,%r10,3
+       sllg    n,n,3
+
+L(loop):
+       lg      p0_low,0(idx,s1p)
+       lg      p1_low,8(idx,s1p)
+       vl      rp_vec,0(idx,rp)
+       mlgr    p0_high,s2
+       mlgr    p1_high,s2
+
+       vpdi    rp_vec,rp_vec,rp_vec,4
+       vlvgp   p0_vec,p0_high,p0_low
+       vlvgp   p1_vec,p1_low,borrow
+
+       vsbiq   %v5,rp_vec,p0_vec,borrow_vec1
+       vsbiq   %v3,%v5,p1_vec,borrow_vec2
+       vsbcbiq borrow_vec1,rp_vec,p0_vec,borrow_vec1
+       vsbcbiq borrow_vec2,%v5,p1_vec,borrow_vec2
+
+       vpdi    %v7,%v3,%v3,4
+       vst     %v7,0(idx,rp)
+       lgr     borrow,p1_high
+
+       aghi    idx,16
+       clgrjl  idx,n,L(loop)
+
+L(done):
+       vlgvg   %r3,borrow_vec1,1
+       vlgvg   %r2,borrow_vec2,1
+       aghi    borrow,2
+       lmg     %r8,%r11,64(%r15)
+       sgr     borrow,%r3
+       sgrk    %r2,borrow,%r2
+       br      %r14
+EPILOGUE()
-- 
2.26.2

_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel
