Add implementation based on 64x64 multiplications (mlgr) and 128-bit subtractions in vector registers (vsbiq/vsbcbiq). Unroll loop by 2 and use three borrow chains to reduce dependencies within each iteration (borrow limb and two borrow bits in vector registers). --- mpn/s390_64/z13/submul_1.asm | 115 +++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 mpn/s390_64/z13/submul_1.asm
diff --git a/mpn/s390_64/z13/submul_1.asm b/mpn/s390_64/z13/submul_1.asm new file mode 100644 index 000000000..d7761f0e3 --- /dev/null +++ b/mpn/s390_64/z13/submul_1.asm @@ -0,0 +1,115 @@ +dnl S/390-64 mpn_submul_1 + +dnl Copyright 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+
+include(`../config.m4')
+C mpn_submul_1: subtract the n-limb product {s1p,n} * s2 from {rp,n} in place and
+C return the borrow-out limb in %r2.  n = 0 is not handled (loop is bottom-tested).
+C INPUT PARAMETERS
+define(`rp',	`%r2')		C limbs to subtract from; results are stored back here
+define(`s1p',	`%r3')		C source limbs that get multiplied by s2
+define(`n',	`%r4')		C limb count; converted to a byte length at L(even)
+define(`s2',	`%r5')		C single multiplier limb
+
+define(`borrow',   `%r0')	C borrow limb carried from one step to the next
+define(`p0_low',   `%r9')	C %r8:%r9 is the mlgr register pair of the first product
+define(`p0_high',  `%r8')
+define(`p1_low',   `%r11')	C %r10:%r11 is the mlgr register pair of the second product
+define(`p1_high',  `%r10')	C %r10 also counts the peeled limbs (0 or 1) before the loop
+define(`idx',      `%r1')	C byte offset used for both s1p and rp
+
+define(`rp_vec',   `%v20')	C two rp limbs handled as one 128-bit value
+define(`p0_vec',   `%v6')	C p0_high:p0_low
+define(`p1_vec',   `%v22')	C p1_low:borrow
+define(`borrow_vec1', `%v23')	C borrow indication of the rp_vec - p0_vec step
+define(`borrow_vec2', `%v4')	C borrow indication of the following - p1_vec step
+C Limbs are processed two at a time (an odd n peels off one limb first), with
+C three borrow chains: the borrow limb in a GPR and one borrow bit per vector.
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	.align 16
+	stmg	%r8,%r11,64(%r15)			C save %r8-%r11; the lmg at L(done) reloads them
+
+	lghi	%r10,0					C limbs handled so far
+	lghi	borrow,0				C no borrow limb yet
+	vrepig	borrow_vec1,1				C indication 1 = no borrow pending
+	vlr	borrow_vec2,borrow_vec1			C same start value for the second chain
+
+	tmll	n,1					C test the low bit of n
+	je	L(even)					C even n: straight to the two-limb loop
+
+	lg	p0_low,0(s1p)				C odd n: peel s1p[0]
+	lg	%r11,0(rp)				C rp[0]
+	mlgr	p0_high,s2				C p0_high:p0_low = s1p[0] * s2
+	lghi	%r10,1					C one limb handled
+
+	slgr	%r11,p0_low				C rp[0] - low product half; CC 1 means borrow
+	locghi	borrow,1,0xC				C borrow = 1 if CC is 0 or 1, i.e. on borrow
+	agr	borrow,p0_high				C borrow limb = high product half + borrow bit
+
+	stg	%r11,0(rp)				C store the new rp[0]
+
+	clgrjhe	%r10,n,L(done)				C 1 >= n (unsigned): nothing left for the loop
+
+L(even):
+	sllg	idx,%r10,3				C idx = 8 * limbs handled (0 or 8)
+	sllg	n,n,3					C n = operand length in bytes
+
+L(loop):
+	lg	p0_low,0(idx,s1p)			C first source limb of the pair
+	lg	p1_low,8(idx,s1p)			C second source limb of the pair
+	vl	rp_vec,0(idx,rp)			C both rp limbs in one 16-byte load
+	mlgr	p0_high,s2				C p0_high:p0_low = first limb * s2
+	mlgr	p1_high,s2				C p1_high:p1_low = second limb * s2
+
+	vpdi	rp_vec,rp_vec,rp_vec,4			C swap doublewords: higher-address limb to the high half
+	vlvgp	p0_vec,p0_high,p0_low			C p0_vec = p0_high:p0_low
+	vlvgp	p1_vec,p1_low,borrow			C p1_vec = p1_low:borrow limb of the previous step
+
+	vsbiq	%v5,rp_vec,p0_vec,borrow_vec1		C %v5 = rp_vec - p0_vec, chain-1 borrow in
+	vsbiq	%v3,%v5,p1_vec,borrow_vec2		C %v3 = %v5 - p1_vec, chain-2 borrow in
+	vsbcbiq	borrow_vec1,rp_vec,p0_vec,borrow_vec1	C chain-1 borrow indication out
+	vsbcbiq	borrow_vec2,%v5,p1_vec,borrow_vec2	C chain-2 borrow indication out
+
+	vpdi	%v7,%v3,%v3,4				C swap doublewords back to memory limb order
+	vst	%v7,0(idx,rp)				C store the two result limbs
+	lgr	borrow,p1_high				C high half of the second product is the next borrow limb
+
+	aghi	idx,16					C advance by two limbs
+	clgrjl	idx,n,L(loop)				C repeat while idx < n (unsigned compare)
+
+L(done):
+	vlgvg	%r3,borrow_vec1,1			C chain-1 indication from doubleword 1 (s1p no longer needed)
+	vlgvg	%r2,borrow_vec2,1			C chain-2 indication from doubleword 1 (rp no longer needed)
+	aghi	borrow,2				C each indication is 1 when no borrow is pending, so
+	lmg	%r8,%r11,64(%r15)			C restore %r8-%r11 saved on entry
+	sgr	borrow,%r3				C borrow + 2 - ind1 - ind2 adds the pending borrow bits
+	sgrk	%r2,borrow,%r2				C return value in %r2
+	br	%r14					C return to the caller
+EPILOGUE()
--
2.26.2

_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel