Add an mpn_addmul_1 implementation for z13 based on 64x64-bit multiplications (mlgr) and 128-bit additions in vector registers (vacq/vacccq). The loop is unrolled by 2 and uses three parallel carry chains to reduce dependencies within each iteration: one carry limb, plus two carry bits held in vector registers.
---
 mpn/s390_64/z13/addmul_1.asm | 95 ++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 mpn/s390_64/z13/addmul_1.asm
diff --git a/mpn/s390_64/z13/addmul_1.asm b/mpn/s390_64/z13/addmul_1.asm new file mode 100644 index 000000000..bcedfa75c --- /dev/null +++ b/mpn/s390_64/z13/addmul_1.asm @@ -0,0 +1,95 @@ +dnl S/390-64 mpn_addmul_1 + +dnl Copyright 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+
+include(`../config.m4')
+
+
+C mpn_addmul_1 (z13)
+C
+C Multiply the n limbs at s1p by the single limb s2 and add the products into
+C the n limbs at rp, storing the sums back to rp.  The value left in %r2 at
+C return is the last pending carry limb plus the two pending carry bits.
+C
+C Limbs are handled two per loop iteration.  An odd n is made even first by
+C processing limb 0 with scalar instructions.
+C
+C Three carries are tracked separately to reduce dependencies within an
+C iteration:
+C   carry       limb to be added at the next limb position (high half of the
+C               latest product; after the odd-limb step it also includes the
+C               carry bit of that step)
+C   carry_vec1  carry bit out of the first 128-bit add of an iteration
+C   carry_vec2  carry bit out of the second 128-bit add of an iteration
+C
+C In the comments below i = idx/8 is the limb index, and a:b denotes a 128-bit
+C value with a as the high doubleword and b as the low doubleword.
+C
+C NOTE(review): L(loop) tests idx < n only at the bottom, so the body runs at
+C least once on the even path; this presumably relies on n >= 1 -- confirm
+C against the callers.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`s1p',	`%r3')
+define(`n',	`%r4')
+define(`s2',	`%r5')
+
+C Working registers.  carry and idx live in %r8/%r9, which are saved on entry
+C and restored before returning.
+define(`carry',	`%r8')
+define(`idx',	`%r9')
+define(`carry_vec1',	`%v6')
+define(`carry_vec2',	`%v20')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+C NOTE(review): this .align follows PROLOGUE, which presumably emits the entry
+C label, so any padding would land at the function entry rather than in front
+C of L(loop) -- TODO confirm the intent.
+	.align 16
+	stmg	%r8,%r11,64(%r15)	C save %r8-%r11; reloaded by lmg at L(out)
+	lghi	%r1,0			C %r1 = 0: zero addend for alcgr, limb index 0
+	lghi	carry,0			C no carry limb pending yet
+	vzero	carry_vec1		C no carry bits pending yet
+	vzero	carry_vec2
+	tmll	n,1			C test the low bit of n
+	je	L(even)			C bit clear: n is even, skip the single-limb step
+
+C Odd n: handle limb 0 on its own so that an even number of limbs remains.
+	lg	%r11,0(s1p)		C %r11 = s1p[0]
+	mlgr	%r10,s2			C %r10:%r11 = s1p[0] * s2
+	lgr	carry,%r10		C carry = high product limb
+	lg	%r10,0(rp)		C %r10 = rp[0]
+	algr	%r11,%r10		C %r11 = low product limb + rp[0], carry out in CC
+	alcgr	carry,%r1		C carry += 0 + carry out (%r1 is still 0 here)
+	lghi	%r1,1			C one limb done, continue at limb index 1
+	stg	%r11,0(rp)		C store the new rp[0]
+	clgrjhe	%r1,n,L(out)		C 1 >= n: that was the only limb
+
+L(even):
+	sllg	idx,%r1,3		C idx = byte offset of the first remaining limb (0 or 8)
+	sllg	n,n,3			C Note that mp_size_t n will always be small enough so that n<<3 cannot overflow
+
+C Two limbs per iteration:
+C   rp[i+1]:rp[i] += hi(p0):lo(p0) + lo(p1):carry + both carry bits
+C where p0 = s1p[i] * s2 and p1 = s1p[i+1] * s2.  hi(p1) becomes the carry
+C limb for the next iteration.
+L(loop):
+	vl	%v22,0(idx,rp)		C load 16 bytes: rp[i] and rp[i+1]
+	lg	%r11,0(idx,s1p)		C %r11 = s1p[i]
+	lg	%r1,8(idx,s1p)		C %r1  = s1p[i+1]
+	mlgr	%r10,s2			C %r10:%r11 = p0
+	mlgr	%r0,s2			C %r0:%r1   = p1
+	vpdi	%v21,%v22,%v22,4	C swap doublewords: %v21 = rp[i+1]:rp[i]
+	vlvgp	%v7,%r10,%r11		C %v7 = hi(p0):lo(p0)
+	vlvgp	%v4,%r1,carry		C %v4 = lo(p1):carry
+	vacq	%v3,%v7,%v21,carry_vec1	C %v3 = %v7 + %v21 + first carry bit
+	vacq	%v5,%v3,%v4,carry_vec2	C %v5 = %v3 + %v4 + second carry bit
+	vacccq	carry_vec1,%v7,%v21,carry_vec1	C carry out of the first add
+	vacccq	carry_vec2,%v3,%v4,carry_vec2	C carry out of the second add
+	vpdi	%v22,%v5,%v5,4		C swap doublewords back into memory order
+	vst	%v22,0(idx,rp)		C store the new rp[i] and rp[i+1]
+	lgr	carry,%r0		C carry = hi(p1), consumed by the next iteration
+	aghi	idx,16			C advance by two limbs
+	clgrjl	idx,n,L(loop)		C repeat while idx < n (unsigned, in bytes)
+
+C Fold the three pending carries into the result register.
+L(out):
+	vag	carry_vec1,carry_vec1,carry_vec2	C add the two carry bits
+	vlgvg	%r2,carry_vec1,1	C %r2 = low doubleword of that sum
+	agr	%r2,carry		C plus the pending carry limb
+	lmg	%r8,%r11,64(%r15)	C restore %r8-%r11
+	br	%r14
+EPILOGUE()
-- 
2.26.2

_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel