Add MIPS32R1 MADDU-based *mul_1.asm functions. The code tries to keep the [accidental] property of MIPS-II counterparts: constant-time operation on 32x16 MDUs as found on e.g. 4KEc and some low-end MCUs. Even if that is unimportant, the performance cost is invisible.
It is faster on all tried MIPS32R1/R2/R5 CPUs (see the c/l table) and is expected to be fast with any pipelined MDU. So-called Area-Efficient MDU (optional on some MCUs) will run it *much* slower (~3x for addmul_1). While functions look similar (especially mul_1 and addmul_1), they are kept separate due to corner-case (N=1,2,3) tweaks for P5600 without any ill effect on 4KEc or 24KEc at least. diff -r 6ab06c72027e -r 789677d6e8b2 configure.ac --- a/configure.ac +++ b/configure.ac @@ -1040,6 +1040,10 @@ mipsisa32r2*-*-*) SPEED_CYCLECOUNTER_OBJ=mips32r2.lo cyclecounter_size=1 + path="mips32/r1 mips32" + ;; + mipsisa32*-*-*) + path="mips32/r1 mips32" ;; esac ;; diff -r 6ab06c72027e -r 789677d6e8b2 mpn/mips32/r1/addmul_1.asm --- /dev/null +++ b/mpn/mips32/r1/addmul_1.asm @@ -0,0 +1,79 @@ +include(`../config.m4') + +C cycles/limb +C 4KEc 9.68 +C 24Kc 9.52 +C 24KEc 9.55 +C P5600 7.80 +C XBurst 13.55 + +C INPUT PARAMETERS +C rp $a0 +C up $a1 +C n $a2 +C vl $a3 + +ASM_START() + .set noat +PROLOGUE(mpn_addmul_1) + lw $v1,0($a1) C L0 + ori $at,$zero,1 + multu $v1,$a3 C M0 + lw $t0,0($a0) C L1, 32x16 MDU stall + addiu $t1,$a2,-2 + beq $at,$a2,1f + maddu $t0,$at C M0 + mfhi $v0 C M0 carry + lw $t2,4($a1) C L1 + beqz $t1,23f + lw $v1,4($a0) C L1 + mflo $t0 C M0 + andi $t3,$t1,1 + sll $a2,$t1,2 + beqz $t3,0f + addu $a2,$a2,$a1 + multu $t2,$a3 C M1 + lw $t2,8($a1) C L2, 32x16 MDU stall + maddu $v1,$at C M1 + maddu $v0,$at C M1 + mfhi $v0 C M1 carry + lw $v1,8($a0) C L2 + sw $t0,0($a0) C S0 + beq $at,$t1,23f + addiu $a0,$a0,4 + addiu $a1,$a1,4 + mflo $t0 C M1 +0: addiu $a1,$a1,8 + addiu $a0,$a0,8 + multu $t2,$a3 C M1 + lw $t3,0($a1) C L2, 32x16 MDU stall + maddu $v1,$at C M1 + lw $t4,0($a0) C L2 + maddu $v0,$at C M1 + mfhi $v0 C M1 carry + sw $t0,-8($a0) C S0 + mflo $t1 C M1 + multu $t3,$a3 C M2 + lw $t2,4($a1) C L3, 32x16 MDU stall + maddu $t4,$at C M2 + lw $v1,4($a0) C L3 + maddu $v0,$at C M2 + mfhi $v0 C M2 carry + sw $t1,-4($a0) C S1 + bne $a1,$a2,0b +23: mflo $t0 C M2 + multu 
$t2,$a3 C M3 + nop C 32x16 MDU stall + maddu $v1,$at C M3 + maddu $v0,$at C M3 + mflo $at C M3 + mfhi $v0 C M3 carry + sw $t0,0($a0) C S2 + jr $ra + sw $at,4($a0) C S3 +1: mflo $at + mfhi $v0 + jr $ra + sw $at,0($a0) +EPILOGUE(mpn_addmul_1) +ASM_END() diff -r 6ab06c72027e -r 789677d6e8b2 mpn/mips32/r1/mul_1.asm --- /dev/null +++ b/mpn/mips32/r1/mul_1.asm @@ -0,0 +1,69 @@ +include(`../config.m4') + +C cycles/limb +C 4KEc 7.66 +C 24Kc 7.54 +C 24KEc 7.55 +C P5600 7.04 +C XBurst 10.54 + +C INPUT PARAMETERS +C rp $a0 +C up $a1 +C n $a2 +C vl $a3 + +ASM_START() + .set noat +PROLOGUE(mpn_mul_1) + lw $v1,0($a1) C L0 + ori $at,$zero,1 + multu $v1,$a3 C M0 + beq $at,$a2,1f C 32x16 MDU stall + addiu $t1,$a2,-2 + mfhi $v0 C M0 carry + beqz $t1,23f + lw $t2,4($a1) C L1 + mflo $t0 C M0 + andi $t3,$t1,1 + sll $a2,$t1,2 + beqz $t3,0f + addu $a2,$a2,$a1 + multu $t2,$a3 C M1 + lw $t2,8($a1) C L2, 32x16 MDU stall + maddu $v0,$at C M1 + mfhi $v0 C M1 carry + sw $t0,0($a0) C S0 + beq $at,$t1,23f + addiu $a0,$a0,4 + addiu $a1,$a1,4 + mflo $t0 C M1 +0: addiu $a1,$a1,8 + addiu $a0,$a0,8 + multu $t2,$a3 C M1 + lw $t3,0($a1) C L2, 32x16 MDU stall + maddu $v0,$at C M1 + mfhi $v0 C M1 carry + sw $t0,-8($a0) C S0 + mflo $t1 C M1 + multu $t3,$a3 C M2 + lw $t2,4($a1) C L3, 32x16 MDU stall + maddu $v0,$at C M2 + mfhi $v0 C M2 carry + sw $t1,-4($a0) C S1 + bne $a1,$a2,0b +23: mflo $t0 C M2 + multu $t2,$a3 C M3 + nop C 32x16 MDU stall + maddu $v0,$at C M3 + mflo $at C M3 + mfhi $v0 C M3 carry + sw $t0,0($a0) C S2 + jr $ra + sw $at,4($a0) C S3 +1: mflo $at + mfhi $v0 + jr $ra + sw $at,0($a0) +EPILOGUE(mpn_mul_1) +ASM_END() diff -r 6ab06c72027e -r 789677d6e8b2 mpn/mips32/r1/submul_1.asm --- /dev/null +++ b/mpn/mips32/r1/submul_1.asm @@ -0,0 +1,85 @@ +include(`../config.m4') + +C cycles/limb +C 4KEc 10.72 +C 24Kc 10.54 +C 24KEc 10.55 +C P5600 8.07 +C XBurst 13.55 + +C INPUT PARAMETERS +C rp $a0 +C up $a1 +C n $a2 +C vl $a3 + +ASM_START() + .set noat +PROLOGUE(mpn_submul_1) + lw $v1,0($a0) C L1 + ori 
$at,$zero,1 + lw $t0,0($a1) C L0 + multu $v1,$at C M0 + msubu $t0,$a3 C M0 + beq $at,$a2,1f C 32x16 MDU stall + addiu $t1,$a2,-2 + mfhi $v0 C M0 carry + lw $v1,4($a0) C L1 + beqz $t1,23f + lw $t2,4($a1) C L1 + mflo $t0 C M0 + andi $t3,$t1,1 + sll $a2,$t1,2 + beqz $t3,0f + addu $a2,$a2,$a1 + negu $v0 C M1 + multu $v1,$at C M1 + msubu $t2,$a3 C M1 + addiu $a0,$a0,4 C 32x16 MDU stall + msubu $v0,$at C M1 + mfhi $v0 C M1 carry + lw $v1,4($a0) C L2 + lw $t2,8($a1) C L2 + beq $at,$t1,23f + sw $t0,-4($a0) C S0 + addiu $a1,$a1,4 + mflo $t0 C M1 +0: addiu $a0,$a0,8 + addiu $a1,$a1,8 + multu $v1,$at C M1 + lw $t4,0($a0) C L2 + lw $t3,0($a1) C L2 + msubu $t2,$a3 C M1 + negu $v0 C M1, 32x16 MDU stall + msubu $v0,$at C M1 + mfhi $v0 C M1 carry + sw $t0,-8($a0) C S0 + mflo $t1 C M1 + multu $t4,$at C M2 + lw $v1,4($a0) C L3 + lw $t2,4($a1) C L3 + msubu $t3,$a3 C M2 + negu $v0 C M2, 32x16 MDU stall + msubu $v0,$at C M2 + mfhi $v0 C M2 carry + sw $t1,-4($a0) C S1 + bne $a1,$a2,0b +23: mflo $t0 C M2 + multu $v1,$at C M3 + msubu $t2,$a3 C M3 + negu $v0 C M3, 32x16 MDU stall + sw $t0,0($a0) C S2 + msubu $v0,$at C M3 + mflo $at C M3 + mfhi $v0 C M3 carry + sw $at,4($a0) C S3 + jr $ra + negu $v0 +1: mflo $at + mfhi $v0 + sw $at,0($a0) + jr $ra + negu $v0 +EPILOGUE(mpn_submul_1) + +ASM_END() _______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org https://gmplib.org/mailman/listinfo/gmp-devel