ni...@lysator.liu.se (Niels Möller) writes:

> Your idea of conditionally adding the invariant d * B2 at the right
> place is also interesting,
I've tried it out. It works nicely, but gives no speedup on my machine.
I'm attaching another patch. There are then 4 methods:

  method 1: Old loop around udiv_qrnnd_preinv.

  method 2: The clever code from 10 years ago, with the
            microoptimization I committed the other day.

  method 3: More or less the same as I posted a few days ago.

  method 4: Postpones the update u1 -= u2 & d, taking it off the
            critical recurrence chain. Instead, conditionally adds in
            the constant B2 * (B - d) to the lower u limbs.

Benchmark on my laptop:

$ ./speed -p 1000000 -s 2-20 -C mpn_div_qr_1n_pi1.0x8765432108765432 mpn_div_qr_1n_pi1_1.0x8765432108765432 mpn_div_qr_1n_pi1_2.0x8765432108765432 mpn_div_qr_1n_pi1_3.0x8765432108765432 mpn_div_qr_1n_pi1_4.0x8765432108765432
overhead 2.63 cycles, precision 1000000 units of 7.16e-10 secs, CPU freq 1396.05 MHz

         asm       method 1   method 2   method 3   method 4
 2      4.4566    #3.9635     5.8490     5.4085     6.0087
 3      4.4323    #4.2441     5.8115     5.1158     5.7832
 4      4.5348    #4.3992     5.7306     5.0807     5.8611
 5     #4.5534     4.6698     5.7493     4.9803     5.5605
 6     #4.5653     4.8412     5.7497     4.9129     5.6516
 7     #4.6069     5.1388     5.7388     4.8811     5.6110
 8     #4.6202     5.5073     5.7423     4.9359     5.5695
 9     #4.6433     5.7341     5.7537     4.9357     5.5407
10     #4.6436     5.9595     5.7400     4.9231     5.5428
11     #4.6698     6.1348     5.7449     4.9430     5.5237
12     #4.6395     6.2541     5.7378     4.9452     5.5239
13     #4.6905     6.3761     5.7373     4.9482     5.5085
14     #4.6700     6.4692     5.8173     4.9447     5.5006
15     #4.6643     6.5548     5.7426     4.9644     5.4958
16     #4.6809     6.6305     5.7439     4.9625     5.4924
17     #4.6800     6.6901     5.7418     4.9576     5.4760
18     #4.6903     6.7440     5.7436     4.9866     5.4840
19     #4.6886     6.7891     5.7366     4.9753     5.4872
20     #4.6818     6.8370     5.7405     4.9783     5.4820

("asm" is the assembly mpn_div_qr_1n_pi1; # marks the fastest entry in
each row.)

So on my laptop, method 3 wins, just 0.3 cycles/limb behind the
assembly implementation. It's also curious that method 1 wins for the
smaller operands; maybe that's the cost of computing B2 = -d * dinv?

I think an assembly implementation of method 3 may get register
challenged (but I haven't looked at what the compiler generates). To
reduce the number of live registers, one might need to complete the u
accumulation before the q multiply.

Regards,
/Niels
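To make the role of that constant concrete, here is a small standalone
program (not part of the patch; the 32-bit limb size, the divisor, and
the helper name "reciprocal" are made up for illustration) checking the
identity these loops rely on: with the usual precomputed reciprocal
dinv = floor((B^2-1)/d) - B, the limb B2 = -d * dinv is congruent to
B^2 mod d, so the top limb's contribution u1 * B^2 can be folded into
the lower limbs as the single product u1 * B2.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t limb;
#define B ((uint64_t) 1 << 32)

/* dinv = floor((B^2 - 1) / d) - B, for a normalized divisor d
   (high bit set); B^2 - 1 fits in a uint64_t with 32-bit limbs. */
static limb
reciprocal (limb d)
{
  assert (d >> 31);	/* d must be normalized */
  return (limb) (UINT64_MAX / d - B);
}

int
main (void)
{
  limb d = 0x80000007;	/* arbitrary normalized divisor */
  limb dinv = reciprocal (d);
  limb B2 = (limb) (0 - (uint64_t) d * dinv);	/* B2 = -d*dinv mod B */

  /* B2 is congruent to B^2 mod d. */
  uint64_t Bmodd = B % d;
  assert (B2 % d == Bmodd * Bmodd % d);

  /* Hence u1*B^2 and u1*B2 agree mod d: the per-limb reduction step. */
  limb u1 = 0xdeadbeef;
  assert ((uint64_t) u1 * B2 % d == u1 * Bmodd % d * Bmodd % d);

  /* Method 4 additionally precomputes the two-limb constant
     B2 * (B - d), conditionally added to the lower u limbs. */
  uint64_t B2d = (uint64_t) B2 * (limb) (B - d);

  printf ("dinv = %#lx, B2 = %#lx, B2*(B-d) = %#llx\n",
	  (unsigned long) dinv, (unsigned long) B2,
	  (unsigned long long) B2d);
  return 0;
}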
diff -r a9f0db9f7199 mpn/generic/div_qr_1n_pi1.c
--- a/mpn/generic/div_qr_1n_pi1.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/mpn/generic/div_qr_1n_pi1.c	Sun Jun 06 19:47:36 2021 +0200
@@ -298,6 +298,208 @@
   return u0;
 }
+#elif DIV_QR_1N_METHOD == 3
+
+/* This variant handles carry from the u update earlier. This gives a
+   longer critical path, but reduces the work needed for the
+   quotients. */
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+                   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t cy, u0;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+  u0 = up[n-1];	/* Early read, to allow qp == up. */
+
+  add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0);
+  u1 -= cy & d;
+  q1 -= cy;
+  qp[n-1] = q1;
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+      mp_limb_t t1, t0;
+
+      /* Additions for the q update:
+       *   +-------+
+       *   |u1 * v |
+       *   +---+---+
+       *       | u1|
+       *       +---+
+       *       | 1 |  (conditional on {u1, u0} carry)
+       *       +---+
+       * +     | q0|
+       * -+---+---+---+
+       *  | q2| q1| q0|
+       *  +---+---+---+
+       *
+       * Additions for the u update:
+       *   +-------+
+       *   |u1 * B2|
+       *   +---+---+
+       * + |u0 |u-1|
+       *   +---+---+
+       * - | d |        (conditional on carry)
+       *  ---+---+---+
+       *     |u1 | u0|
+       *     +---+---+
+       */
+      umul_ppmm (p1, p0, u1, B2);
+      ADDC_LIMB (q2, q1, u1, q0);
+      umul_ppmm (t1, t0, u1, dinv);
+      add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0);
+      u1 -= cy & d;
+
+      /* t1 <= B-2, so cy can be added in without overflow. */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
+
+#elif DIV_QR_1N_METHOD == 4
+
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+                   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t u2, u0;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t B2d0, B2d1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
+  /* B2 * (B-d) */
+  umul_ppmm (B2d1, B2d0, B2, -d);
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+
+  add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0);
+
+  /* After read of up[n-1], to allow qp == up. */
+  qp[n-1] = q1 - u2;
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+      mp_limb_t t1, t0;
+
+      /* Additions for the q update. *After* u1 -= u2 & d adjustment.
+       *   +-------+
+       *   |u1 * v |
+       *   +---+---+
+       *       | u1|
+       *       +---+
+       *       | 1 |  (conditional on {u1, u0} carry)
+       *       +---+
+       * +     | q0|
+       * -+---+---+---+
+       *  | q2| q1| q0|
+       *  +---+---+---+
+       *
+       * Additions for the u update. *Before* u1 -= u2 & d adjustment.
+       *    +-------+
+       *    |u1 * B2|
+       *    +---+---+
+       *    |u0 |u-1|
+       *    +---+---+
+       * +  |B2(B-d)|  (conditional on u2)
+       * -+---+---+---+
+       *  |u2 |u1 | u0|
+       *  +---+---+---+
+       */
+      /* Multiply with unadjusted u1, to shorten critical path. */
+      umul_ppmm (p1, p0, u1, B2);
+      u1 -= (d & u2);
+      ADDC_LIMB (q2, q1, u1, q0);
+      umul_ppmm (t1, t0, u1, dinv);
+
+      add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0);
+      add_mssaaaa (u2, u1, u0, u1, u0, p1, p0);
+      u2 += cy;
+      ASSERT(-u2 <= 1);
+
+      /* t1 <= B-2, so u2 can be added in without overflow. */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+  u1 -= u2 & d;
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
 #else
 #error Unknown DIV_QR_1N_METHOD
 #endif
diff -r a9f0db9f7199 tune/Makefile.am
--- a/tune/Makefile.am	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/Makefile.am	Sun Jun 06 19:47:36 2021 +0200
@@ -53,7 +53,8 @@
 
 libspeed_la_SOURCES = \
   common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \
-  div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1_tune.c \
+  div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1n_pi1_3.c \
+  div_qr_1n_pi1_4.c div_qr_1_tune.c \
   freq.c \
   gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \
   hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \
diff -r a9f0db9f7199 tune/common.c
--- a/tune/common.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/common.c	Sun Jun 06 19:47:36 2021 +0200
@@ -718,6 +718,16 @@
 {
   SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_2);
 }
+double
+speed_mpn_div_qr_1n_pi1_3 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_3);
+}
+double
+speed_mpn_div_qr_1n_pi1_4 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_4);
+}
 
 double
 speed_mpn_div_qr_1 (struct speed_params *s)
diff -r a9f0db9f7199 tune/div_qr_1_tune.c
--- a/tune/div_qr_1_tune.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/div_qr_1_tune.c	Sun Jun 06 19:47:36 2021 +0200
@@ -34,10 +34,14 @@
 
 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_3 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_4 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 
 #if !HAVE_NATIVE_mpn_div_qr_1n_pi1
-#define __gmpn_div_qr_1n_pi1 \
-  (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2)
+#define __gmpn_div_qr_1n_pi1						\
+  (div_qr_1n_pi1_method <= 2						\
+   ? (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2) \
+   : (div_qr_1n_pi1_method == 3 ? mpn_div_qr_1n_pi1_3 : mpn_div_qr_1n_pi1_4))
 #endif
 
 #undef mpn_div_qr_1
diff -r a9f0db9f7199 tune/div_qr_1n_pi1_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tune/div_qr_1n_pi1_3.c	Sun Jun 06 19:47:36 2021 +0200
@@ -0,0 +1,38 @@
+/* mpn/generic/div_qr_1n_pi1.c method 3.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef DIV_QR_1N_METHOD
+#define DIV_QR_1N_METHOD 3
+#undef mpn_div_qr_1n_pi1
+#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_3
+
+#include "mpn/generic/div_qr_1n_pi1.c"
diff -r a9f0db9f7199 tune/div_qr_1n_pi1_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tune/div_qr_1n_pi1_4.c	Sun Jun 06 19:47:36 2021 +0200
@@ -0,0 +1,38 @@
+/* mpn/generic/div_qr_1n_pi1.c method 4.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef DIV_QR_1N_METHOD
+#define DIV_QR_1N_METHOD 4
+#undef mpn_div_qr_1n_pi1
+#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_4
+
+#include "mpn/generic/div_qr_1n_pi1.c"
diff -r a9f0db9f7199 tune/speed.c
--- a/tune/speed.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/speed.c	Sun Jun 06 19:47:36 2021 +0200
@@ -244,6 +244,8 @@
   { "mpn_div_qr_1n_pi1",  speed_mpn_div_qr_1n_pi1, FLAG_R },
   { "mpn_div_qr_1n_pi1_1",speed_mpn_div_qr_1n_pi1_1, FLAG_R },
   { "mpn_div_qr_1n_pi1_2",speed_mpn_div_qr_1n_pi1_2, FLAG_R },
+  { "mpn_div_qr_1n_pi1_3",speed_mpn_div_qr_1n_pi1_3, FLAG_R },
+  { "mpn_div_qr_1n_pi1_4",speed_mpn_div_qr_1n_pi1_4, FLAG_R },
   { "mpn_div_qr_1",       speed_mpn_div_qr_1, FLAG_R },
 
   { "mpn_div_qr_2n",      speed_mpn_div_qr_2n,        },
diff -r a9f0db9f7199 tune/speed.h
--- a/tune/speed.h	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/speed.h	Sun Jun 06 19:47:36 2021 +0200
@@ -210,6 +210,8 @@
 double speed_mpn_div_qr_1n_pi1 (struct speed_params *);
 double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *);
 double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1_3 (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1_4 (struct speed_params *);
 double speed_mpn_div_qr_1 (struct speed_params *);
 double speed_mpn_div_qr_2n (struct speed_params *);
 double speed_mpn_div_qr_2u (struct speed_params *);
@@ -482,6 +484,8 @@
 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_3 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_4 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
 mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
diff -r a9f0db9f7199 tune/tuneup.c
--- a/tune/tuneup.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/tuneup.c	Sun Jun 06 19:47:36 2021 +0200
@@ -2253,16 +2253,18 @@
 
   if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
     {
       static struct param_t  param;
-      speed_function_t f[2] =
+      speed_function_t f[] =
 	{
 	  speed_mpn_div_qr_1n_pi1_1,
 	  speed_mpn_div_qr_1n_pi1_2,
+	  speed_mpn_div_qr_1n_pi1_3,
+	  speed_mpn_div_qr_1n_pi1_4,
 	};
       s.size = 10;
       s.r = randlimb_norm ();
 
-      one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
+      one_method (numberof(f), f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
     }
 
   {
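For readers following the patch, a portable C sketch of the
add_mssaaaa primitive it leans on; this is modeled on my reading of
the generic fallback in mpn/generic/div_qr_1n_pi1.c (GMP defines it
as a macro, so the function form and names here are illustrative
only). It adds two 2-limb numbers and delivers the carry-out
sign-extended to a full limb mask, which is what makes branch-free
updates like u1 -= cy & d and q1 -= cy work.

#include <assert.h>
#include <stdint.h>

typedef uint64_t limb;

/* {*sh, *sl} = {ah, al} + {bh, bl}; *m = -(carry out), i.e. 0 or ~0. */
static void
add_mssaaaa (limb *m, limb *sh, limb *sl,
	     limb ah, limb al, limb bh, limb bl)
{
  limb s0 = al + bl;
  limb c0 = s0 < al;	/* carry out of the low limb */
  limb s1 = ah + bh;
  limb c1 = s1 < ah;	/* carry out of the high limb... */
  s1 += c0;
  c1 += s1 < c0;	/* ...including the propagated low carry */
  *sl = s0;
  *sh = s1;
  *m = -c1;		/* carry as a full mask */
}

int
main (void)
{
  limb m, h, l;

  /* Adding {0, 1} to the all-ones double limb carries out the top. */
  add_mssaaaa (&m, &h, &l, ~(limb) 0, ~(limb) 0, 0, 1);
  assert (l == 0 && h == 0 && m == ~(limb) 0);

  /* No carry: the mask is zero. */
  add_mssaaaa (&m, &h, &l, 1, 2, 3, 4);
  assert (l == 6 && h == 4 && m == 0);

  return 0;
}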
-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.