ni...@lysator.liu.se (Niels Möller) writes: >> broot.c: mpn_broot and your mpn_xxx that computes a^{1/n-1}, >> perhaps call the latter mpn_brootinvm1 > > I'll start with this one, then.
Here's an initial patch, integrating my code from a few months ago. Some TODO items: 0. Support in speed, for benchmarking. 1. Wraparound optimizations. 2. Consider rearranging the iteration slightly. Now it is r' <-- r - r * (a^{k-1} r^k - 1) / n (A) (converging to a^{1/n - 1}). Rewrite the powering as r' <-- r - r * ((a r)^{k-1} r - 1) / n (B) Then we get rid of the (invariant, if we compute it at full precicion up front) power a^{k-1}, at the cost of an extra mullo (of various smaller sizes) in the loop. It makes some sense because a * r can be computed using wraparound (the low half is the same as in the previous iteration), and a * r = a^{1/n}, so this is the number we'd really like to compute, so if we do this we should merge mpn_broot and mpn_broot_invm1 back into a single function. (I vaguely to recall similar issues in other newton iterations, where it might be advantageous to, e.g., write an iteration to 1/sqrt(x) and get sqrt(x) almost for free). Or possibly r' <-- r - r * ((a^2 r^2)^{(k-1)/2} r - 1) / n (C) This has the potential advantage that the computation of a^2 r^2 is one invariant sqrlo (the a^2), one plain squaring (the r^2), and one balanced mullo (or mulmod_bnm1). In contrast, r^k (in A) needs powlo with larger output than input, and a * r (in B) is a 2x1 unbalanced mullo. 3. Write special code for the iteration increasing from one two two limbs of precision. Regards, /Niels
diff -Nrc2 gmp.0b25e4cfd719/configure.in gmp/configure.in *** gmp.0b25e4cfd719/configure.in 2012-10-27 21:37:50.000000000 +0200 --- gmp/configure.in 2012-10-27 21:37:50.000000000 +0200 *************** *** 2680,2684 **** dcpi1_bdiv_q dcpi1_bdiv_qr \ mu_bdiv_q mu_bdiv_qr \ ! bdiv_q bdiv_qr \ divexact bdiv_dbm1c redc_1 redc_2 redc_n powm powlo powm_sec \ trialdiv remove \ --- 2680,2684 ---- dcpi1_bdiv_q dcpi1_bdiv_qr \ mu_bdiv_q mu_bdiv_qr \ ! bdiv_q bdiv_qr broot \ divexact bdiv_dbm1c redc_1 redc_2 redc_n powm powlo powm_sec \ trialdiv remove \ diff -Nrc2 gmp.0b25e4cfd719/gmp-impl.h gmp/gmp-impl.h *** gmp.0b25e4cfd719/gmp-impl.h 2012-10-27 21:37:50.000000000 +0200 --- gmp/gmp-impl.h 2012-10-27 21:37:50.000000000 +0200 *************** *** 1630,1633 **** --- 1630,1635 ---- __GMP_DECLSPEC mp_size_t mpn_rootrem (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t); + #define mpn_broot __MPN(broot) + __GMP_DECLSPEC void mpn_broot (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t); #if defined (_CRAY) diff -Nrc2 gmp.0b25e4cfd719/mpn/generic/broot.c gmp/mpn/generic/broot.c *** gmp.0b25e4cfd719/mpn/generic/broot.c 1970-01-01 01:00:00.000000000 +0100 --- gmp/mpn/generic/broot.c 2012-10-27 21:37:50.000000000 +0200 *************** *** 0 **** --- 1,184 ---- + /* mpn_broot -- Compute hensel sqrt + + Contributed to the GNU project by Niels Möller + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + + Copyright 2012 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 3 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + + #include "gmp.h" + #include "gmp-impl.h" + + /* Computes a^e (mod B). Uses right-to-left binary algorithm, since + typical use will have e small. */ + static mp_limb_t + powlimb (mp_limb_t a, mp_limb_t e) + { + mp_limb_t r = 1; + mp_limb_t s = a; + + for (r = 1, s = a; e > 0; e >>= 1, s *= s) + if (e & 1) + r *= s; + + return r; + } + + /* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd. + + Iterates + + r' <-- r - r * (a^{k-1} r^k - 1) / n + + If + + a^{k-1} r^k = 1 (mod 2^m), + + then + + a^{k-1} r'^k = 1 (mod 2^{2m}), + */ + static void + mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k) + { + mp_size_t sizes[GMP_LIMB_BITS * 2]; + mp_ptr akm1, tp, rnp, ep, scratch; + mp_limb_t a0, r0, km1, kinv; + mp_size_t rn; + unsigned i; + + TMP_DECL; + + ASSERT (n > 0); + ASSERT (ap[0] & 1); + ASSERT (k & 1); + ASSERT (k >= 3); + + TMP_MARK; + + akm1 = TMP_ALLOC_LIMBS (4*n); + tp = akm1 + n; + + km1 = k-1; + /* FIXME: Could arrange the iteration so we don't need to compute + this up front, computing a^{k-1} * r^k as (a r)^{k-1} * r. Note + that we can use wraparound also for a*r, since the low half is + unchanged from the previous iteration. Or possibly mulmid. Also, + a r = a^{1/k}, so we get that value too, for free? */ + mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */ + + a0 = ap[0]; + binvert_limb (kinv, k); + + /* 4 bits: a^{1/k - 1} (mod 16): + + a % 8 + 1 3 5 7 + k%4 +------- + 1 |1 1 1 1 + 3 |1 9 9 1 + */ + r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8); + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f)); /* 8 bits */ + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */ + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); /* 32 bits */ + #if GMP_NUMB_BITS > 32 + { + unsigned prec = 32; + do + { + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); + prec *= 2; + } + while (prec < GMP_NUMB_BITS); + } + #endif + + rp[0] = r0; + if (n == 1) + { + TMP_FREE; + return; + } + + /* FIXME: Special case for two limb iteration. */ + rnp = TMP_ALLOC_LIMBS (2*n); + ep = rnp + n; + + /* FIXME: Possible to this on the fly with some bit fiddling. */ + for (i = 0; n > 1; n = (n + 1)/2) + sizes[i++] = n; + + rn = 1; + + while (i-- > 0) + { + /* Compute x^k. What's the best way to handle the doubled + precision? */ + MPN_ZERO (rp + rn, sizes[i] - rn); + mpn_powlo (rnp, rp, &k, 1, sizes[i], tp); + + /* Multiply by a^{k-1}. Can use wraparound; low part is + 000...01. */ + + mpn_mullo_n (ep, rnp, akm1, sizes[i]); + ASSERT (ep[0] == 1); + ASSERT (rn == 1 || mpn_zero_p (ep + 1, rn - 1)); + + ASSERT (sizes[i] <= 2*rn); + mpn_pi1_bdiv_q_1 (ep, ep + rn, sizes[i] - rn, k, kinv, 0); + + /* Multiply by x, plain mullo. */ + mpn_mullo_n (rp + rn, ep, rp, sizes[i] - rn); + + /* FIXME: Avoid negation, e.g., by using a bdiv_q_1 variant + returning -q. */ + mpn_neg (rp + rn, rp + rn, sizes[i] - rn); + + rn = sizes[i]; + } + TMP_FREE; + } + + /* Computes a^{1/k} (mod B^n). Both a and k must be odd. */ + void + mpn_broot (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k) + { + mp_ptr tp; + TMP_DECL; + + ASSERT (n > 0); + ASSERT (ap[0] & 1); + ASSERT (k & 1); + + if (k == 1) + { + MPN_COPY (rp, ap, n); + return; + } + + TMP_MARK; + tp = TMP_ALLOC_LIMBS (n); + + mpn_broot_invm1 (tp, ap, n, k); + mpn_mullo_n (rp, tp, ap, n); + + TMP_FREE; + } diff -Nrc2 gmp.0b25e4cfd719/tests/mpn/Makefile.am gmp/tests/mpn/Makefile.am *** gmp.0b25e4cfd719/tests/mpn/Makefile.am 2012-10-27 21:37:50.000000000 +0200 --- gmp/tests/mpn/Makefile.am 2012-10-27 21:37:50.000000000 +0200 *************** *** 29,33 **** t-toom2-sqr t-toom3-sqr t-toom4-sqr t-toom6-sqr t-toom8-sqr \ t-mul t-mullo t-mulmod_bnm1 t-sqrmod_bnm1 t-mulmid \ ! t-hgcd t-hgcd_appr t-matrix22 t-invert t-div t-bdiv EXTRA_DIST = toom-shared.h toom-sqr-shared.h --- 29,33 ---- t-toom2-sqr t-toom3-sqr t-toom4-sqr t-toom6-sqr t-toom8-sqr \ t-mul t-mullo t-mulmod_bnm1 t-sqrmod_bnm1 t-mulmid \ ! t-hgcd t-hgcd_appr t-matrix22 t-invert t-div t-bdiv t-broot EXTRA_DIST = toom-shared.h toom-sqr-shared.h diff -Nrc2 gmp.0b25e4cfd719/tests/mpn/t-broot.c gmp/tests/mpn/t-broot.c *** gmp.0b25e4cfd719/tests/mpn/t-broot.c 1970-01-01 01:00:00.000000000 +0100 --- gmp/tests/mpn/t-broot.c 2012-10-27 21:37:50.000000000 +0200 *************** *** 0 **** --- 1,100 ---- + /* Copyright 2012 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 3 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + PARTICULAR PURPOSE. See the GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along with + this program. If not, see http://www.gnu.org/licenses/. */ + + + #include <stdlib.h> /* for strtol */ + #include <stdio.h> /* for printf */ + + #include "gmp.h" + #include "gmp-impl.h" + #include "longlong.h" + #include "tests/tests.h" + + #define MAX_LIMBS 150 + #define COUNT 500 + + int + main (int argc, char **argv) + { + gmp_randstate_ptr rands; + + mp_ptr ap, rp, pp, scratch; + int count = COUNT; + unsigned i; + TMP_DECL; + + if (argc > 1) + { + char *end; + count = strtol (argv[1], &end, 0); + if (*end || count <= 0) + { + fprintf (stderr, "Invalid test count: %s.\n", argv[1]); + return 1; + } + } + + tests_start (); + rands = RANDS; + + ap = TMP_ALLOC_LIMBS (MAX_LIMBS); + rp = TMP_ALLOC_LIMBS (MAX_LIMBS); + pp = TMP_ALLOC_LIMBS (MAX_LIMBS); + scratch = TMP_ALLOC_LIMBS (3*MAX_LIMBS); /* For mpn_powlo */ + + for (i = 0; i < count; i++) + { + mp_size_t n; + mp_limb_t k; + int c; + + n = 1 + gmp_urandomm_ui (rands, MAX_LIMBS); + + if (i & 1) + mpn_random2 (ap, n); + else + mpn_random (ap, n); + + ap[0] |= 1; + + if (i < 100) + k = 3 + 2*i; + else + { + mpn_random (&k, 1); + if (k < 3) + k = 3; + else + k |= 1; + } + mpn_broot (rp, ap, n, k); + mpn_powlo (pp, rp, &k, 1, n, scratch); + + MPN_CMP (c, ap, pp, n); + if (c != 0) + { + gmp_fprintf (stderr, + "mpn_broot returned bad result: %u limbs\n", + (unsigned) n); + gmp_fprintf (stderr, "k = %Mx\n", k); + gmp_fprintf (stderr, "a = %Nx\n", ap, n); + gmp_fprintf (stderr, "r = %Nx\n", rp, n); + gmp_fprintf (stderr, "r^n = %Nx\n", pp, n); + abort (); + } + } + TMP_FREE; + tests_end (); + return 0; + }
-- Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26. Internet email is subject to wholesale government surveillance.
_______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org http://gmplib.org/mailman/listinfo/gmp-devel