ni...@lysator.liu.se (Niels Möller) writes:

>> broot.c:    mpn_broot and your mpn_xxx that computes a^{1/n-1},
>>             perhaps call the latter mpn_brootinvm1
>
> I'll start with this one, then.

Here's an initial patch, integrating my code from a few months ago. Some
TODO items:

0. Support in speed, for benchmarking.
 
1. Wraparound optimizations.

2. Consider rearranging the iteration slightly. Now it is

     r' <-- r - r * (a^{k-1} r^k - 1) / n     (A)
     
   (converging to a^{1/n - 1}). Rewrite the powering as

     r' <-- r - r * ((a r)^{k-1} r - 1) / n   (B)

   Then we get rid of the (invariant, if we compute it at full precicion
   up front) power a^{k-1}, at the cost of an extra mullo (of various
   smaller sizes) in the loop. It makes some sense because a * r can be
   computed using wraparound (the low half is the same as in the
   previous iteration), and a * r = a^{1/n}, so this is the number we'd
   really like to compute, so if we do this we should merge mpn_broot
   and mpn_broot_invm1 back into a single function. (I vaguely to recall
   similar issues in other newton iterations, where it might be
   advantageous to, e.g., write an iteration to 1/sqrt(x) and get
   sqrt(x) almost for free).

   Or possibly

     r' <-- r - r * ((a^2 r^2)^{(k-1)/2} r - 1) / n  (C)
   
   This has the potential advantage that the computation of a^2 r^2 is
   one invariant sqrlo (the a^2), one plain squaring (the r^2), and one
   balanced mullo (or mulmod_bnm1). In contrast, r^k (in A) needs powlo
   with larger output than input, and a * r (in B) is a 2x1 unbalanced
   mullo.

3. Write special code for the iteration increasing from one two two
   limbs of precision.

Regards,
/Niels

diff -Nrc2 gmp.0b25e4cfd719/configure.in gmp/configure.in
*** gmp.0b25e4cfd719/configure.in	2012-10-27 21:37:50.000000000 +0200
--- gmp/configure.in	2012-10-27 21:37:50.000000000 +0200
***************
*** 2680,2684 ****
    dcpi1_bdiv_q dcpi1_bdiv_qr						   \
    mu_bdiv_q mu_bdiv_qr							   \
!   bdiv_q bdiv_qr							   \
    divexact bdiv_dbm1c redc_1 redc_2 redc_n powm powlo powm_sec		   \
    trialdiv remove							   \
--- 2680,2684 ----
    dcpi1_bdiv_q dcpi1_bdiv_qr						   \
    mu_bdiv_q mu_bdiv_qr							   \
!   bdiv_q bdiv_qr broot							   \
    divexact bdiv_dbm1c redc_1 redc_2 redc_n powm powlo powm_sec		   \
    trialdiv remove							   \
diff -Nrc2 gmp.0b25e4cfd719/gmp-impl.h gmp/gmp-impl.h
*** gmp.0b25e4cfd719/gmp-impl.h	2012-10-27 21:37:50.000000000 +0200
--- gmp/gmp-impl.h	2012-10-27 21:37:50.000000000 +0200
***************
*** 1630,1633 ****
--- 1630,1635 ----
  __GMP_DECLSPEC mp_size_t mpn_rootrem (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t);
  
+ #define mpn_broot __MPN(broot)
+ __GMP_DECLSPEC void mpn_broot (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t);
  
  #if defined (_CRAY)
diff -Nrc2 gmp.0b25e4cfd719/mpn/generic/broot.c gmp/mpn/generic/broot.c
*** gmp.0b25e4cfd719/mpn/generic/broot.c	1970-01-01 01:00:00.000000000 +0100
--- gmp/mpn/generic/broot.c	2012-10-27 21:37:50.000000000 +0200
***************
*** 0 ****
--- 1,184 ----
+ /* mpn_broot -- Compute hensel sqrt
+ 
+    Contributed to the GNU project by Niels Möller
+ 
+    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+ 
+ Copyright 2012 Free Software Foundation, Inc.
+ 
+ This file is part of the GNU MP Library.
+ 
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+ 
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+ 
+ #include "gmp.h"
+ #include "gmp-impl.h"
+ 
+ /* Computes a^e (mod B). Uses right-to-left binary algorithm, since
+    typical use will have e small. */
+ static mp_limb_t
+ powlimb (mp_limb_t a, mp_limb_t e)
+ {
+   mp_limb_t r = 1;
+   mp_limb_t s = a;
+ 
+   for (r = 1, s = a; e > 0; e >>= 1, s *= s)
+     if (e & 1)
+       r *= s;
+ 
+   return r;
+ }
+ 
+ /* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd.
+ 
+    Iterates
+ 
+      r' <-- r - r * (a^{k-1} r^k - 1) / n
+ 
+    If
+ 
+      a^{k-1} r^k = 1 (mod 2^m),
+ 
+    then
+ 
+      a^{k-1} r'^k = 1 (mod 2^{2m}),
+  */
+ static void
+ mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
+ {
+   mp_size_t sizes[GMP_LIMB_BITS * 2];  
+   mp_ptr akm1, tp, rnp, ep, scratch;
+   mp_limb_t a0, r0, km1, kinv;
+   mp_size_t rn;
+   unsigned i;
+ 
+   TMP_DECL;
+ 
+   ASSERT (n > 0);
+   ASSERT (ap[0] & 1);
+   ASSERT (k & 1);
+   ASSERT (k >= 3);
+ 
+   TMP_MARK;
+   
+   akm1 = TMP_ALLOC_LIMBS (4*n);
+   tp = akm1 + n;
+ 
+   km1 = k-1;
+   /* FIXME: Could arrange the iteration so we don't need to compute
+      this up front, computing a^{k-1} * r^k as (a r)^{k-1} * r. Note
+      that we can use wraparound also for a*r, since the low half is
+      unchanged from the previous iteration. Or possibly mulmid. Also,
+      a r = a^{1/k}, so we get that value too, for free? */
+   mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */
+ 
+   a0 = ap[0];
+   binvert_limb (kinv, k);
+ 
+   /* 4 bits: a^{1/k - 1} (mod 16):
+ 
+         a % 8
+         1 3 5 7   
+    k%4 +-------
+      1 |1 1 1 1
+      3 |1 9 9 1
+   */
+   r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8);
+   r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f)); /* 8 bits */
+   r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */
+   r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); /* 32 bits */
+ #if GMP_NUMB_BITS > 32
+   {
+     unsigned prec = 32;
+     do
+       {
+ 	r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));
+ 	prec *= 2;
+       }
+     while (prec < GMP_NUMB_BITS);
+   }
+ #endif
+ 
+   rp[0] = r0;
+   if (n == 1)
+     {
+       TMP_FREE;
+       return;
+     }
+ 
+   /* FIXME: Special case for two limb iteration. */
+   rnp = TMP_ALLOC_LIMBS (2*n);  
+   ep = rnp + n;
+ 
+   /* FIXME: Possible to this on the fly with some bit fiddling. */
+   for (i = 0; n > 1; n = (n + 1)/2)
+     sizes[i++] = n;
+ 
+   rn = 1;
+ 
+   while (i-- > 0)
+     {
+       /* Compute x^k. What's the best way to handle the doubled
+ 	 precision? */
+       MPN_ZERO (rp + rn, sizes[i] - rn);
+       mpn_powlo (rnp, rp, &k, 1, sizes[i], tp); 
+ 
+       /* Multiply by a^{k-1}. Can use wraparound; low part is
+ 	 000...01. */
+ 
+       mpn_mullo_n (ep, rnp, akm1, sizes[i]);
+       ASSERT (ep[0] == 1);
+       ASSERT (rn == 1 || mpn_zero_p (ep + 1, rn - 1));
+ 
+       ASSERT (sizes[i] <= 2*rn);
+       mpn_pi1_bdiv_q_1 (ep, ep + rn, sizes[i] - rn, k, kinv, 0);
+ 
+       /* Multiply by x, plain mullo. */
+       mpn_mullo_n (rp + rn, ep, rp, sizes[i] - rn);
+ 
+       /* FIXME: Avoid negation, e.g., by using a bdiv_q_1 variant
+ 	 returning -q. */
+       mpn_neg (rp + rn, rp + rn, sizes[i] - rn);
+ 
+       rn = sizes[i];
+     }
+   TMP_FREE;
+ }
+ 
+ /* Computes a^{1/k} (mod B^n). Both a and k must be odd. */
+ void
+ mpn_broot (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
+ {
+   mp_ptr tp;
+   TMP_DECL;
+ 
+   ASSERT (n > 0);
+   ASSERT (ap[0] & 1);
+   ASSERT (k & 1);
+ 
+   if (k == 1)
+     {
+       MPN_COPY (rp, ap, n);
+       return;
+     }
+ 
+   TMP_MARK;
+   tp = TMP_ALLOC_LIMBS (n);
+ 
+   mpn_broot_invm1 (tp, ap, n, k);
+   mpn_mullo_n (rp, tp, ap, n);
+ 
+   TMP_FREE;  
+ }
diff -Nrc2 gmp.0b25e4cfd719/tests/mpn/Makefile.am gmp/tests/mpn/Makefile.am
*** gmp.0b25e4cfd719/tests/mpn/Makefile.am	2012-10-27 21:37:50.000000000 +0200
--- gmp/tests/mpn/Makefile.am	2012-10-27 21:37:50.000000000 +0200
***************
*** 29,33 ****
    t-toom2-sqr t-toom3-sqr t-toom4-sqr t-toom6-sqr t-toom8-sqr		\
    t-mul t-mullo t-mulmod_bnm1 t-sqrmod_bnm1 t-mulmid			\
!   t-hgcd t-hgcd_appr t-matrix22 t-invert t-div t-bdiv
  
  EXTRA_DIST = toom-shared.h toom-sqr-shared.h
--- 29,33 ----
    t-toom2-sqr t-toom3-sqr t-toom4-sqr t-toom6-sqr t-toom8-sqr		\
    t-mul t-mullo t-mulmod_bnm1 t-sqrmod_bnm1 t-mulmid			\
!   t-hgcd t-hgcd_appr t-matrix22 t-invert t-div t-bdiv t-broot
  
  EXTRA_DIST = toom-shared.h toom-sqr-shared.h
diff -Nrc2 gmp.0b25e4cfd719/tests/mpn/t-broot.c gmp/tests/mpn/t-broot.c
*** gmp.0b25e4cfd719/tests/mpn/t-broot.c	1970-01-01 01:00:00.000000000 +0100
--- gmp/tests/mpn/t-broot.c	2012-10-27 21:37:50.000000000 +0200
***************
*** 0 ****
--- 1,100 ----
+ /* Copyright 2012 Free Software Foundation, Inc.
+ 
+ This program is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free Software
+ Foundation; either version 3 of the License, or (at your option) any later
+ version.
+ 
+ This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along with
+ this program.  If not, see http://www.gnu.org/licenses/.  */
+ 
+ 
+ #include <stdlib.h>		/* for strtol */
+ #include <stdio.h>		/* for printf */
+ 
+ #include "gmp.h"
+ #include "gmp-impl.h"
+ #include "longlong.h"
+ #include "tests/tests.h"
+ 
+ #define MAX_LIMBS 150
+ #define COUNT 500
+ 
+ int
+ main (int argc, char **argv)
+ {
+   gmp_randstate_ptr rands;
+ 
+   mp_ptr ap, rp, pp, scratch;
+   int count = COUNT;
+   unsigned i;
+   TMP_DECL;
+   
+   if (argc > 1)
+     {
+       char *end;
+       count = strtol (argv[1], &end, 0);
+       if (*end || count <= 0)
+ 	{
+ 	  fprintf (stderr, "Invalid test count: %s.\n", argv[1]);
+ 	  return 1;
+ 	}
+     }
+   
+   tests_start ();
+   rands = RANDS;
+ 
+   ap = TMP_ALLOC_LIMBS (MAX_LIMBS);
+   rp = TMP_ALLOC_LIMBS (MAX_LIMBS);
+   pp = TMP_ALLOC_LIMBS (MAX_LIMBS);
+   scratch = TMP_ALLOC_LIMBS (3*MAX_LIMBS); /* For mpn_powlo */
+ 
+   for (i = 0; i < count; i++)
+     {
+       mp_size_t n;
+       mp_limb_t k;
+       int c;
+ 
+       n = 1 + gmp_urandomm_ui (rands, MAX_LIMBS);
+ 
+       if (i & 1)
+ 	mpn_random2 (ap, n);
+       else
+ 	mpn_random (ap, n);
+ 
+       ap[0] |= 1;
+ 
+       if (i < 100)
+ 	k = 3 + 2*i;
+       else
+ 	{
+ 	  mpn_random (&k, 1);
+ 	  if (k < 3)
+ 	    k = 3;
+ 	  else
+ 	    k |= 1;
+ 	}
+       mpn_broot (rp, ap, n, k);
+       mpn_powlo (pp, rp, &k, 1, n, scratch);
+ 
+       MPN_CMP (c, ap, pp, n);
+       if (c != 0)
+ 	{
+ 	  gmp_fprintf (stderr,
+ 		       "mpn_broot returned bad result: %u limbs\n",
+ 		       (unsigned) n);
+ 	  gmp_fprintf (stderr, "k   = %Mx\n", k);
+ 	  gmp_fprintf (stderr, "a   = %Nx\n", ap, n);
+ 	  gmp_fprintf (stderr, "r   = %Nx\n", rp, n);
+ 	  gmp_fprintf (stderr, "r^n = %Nx\n", pp, n);
+ 	  abort ();
+ 	}
+     }
+   TMP_FREE;
+   tests_end ();
+   return 0;
+ }

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
http://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to