The timings look great for GCD. Just some build issues to fix and we're done! Oh, and I need to fix the perfect power bug.
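About the link errors quoted below: the linker sees __gmpn_ngcd_matrix_init, __gmpn_nhgcd_itch, __gmpn_nhgcd and mpn_basic_gcd defined in both gcd.o and gcd_bin.o, presumably because the tune/speed build compiles the generic gcd source a second time, and duplicate external symbols are rejected at link time. A minimal illustration of how that kind of clash arises (file names are made up, not the actual tune/ sources):

    /* a.c (hypothetical) */
    int nhgcd_helper (void) { return 1; }   /* external symbol exported from a.o */

    /* b.c (hypothetical) -- same external symbol defined again */
    int nhgcd_helper (void) { return 1; }

    /* cc a.c b.c  ->  "multiple definition of `nhgcd_helper'".
       Making one copy static, renaming it with a #define before the
       second compilation, or compiling the shared source only once
       removes the clash.  */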
Bill.

2008/12/24 Bill Hart <goodwillh...@googlemail.com>:
> On sage.math:
>
> cd tune
> make tune
>
> ./.libs/libspeed.a(gcd_bin.o): In function `__gmpn_ngcd_matrix_init':
> gcd_bin.c:(.text+0x0): multiple definition of `__gmpn_ngcd_matrix_init'
> gcd.o:gcd.c:(.text+0x0): first defined here
> ./.libs/libspeed.a(gcd_bin.o): In function `__gmpn_nhgcd_itch':
> gcd_bin.c:(.text+0x80): multiple definition of `__gmpn_nhgcd_itch'
> gcd.o:gcd.c:(.text+0x80): first defined here
> ./.libs/libspeed.a(gcd_bin.o): In function `__gmpn_nhgcd':
> gcd_bin.c:(.text+0xc4): multiple definition of `__gmpn_nhgcd'
> gcd.o:gcd.c:(.text+0xc4): first defined here
> ./.libs/libspeed.a(gcd_bin.o): In function `mpn_basic_gcd':
> gcd_bin.c:(.text+0x2ed): multiple definition of `mpn_basic_gcd'
> gcd.o:gcd.c:(.text+0x2ed): first defined here
>
> Bill.
>
>
> 2008/12/23 Jason Martin <jason.worth.mar...@gmail.com>:
>> Attached are some edited versions of
>>
>> mpn/generic/gcd.c
>>
>> and
>>
>> mpn/generic/ngcd.c
>>
>> Drop them in, test them for correctness and speed.  Let me know what
>> breaks.  When everyone is happy, I'll check them in to svn
>>
>> --jason
>>
>> Jason Worth Martin
>> Asst. Professor of Mathematics
>> http://www.math.jmu.edu/~martin
>>
>>
>> /* Schönhage's 1987 algorithm, reorganized into hgcd form */
>>
>> #include <stdio.h>              /* for NULL */
>>
>> #include "gmp.h"
>> #include "gmp-impl.h"
>> #include "longlong.h"
>>
>>
>> /* ******************************************************************
>>  * Here we are including the original GMP version of mpn_gcd
>>  * but we rename it as mpn_basic_gcd.  It needs to be available
>>  * for the ngcd algorithm to call in the base case.
>>  *
>>  * Preconditions [U = (up, usize) and V = (vp, vsize)]:
>>  *
>>  *   1.  V is odd.
>>  *   2.  numbits(U) >= numbits(V).
>>  *
>>  * Both U and V are destroyed by the operation.  The result is left at vp,
>>  * and its size is returned.
>>  *
>>  * Ken Weber (kwe...@mat.ufrgs.br, kwe...@mcs.kent.edu)
>>  *
>>  * Funding for this work has been partially provided by Conselho
>>  * Nacional de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do
>>  * Brazil, Grant 301314194-2, and was done while I was a visiting
>>  * researcher in the Instituto de Matema'tica at Universidade Federal
>>  * do Rio Grande do Sul (UFRGS).
>>  *
>>  * Refer to K. Weber, The accelerated integer GCD algorithm, ACM
>>  * Transactions on Mathematical Software, v. 21 (March), 1995,
>>  * pp. 111-122.
>>  *
>>  * *****************************************************************/
>>
>>
>> /* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
>>    algorithm is used, otherwise the binary algorithm is used.  This may be
>>    adjusted for different architectures.  */
>> #ifndef GCD_ACCEL_THRESHOLD
>> #define GCD_ACCEL_THRESHOLD 5
>> #endif
>>
>> /* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
>>    algorithm reduces using the bmod operation.  Otherwise, the k-ary reduction
>>    is used.  0 <= BMOD_THRESHOLD < GMP_NUMB_BITS.  */
>> enum
>>   {
>>     BMOD_THRESHOLD = GMP_NUMB_BITS/2
>>   };
>>
>>
>> /* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
>>    Both U and V must be odd.
>>  */
>> static inline mp_size_t
>> gcd_2 (mp_ptr vp, mp_srcptr up)
>> {
>>   mp_limb_t u0, u1, v0, v1;
>>   mp_size_t vsize;
>>
>>   u0 = up[0];
>>   u1 = up[1];
>>   v0 = vp[0];
>>   v1 = vp[1];
>>
>>   while (u1 != v1 && u0 != v0)
>>     {
>>       unsigned long int r;
>>       if (u1 > v1)
>>         {
>>           u1 -= v1 + (u0 < v0);
>>           u0 = (u0 - v0) & GMP_NUMB_MASK;
>>           count_trailing_zeros (r, u0);
>>           u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r);
>>           u1 >>= r;
>>         }
>>       else  /* u1 < v1.  */
>>         {
>>           v1 -= u1 + (v0 < u0);
>>           v0 = (v0 - u0) & GMP_NUMB_MASK;
>>           count_trailing_zeros (r, v0);
>>           v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r);
>>           v1 >>= r;
>>         }
>>     }
>>
>>   vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
>>
>>   /* If U == V == GCD, done.  Otherwise, compute GCD (V, |U - V|).  */
>>   if (u1 == v1 && u0 == v0)
>>     return vsize;
>>
>>   v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
>>   vp[0] = mpn_gcd_1 (vp, vsize, v0);
>>
>>   return 1;
>> }
>>
>> /* The function find_a finds 0 < N < 2^GMP_NUMB_BITS such that there exists
>>    0 < |D| < 2^GMP_NUMB_BITS, and N == D * C mod 2^(2*GMP_NUMB_BITS).
>>    In the reference article, D was computed along with N, but it is better to
>>    compute D separately as D <-- N / C mod 2^(GMP_NUMB_BITS + 1), treating
>>    the result as a twos' complement signed integer.
>>
>>    Initialize N1 to C mod 2^(2*GMP_NUMB_BITS).  According to the reference
>>    article, N2 should be initialized to 2^(2*GMP_NUMB_BITS), but we use
>>    2^(2*GMP_NUMB_BITS) - N1 to start the calculations within double
>>    precision.  If N2 > N1 initially, the first iteration of the while loop
>>    will swap them.  In all other situations, N1 >= N2 is maintained.  */
>>
>> #if HAVE_NATIVE_mpn_gcd_finda
>> #define find_a(cp)  mpn_gcd_finda (cp)
>>
>> #else
>> static
>> #if ! defined (__i386__)
>> inline                          /* don't inline this for the x86 */
>> #endif
>> mp_limb_t
>> find_a (mp_srcptr cp)
>> {
>>   unsigned long int leading_zero_bits = 0;
>>
>>   mp_limb_t n1_l = cp[0];       /* N1 == n1_h * 2^GMP_NUMB_BITS + n1_l.  */
>>   mp_limb_t n1_h = cp[1];
>>
>>   mp_limb_t n2_l = (-n1_l & GMP_NUMB_MASK);  /* N2 == n2_h * 2^GMP_NUMB_BITS + n2_l.  */
>>   mp_limb_t n2_h = (~n1_h & GMP_NUMB_MASK);
>>
>>   /* Main loop.  */
>>   while (n2_h != 0)             /* While N2 >= 2^GMP_NUMB_BITS.  */
>>     {
>>       /* N1 <-- N1 % N2.  */
>>       if (((GMP_NUMB_HIGHBIT >> leading_zero_bits) & n2_h) == 0)
>>         {
>>           unsigned long int i;
>>           count_leading_zeros (i, n2_h);
>>           i -= GMP_NAIL_BITS;
>>           i -= leading_zero_bits;
>>           leading_zero_bits += i;
>>           n2_h = ((n2_h << i) & GMP_NUMB_MASK) | (n2_l >> (GMP_NUMB_BITS - i));
>>           n2_l = (n2_l << i) & GMP_NUMB_MASK;
>>           do
>>             {
>>               if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
>>                 {
>>                   n1_h -= n2_h + (n1_l < n2_l);
>>                   n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
>>                 }
>>               n2_l = (n2_l >> 1) | ((n2_h << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK);
>>               n2_h >>= 1;
>>               i -= 1;
>>             }
>>           while (i != 0);
>>         }
>>       if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
>>         {
>>           n1_h -= n2_h + (n1_l < n2_l);
>>           n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
>>         }
>>
>>       MP_LIMB_T_SWAP (n1_h, n2_h);
>>       MP_LIMB_T_SWAP (n1_l, n2_l);
>>     }
>>
>>   return n2_l;
>> }
>> #endif
>>
>>
>> mp_size_t
>> mpn_basic_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
>> {
>>   mp_ptr orig_vp = vp;
>>   mp_size_t orig_vsize = vsize;
>>   int binary_gcd_ctr;           /* Number of times binary gcd will execute.
>>                                    */
>>   mp_size_t scratch;
>>   mp_ptr tp;
>>   TMP_DECL;
>>
>>   ASSERT (usize >= 1);
>>   ASSERT (vsize >= 1);
>>   ASSERT (usize >= vsize);
>>   ASSERT (vp[0] & 1);
>>   ASSERT (up[usize - 1] != 0);
>>   ASSERT (vp[vsize - 1] != 0);
>> #if WANT_ASSERT
>>   if (usize == vsize)
>>     {
>>       int uzeros, vzeros;
>>       count_leading_zeros (uzeros, up[usize - 1]);
>>       count_leading_zeros (vzeros, vp[vsize - 1]);
>>       ASSERT (uzeros <= vzeros);
>>     }
>> #endif
>>   ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
>>   ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, up, usize));
>>   ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, vp, vsize));
>>
>>   TMP_MARK;
>>
>>   /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
>>      Two EXTRA limbs for U and V are required for kary reduction.  */
>>   if (vsize >= GCD_ACCEL_THRESHOLD)
>>     {
>>       unsigned long int vbitsize, d;
>>       mp_ptr orig_up = up;
>>       mp_size_t orig_usize = usize;
>>       mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
>>
>>       MPN_COPY (anchor_up, orig_up, usize);
>>       up = anchor_up;
>>
>>       count_leading_zeros (d, up[usize - 1]);
>>       d -= GMP_NAIL_BITS;
>>       d = usize * GMP_NUMB_BITS - d;
>>       count_leading_zeros (vbitsize, vp[vsize - 1]);
>>       vbitsize -= GMP_NAIL_BITS;
>>       vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
>>       ASSERT (d >= vbitsize);
>>       d = d - vbitsize + 1;
>>
>>       /* Use bmod reduction to quickly discover whether V divides U.  */
>>       up[usize++] = 0;                          /* Insert leading zero.  */
>>       mpn_bdivmod (up, up, usize, vp, vsize, d);
>>
>>       /* Now skip U/V mod 2^d and any low zero limbs.  */
>>       d /= GMP_NUMB_BITS, up += d, usize -= d;
>>       while (usize != 0 && up[0] == 0)
>>         up++, usize--;
>>
>>       if (usize == 0)                           /* GCD == ORIG_V.  */
>>         goto done;
>>
>>       vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
>>       MPN_COPY (vp, orig_vp, vsize);
>>
>>       do                                        /* Main loop.  */
>>         {
>>           /* mpn_com_n can't be used here because anchor_up and up may
>>              partially overlap */
>>           if ((up[usize - 1] & GMP_NUMB_HIGHBIT) != 0)  /* U < 0; take twos' compl.  */
>>             {
>>               mp_size_t i;
>>               anchor_up[0] = -up[0] & GMP_NUMB_MASK;
>>               for (i = 1; i < usize; i++)
>>                 anchor_up[i] = (~up[i] & GMP_NUMB_MASK);
>>               up = anchor_up;
>>             }
>>
>>           MPN_NORMALIZE_NOT_ZERO (up, usize);
>>
>>           if ((up[0] & 1) == 0)                 /* Result even; remove twos.  */
>>             {
>>               unsigned int r;
>>               count_trailing_zeros (r, up[0]);
>>               mpn_rshift (anchor_up, up, usize, r);
>>               usize -= (anchor_up[usize - 1] == 0);
>>             }
>>           else if (anchor_up != up)
>>             MPN_COPY_INCR (anchor_up, up, usize);
>>
>>           MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
>>           up = anchor_up;
>>
>>           if (vsize <= 2)               /* Kary can't handle < 2 limbs and  */
>>             break;                      /* isn't efficient for == 2 limbs.  */
>>
>>           d = vbitsize;
>>           count_leading_zeros (vbitsize, vp[vsize - 1]);
>>           vbitsize -= GMP_NAIL_BITS;
>>           vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
>>           d = d - vbitsize + 1;
>>
>>           if (d > BMOD_THRESHOLD)               /* Bmod reduction.  */
>>             {
>>               up[usize++] = 0;
>>               mpn_bdivmod (up, up, usize, vp, vsize, d);
>>               d /= GMP_NUMB_BITS, up += d, usize -= d;
>>             }
>>           else                                  /* Kary reduction.  */
>>             {
>>               mp_limb_t bp[2], cp[2];
>>
>>               /* C <-- V/U mod 2^(2*GMP_NUMB_BITS).  */
>>               {
>>                 mp_limb_t u_inv, hi, lo;
>>                 modlimb_invert (u_inv, up[0]);
>>                 cp[0] = (vp[0] * u_inv) & GMP_NUMB_MASK;
>>                 umul_ppmm (hi, lo, cp[0], up[0] << GMP_NAIL_BITS);
>>                 lo >>= GMP_NAIL_BITS;
>>                 cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv & GMP_NUMB_MASK;
>>               }
>>
>>               /* U <-- find_a (C) * U.  */
>>               up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
>>               usize++;
>>
>>               /* B <-- A/C == U/V mod 2^(GMP_NUMB_BITS + 1).
>>                  bp[0] <-- U/V mod 2^GMP_NUMB_BITS and
>>                  bp[1] <-- ( (U - bp[0] * V)/2^GMP_NUMB_BITS ) / V mod 2
>>
>>                  Like V/U above, but simplified because only the low bit of
>>                  bp[1] is wanted.  */
>>               {
>>                 mp_limb_t v_inv, hi, lo;
>>                 modlimb_invert (v_inv, vp[0]);
>>                 bp[0] = (up[0] * v_inv) & GMP_NUMB_MASK;
>>                 umul_ppmm (hi, lo, bp[0], vp[0] << GMP_NAIL_BITS);
>>                 lo >>= GMP_NAIL_BITS;
>>                 bp[1] = (up[1] + hi + (bp[0] & vp[1])) & 1;
>>               }
>>
>>               up[usize++] = 0;
>>               if (bp[1] != 0)           /* B < 0: U <-- U + (-B) * V.  */
>>                 {
>>                   mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0] & GMP_NUMB_MASK);
>>                   mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
>>                 }
>>               else                      /* B >= 0: U <-- U - B * V.  */
>>                 {
>>                   mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
>>                   mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
>>                 }
>>
>>               up += 2, usize -= 2;      /* At least two low limbs are zero.  */
>>             }
>>
>>           /* Must remove low zero limbs before complementing.  */
>>           while (usize != 0 && up[0] == 0)
>>             up++, usize--;
>>         }
>>       while (usize != 0);
>>
>>       /* Compute GCD (ORIG_V, GCD (ORIG_U, V)).  Binary will execute twice.  */
>>       up = orig_up, usize = orig_usize;
>>       binary_gcd_ctr = 2;
>>     }
>>   else
>>     binary_gcd_ctr = 1;
>>
>>   scratch = MPN_NGCD_LEHMER_ITCH (vsize);
>>   if (usize + 1 > scratch)
>>     scratch = usize + 1;
>>
>>   tp = TMP_ALLOC_LIMBS (scratch);
>>
>>   /* Finish up with the binary algorithm.  Executes once or twice.  */
>>   for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
>>     {
>> #if 1
>>       if (usize > vsize)
>>         {
>>           /* FIXME: Could use mpn_bdivmod.  */
>>           mp_size_t rsize;
>>
>>           mpn_tdiv_qr (tp + vsize, tp, 0, up, usize, vp, vsize);
>>           rsize = vsize;
>>           MPN_NORMALIZE (tp, rsize);
>>           if (rsize == 0)
>>             continue;
>>
>>           MPN_COPY (up, tp, vsize);
>>         }
>>       vsize = mpn_ngcd_lehmer (vp, up, vp, vsize, tp);
>> #else
>>       if (usize > 2)            /* First make U close to V in size.  */
>>         {
>>           unsigned long int vbitsize, d;
>>           count_leading_zeros (d, up[usize - 1]);
>>           d -= GMP_NAIL_BITS;
>>           d = usize * GMP_NUMB_BITS - d;
>>           count_leading_zeros (vbitsize, vp[vsize - 1]);
>>           vbitsize -= GMP_NAIL_BITS;
>>           vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
>>           d = d - vbitsize - 1;
>>           if (d != -(unsigned long int)1 && d > 2)
>>             {
>>               mpn_bdivmod (up, up, usize, vp, vsize, d);  /* Result > 0.  */
>>               d /= (unsigned long int)GMP_NUMB_BITS, up += d, usize -= d;
>>             }
>>         }
>>
>>       /* Start binary GCD.  */
>>       do
>>         {
>>           mp_size_t zeros;
>>
>>           /* Make sure U is odd.  */
>>           MPN_NORMALIZE (up, usize);
>>           while (up[0] == 0)
>>             up += 1, usize -= 1;
>>           if ((up[0] & 1) == 0)
>>             {
>>               unsigned int r;
>>               count_trailing_zeros (r, up[0]);
>>               mpn_rshift (up, up, usize, r);
>>               usize -= (up[usize - 1] == 0);
>>             }
>>
>>           /* Keep usize >= vsize.  */
>>           if (usize < vsize)
>>             MPN_PTR_SWAP (up, usize, vp, vsize);
>>
>>           if (usize <= 2)                       /* Double precision.  */
>>             {
>>               if (vsize == 1)
>>                 vp[0] = mpn_gcd_1 (up, usize, vp[0]);
>>               else
>>                 vsize = gcd_2 (vp, up);
>>               break;                            /* Binary GCD done.  */
>>             }
>>
>>           /* Count number of low zero limbs of U - V.  */
>>           for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
>>             continue;
>>
>>           /* If U < V, swap U and V; in any case, subtract V from U.  */
>>           if (zeros == vsize)                   /* Subtract done.  */
>>             up += zeros, usize -= zeros;
>>           else if (usize == vsize)
>>             {
>>               mp_size_t size = vsize;
>>               do
>>                 size--;
>>               while (up[size] == vp[size]);
>>               if (up[size] < vp[size])          /* usize == vsize.
>>                                                    */
>>                 MP_PTR_SWAP (up, vp);
>>               up += zeros, usize = size + 1 - zeros;
>>               mpn_sub_n (up, up, vp + zeros, usize);
>>             }
>>           else
>>             {
>>               mp_size_t size = vsize - zeros;
>>               up += zeros, usize -= zeros;
>>               if (mpn_sub_n (up, up, vp + zeros, size))
>>                 {
>>                   while (up[size] == 0)         /* Propagate borrow.  */
>>                     up[size++] = -(mp_limb_t)1;
>>                   up[size] -= 1;
>>                 }
>>             }
>>         }
>>       while (usize);                            /* End binary GCD.  */
>> #endif
>>     }
>>
>> done:
>>   if (vp != gp)
>>     MPN_COPY_INCR (gp, vp, vsize);
>>   TMP_FREE;
>>   return vsize;
>> }
>>
>>
>> /* ******************************************************************
>>  * END of original GMP mpn_gcd
>>  * *****************************************************************/
>>
>>
>> /* For input of size n, matrix elements are of size at most ceil(n/2)
>>    - 1, but we need one limb extra.  */
>>
>> void
>> mpn_ngcd_matrix_init (struct ngcd_matrix *M, mp_size_t n, mp_ptr p)
>> {
>>   mp_size_t s = (n+1)/2;
>>   M->alloc = s;
>>   M->n = 1;
>>   MPN_ZERO (p, 4 * s);
>>   M->p[0][0] = p;
>>   M->p[0][1] = p + s;
>>   M->p[1][0] = p + 2 * s;
>>   M->p[1][1] = p + 3 * s;
>>   M->tp = p + 4*s;
>>
>>   M->p[0][0][0] = M->p[1][1][0] = 1;
>> }
>>
>> #define NHGCD_BASE_ITCH MPN_NGCD_STEP_ITCH
>>
>> /* Reduces a,b until |a-b| fits in n/2 + 1 limbs.  Constructs matrix M
>>    with elements of size at most (n+1)/2 - 1.  Returns new size of a,
>>    b, or zero if no reduction is possible.  */
>> static mp_size_t
>> nhgcd_base (mp_ptr ap, mp_ptr bp, mp_size_t n,
>>             struct ngcd_matrix *M, mp_ptr tp)
>> {
>>   mp_size_t s = n/2 + 1;
>>   mp_size_t nn;
>>
>>   ASSERT (n > s);
>>   ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
>>
>>   nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
>>   if (!nn)
>>     return 0;
>>
>>   for (;;)
>>     {
>>       n = nn;
>>       ASSERT (n > s);
>>       nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
>>       if (!nn)
>>         return n;
>>     }
>> }
>>
>> /* Size analysis for nhgcd:
>>
>>    For the recursive calls, we have n1 <= ceil(n / 2).  Then the
>>    storage need is determined by the storage for the recursive call
>>    computing M1, and ngcd_matrix_adjust and ngcd_matrix_mul calls that use M1
>>    (after this, the storage needed for M1 can be recycled).
>>
>>    Let S(r) denote the required storage.  For M1 we need 5 * ceil(n1/2)
>>    = 5 * ceil(n/4), and for the ngcd_matrix_adjust call, we need n + 2.  In
>>    total, 5 * ceil(n/4) + n + 2 <= 9 ceil(n/4) + 2.
>>
>>    For the recursive call, we need S(n1) = S(ceil(n/2)).
>>
>>    S(n) <= 9*ceil(n/4) + 2 + S(ceil(n/2))
>>         <= 9*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 2k + S(ceil(n/2^k))
>>         <= 9*(2 ceil(n/4) + k) + 2k + S(n/2^k)
>>         <= 18 ceil(n/4) + 11k + S(n/2^k)
>> */
>>
>> mp_size_t
>> mpn_nhgcd_itch (mp_size_t n)
>> {
>>   unsigned k;
>>   mp_size_t nn;
>>
>>   /* Inefficient way to almost compute
>>      log_2(n/NHGCD_BASE_THRESHOLD) */
>>   for (k = 0, nn = n;
>>        ABOVE_THRESHOLD (nn, NHGCD_THRESHOLD);
>>        nn = (nn + 1) / 2)
>>     k++;
>>
>>   if (k == 0)
>>     return NHGCD_BASE_ITCH (n);
>>
>>   return 18 * ((n+3) / 4) + 11 * k
>>     + NHGCD_BASE_ITCH (NHGCD_THRESHOLD);
>> }
>>
>> /* Reduces a,b until |a-b| fits in n/2 + 1 limbs.  Constructs matrix M
>>    with elements of size at most (n+1)/2 - 1.  Returns new size of a,
>>    b, or zero if no reduction is possible.
>>  */
>>
>> mp_size_t
>> mpn_nhgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
>>            struct ngcd_matrix *M, mp_ptr tp)
>> {
>>   mp_size_t s = n/2 + 1;
>>   mp_size_t n2 = (3*n)/4 + 1;
>>
>>   mp_size_t p, nn;
>>   unsigned count;
>>   int success = 0;
>>
>>   ASSERT (n > s);
>>   ASSERT ((ap[n-1] | bp[n-1]) > 0);
>>
>>   ASSERT ((n+1)/2 - 1 < M->alloc);
>>
>>   if (BELOW_THRESHOLD (n, NHGCD_THRESHOLD))
>>     return nhgcd_base (ap, bp, n, M, tp);
>>
>>   p = n/2;
>>   nn = mpn_nhgcd (ap + p, bp + p, n - p, M, tp);
>>   if (nn > 0)
>>     {
>>       /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
>>          = 2 (n - 1) */
>>       n = mpn_ngcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
>>       success = 1;
>>     }
>>   count = 0;
>>   while (n > n2)
>>     {
>>       count++;
>>       /* Needs n + 1 storage */
>>       nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
>>       if (!nn)
>>         return success ? n : 0;
>>       n = nn;
>>       success = 1;
>>     }
>>
>>   if (n > s + 2)
>>     {
>>       struct ngcd_matrix M1;
>>       mp_size_t scratch;
>>
>>       p = 2*s - n + 1;
>>       scratch = MPN_NGCD_MATRIX_INIT_ITCH (n-p);
>>
>>       mpn_ngcd_matrix_init(&M1, n - p, tp);
>>       nn = mpn_nhgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
>>       if (nn > 0)
>>         {
>>           /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
>>              = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
>>           n = mpn_ngcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
>>           /* Needs M->n <= n2 - s - 1 < n/4 */
>>           mpn_ngcd_matrix_mul (M, &M1, tp + scratch);
>>           success = 1;
>>         }
>>     }
>>
>>   /* FIXME: This really is the base case */
>>   for (count = 0;; count++)
>>     {
>>       /* Needs s+3 < n */
>>       nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
>>       if (!nn)
>>         return success ? n : 0;
>>
>>       n = nn;
>>       success = 1;
>>     }
>> }
>>
>> #define EVEN_P(x) (((x) & 1) == 0)
>>
>> mp_size_t
>> mpn_gcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
>> {
>>   mp_size_t init_scratch;
>>   mp_size_t scratch;
>>   mp_ptr tp;
>>   TMP_DECL;
>>
>>   ASSERT (an >= n);
>>
>>   if (BELOW_THRESHOLD (n, NGCD_THRESHOLD))
>>     return mpn_basic_gcd (gp, ap, an, bp, n);
>>
>>   init_scratch = MPN_NGCD_MATRIX_INIT_ITCH ((n+1)/2);
>>   scratch = mpn_nhgcd_itch ((n+1)/2);
>>
>>   /* Space needed for mpn_ngcd_matrix_adjust */
>>   if (scratch < 2*n)
>>     scratch = 2*n;
>>
>>   TMP_MARK;
>>
>>   if (an + 1 > init_scratch + scratch)
>>     tp = TMP_ALLOC_LIMBS (an + 1);
>>   else
>>     tp = TMP_ALLOC_LIMBS (init_scratch + scratch);
>>
>>   if (an > n)
>>     {
>>       mp_ptr rp = tp;
>>       mp_ptr qp = rp + n;
>>
>>       mpn_tdiv_qr (qp, rp, 0, ap, an, bp, n);
>>       MPN_COPY (ap, rp, n);
>>       an = n;
>>       MPN_NORMALIZE (ap, an);
>>       if (an == 0)
>>         {
>>           MPN_COPY (gp, bp, n);
>>           TMP_FREE;
>>           return n;
>>         }
>>     }
>>
>>   while (ABOVE_THRESHOLD (n, NGCD_THRESHOLD))
>>     {
>>       struct ngcd_matrix M;
>>       mp_size_t p = n/2;
>>       mp_size_t nn;
>>
>>       mpn_ngcd_matrix_init (&M, n - p, tp);
>>       nn = mpn_nhgcd (ap + p, bp + p, n - p, &M, tp + init_scratch);
>>       if (nn > 0)
>>         /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
>>            = 2(n-1) */
>>         n = mpn_ngcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + init_scratch);
>>       else
>>         {
>>           mp_size_t gn;
>>           n = mpn_ngcd_subdiv_step (gp, &gn, ap, bp, n, tp);
>>           if (n == 0)
>>             {
>>               TMP_FREE;
>>               return gn;
>>             }
>>         }
>>     }
>>
>>   ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
>> #if 0
>>   /* FIXME: We may want to use lehmer on some systems.
>>    */
>>   n = mpn_ngcd_lehmer (gp, ap, bp, n, tp);
>>
>>   TMP_FREE;
>>   return n;
>> #endif
>>
>>   if (ap[n-1] < bp[n-1])
>>     MP_PTR_SWAP (ap, bp);
>>
>>   an = n;
>>   MPN_NORMALIZE (bp, n);
>>
>>   if (n == 0)
>>     {
>>       MPN_COPY (gp, ap, an);
>>       TMP_FREE;
>>       return an;
>>     }
>>
>>   if (EVEN_P (bp[0]))
>>     {
>>       /* Then a must be odd (since the calling convention implies that
>>          there's no common factor of 2) */
>>       ASSERT (!EVEN_P (ap[0]));
>>
>>       while (bp[0] == 0)
>>         {
>>           bp++;
>>           n--;
>>         }
>>
>>       if (EVEN_P(bp[0]))
>>         {
>>           int count;
>>           count_trailing_zeros (count, bp[0]);
>>           ASSERT_NOCARRY (mpn_rshift (bp, bp, n, count));
>>           n -= (bp[n-1] == 0);
>>         }
>>     }
>>
>>   TMP_FREE;
>>   return mpn_basic_gcd (gp, ap, an, bp, n);
>> }
>>
>>
>> /* Schönhage's 1987 algorithm, reorganized into hgcd form */
>>
>> #include <stdio.h>              /* for NULL */
>>
>> #include "gmp.h"
>> #include "gmp-impl.h"
>> #include "longlong.h"
>>
>>
>> /* For input of size n, matrix elements are of size at most ceil(n/2)
>>    - 1, but we need one limb extra.  */
>>
>> void
>> mpn_ngcd_matrix_init (struct ngcd_matrix *M, mp_size_t n, mp_ptr p);
>>
>> #define NHGCD_BASE_ITCH MPN_NGCD_STEP_ITCH
>>
>> /* Reduces a,b until |a-b| fits in n/2 + 1 limbs.  Constructs matrix M
>>    with elements of size at most (n+1)/2 - 1.  Returns new size of a,
>>    b, or zero if no reduction is possible.  */
>> static mp_size_t
>> nhgcd_base (mp_ptr ap, mp_ptr bp, mp_size_t n,
>>             struct ngcd_matrix *M, mp_ptr tp);
>>
>> /* Size analysis for nhgcd:
>>
>>    For the recursive calls, we have n1 <= ceil(n / 2).  Then the
>>    storage need is determined by the storage for the recursive call
>>    computing M1, and ngcd_matrix_adjust and ngcd_matrix_mul calls that use M1
>>    (after this, the storage needed for M1 can be recycled).
>>
>>    Let S(r) denote the required storage.  For M1 we need 5 * ceil(n1/2)
>>    = 5 * ceil(n/4), and for the ngcd_matrix_adjust call, we need n + 2.  In
>>    total, 5 * ceil(n/4) + n + 2 <= 9 ceil(n/4) + 2.
>>
>>    For the recursive call, we need S(n1) = S(ceil(n/2)).
>>
>>    S(n) <= 9*ceil(n/4) + 2 + S(ceil(n/2))
>>         <= 9*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 2k + S(ceil(n/2^k))
>>         <= 9*(2 ceil(n/4) + k) + 2k + S(n/2^k)
>>         <= 18 ceil(n/4) + 11k + S(n/2^k)
>> */
>>
>> mp_size_t
>> mpn_nhgcd_itch (mp_size_t n);
>>
>>
>> /* Reduces a,b until |a-b| fits in n/2 + 1 limbs.  Constructs matrix M
>>    with elements of size at most (n+1)/2 - 1.  Returns new size of a,
>>    b, or zero if no reduction is possible.
>>  */
>>
>> mp_size_t
>> mpn_nhgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
>>            struct ngcd_matrix *M, mp_ptr tp);
>>
>>
>> #define EVEN_P(x) (((x) & 1) == 0)
>>
>> mp_size_t
>> mpn_ngcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
>> {
>>   mp_size_t init_scratch;
>>   mp_size_t scratch;
>>   mp_ptr tp;
>>   TMP_DECL;
>>
>>   ASSERT (an >= n);
>>
>>   if (BELOW_THRESHOLD (n, NGCD_THRESHOLD))
>>     return mpn_basic_gcd (gp, ap, an, bp, n);
>>
>>   init_scratch = MPN_NGCD_MATRIX_INIT_ITCH ((n+1)/2);
>>   scratch = mpn_nhgcd_itch ((n+1)/2);
>>
>>   /* Space needed for mpn_ngcd_matrix_adjust */
>>   if (scratch < 2*n)
>>     scratch = 2*n;
>>
>>   TMP_MARK;
>>
>>   if (an + 1 > init_scratch + scratch)
>>     tp = TMP_ALLOC_LIMBS (an + 1);
>>   else
>>     tp = TMP_ALLOC_LIMBS (init_scratch + scratch);
>>
>>   if (an > n)
>>     {
>>       mp_ptr rp = tp;
>>       mp_ptr qp = rp + n;
>>
>>       mpn_tdiv_qr (qp, rp, 0, ap, an, bp, n);
>>       MPN_COPY (ap, rp, n);
>>       an = n;
>>       MPN_NORMALIZE (ap, an);
>>       if (an == 0)
>>         {
>>           MPN_COPY (gp, bp, n);
>>           TMP_FREE;
>>           return n;
>>         }
>>     }
>>
>>   while (ABOVE_THRESHOLD (n, NGCD_THRESHOLD))
>>     {
>>       struct ngcd_matrix M;
>>       mp_size_t p = n/2;
>>       mp_size_t nn;
>>
>>       mpn_ngcd_matrix_init (&M, n - p, tp);
>>       nn = mpn_nhgcd (ap + p, bp + p, n - p, &M, tp + init_scratch);
>>       if (nn > 0)
>>         /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
>>            = 2(n-1) */
>>         n = mpn_ngcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + init_scratch);
>>       else
>>         {
>>           mp_size_t gn;
>>           n = mpn_ngcd_subdiv_step (gp, &gn, ap, bp, n, tp);
>>           if (n == 0)
>>             {
>>               TMP_FREE;
>>               return gn;
>>             }
>>         }
>>     }
>>
>>   ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
>> #if 0
>>   /* FIXME: We may want to use lehmer on some systems. */
>>   n = mpn_ngcd_lehmer (gp, ap, bp, n, tp);
>>
>>   TMP_FREE;
>>   return n;
>> #endif
>>
>>   if (ap[n-1] < bp[n-1])
>>     MP_PTR_SWAP (ap, bp);
>>
>>   an = n;
>>   MPN_NORMALIZE (bp, n);
>>
>>   if (n == 0)
>>     {
>>       MPN_COPY (gp, ap, an);
>>       TMP_FREE;
>>       return an;
>>     }
>>
>>   if (EVEN_P (bp[0]))
>>     {
>>       /* Then a must be odd (since the calling convention implies that
>>          there's no common factor of 2) */
>>       ASSERT (!EVEN_P (ap[0]));
>>
>>       while (bp[0] == 0)
>>         {
>>           bp++;
>>           n--;
>>         }
>>
>>       if (EVEN_P(bp[0]))
>>         {
>>           int count;
>>           count_trailing_zeros (count, bp[0]);
>>           ASSERT_NOCARRY (mpn_rshift (bp, bp, n, count));
>>           n -= (bp[n-1] == 0);
>>         }
>>     }
>>
>>   TMP_FREE;
>>   return mpn_basic_gcd (gp, ap, an, bp, n);
>> }
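Jason asks above to test the files for correctness and speed. For correctness, the simplest thing I know is a throwaway loop at the mpz level, since mpz_gcd should end up in the new mpn_gcd for multi-limb operands. This is only a sketch of that kind of harness (not part of the patch, and the cofactor check reuses mpz_gcd itself, so it is a consistency check rather than an independent reference):

    /* gcd-check.c -- quick sanity check of gcd via the public mpz interface */
    #include <stdio.h>
    #include <gmp.h>

    int
    main (void)
    {
      mpz_t a, b, g, t;
      gmp_randstate_t rs;
      int i;

      mpz_init (a); mpz_init (b); mpz_init (g); mpz_init (t);
      gmp_randinit_default (rs);

      for (i = 0; i < 1000; i++)
        {
          mpz_urandomb (a, rs, 20000);
          mpz_urandomb (b, rs, 20000);
          mpz_setbit (a, 0);            /* keep a odd and nonzero */

          mpz_gcd (g, a, b);

          /* The result must divide both operands...  */
          mpz_mod (t, a, g);
          if (mpz_sgn (t) != 0) { printf ("g does not divide a at i=%d\n", i); return 1; }
          mpz_mod (t, b, g);
          if (mpz_sgn (t) != 0) { printf ("g does not divide b at i=%d\n", i); return 1; }

          /* ...and the cofactors must be coprime.  */
          mpz_divexact (a, a, g);
          mpz_divexact (b, b, g);
          mpz_gcd (t, a, b);
          if (mpz_cmp_ui (t, 1) != 0) { printf ("cofactors not coprime at i=%d\n", i); return 1; }
        }

      printf ("all ok\n");
      gmp_randclear (rs);
      mpz_clear (a); mpz_clear (b); mpz_clear (g); mpz_clear (t);
      return 0;
    }

For speed, the existing tune/speed machinery is the right tool once the duplicate-symbol build issue above is sorted out.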