I've been plaing a bit with table based hgcd2, but to keep things simple, I started with hgcd1 (reducing full limbs to half limbs, with an associated matrix of half limb elements). Complete code below, the main loop is
for (;;) { mp_limb_t a_hi; mp_limb_t b_hi; mp_limb_t q; int c; if (a < b) { MP_LIMB_T_SWAP (a,b); MP_LIMB_T_SWAP (u00, u01); MP_LIMB_T_SWAP (u10, u11); swapped ^= 1; } count_leading_zeros (c, a); assert (c + TAB_BITS <= GMP_LIMB_BITS); a_hi = a >> (GMP_LIMB_BITS - TAB_BITS - c); b_hi = (b >> (GMP_LIMB_BITS - TAB_BITS - c)); if (UNLIKELY(b_hi == 0)) { q = a / b; } else q = q_tab[b_hi - 1][a_hi - (1U << (TAB_BITS - 1))]; assert (q > 0); assert (a >= q*b); a -= q*b; u01 += q*u00; u11 += q*u10; if (a < REDUCED_LIMIT) { u01 -= u00; u11 -= u10; break; } } I think it's promising, and also a bit simpler than the current div1 code. Some concerns: 1. I think the approximate q means that we will often reduce from a > b to a close to but larger than b, so we need two iterations to get to a < b. Might improve progress to add some kind of adjustment step after a - q*b, checking either if result >= b or < 0. 2. There are diferent options for the UNLIKELY branch. One could count leading zeros of b, shift, and look up a quotient, and apply a -= q * b << k; for suitable q and k. Likely cheaper than a division, but we'll then do multiple iterations if size difference is large (which I think is ok). 3. We get poor accuracy for q when b_hi == 1. One could make that less likely by using a somewhat asymmetric table, say looking up 3 bits of a but 6 bits of b. 4. One could also go in the other direction and do something closer to left-to-right binary, and in each step always use q = 2^k for suitable k, based on count_leading_zeros on both a and b. Regards, /Niels #include <assert.h> #include <stdio.h> #include <stdlib.h> #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" #define REDUCED_BITS (GMP_NUMB_BITS / 2 + 1) #define REDUCED_LIMIT ((mp_limb_t) 1 << (GMP_NUMB_BITS / 2 + 1)) #define MATRIX_LIMIT ((mp_limb_t) 1 << (GMP_NUMB_BITS / 2 - 1)) #define TAB_BITS 4 #define ABS_DIFF(a,b) ((a < b) ? b - a : a - b) static int hgcd1_ref(struct hgcd_matrix1 *m, mp_limb_t a, mp_limb_t b) { mp_limb_t u00, u01, u10, u11; if (a < REDUCED_LIMIT || b < REDUCED_LIMIT || ABS_DIFF(a,b) < REDUCED_LIMIT) return 0; u00 = u11 = 1; u01 = u10 = 0; if (a < b) goto a_less_b; for (;;) { mp_limb_t q; assert(a > b); q = a / b; a -= q*b; u01 += q*u00; u11 += q*u10; if (a < REDUCED_LIMIT) { u01 -= u00; u11 -= u10; break; } a_less_b: assert (a < b); q = b / a; b -= q * a; u00 += q * u01; u10 += q * u11; if (b < REDUCED_LIMIT) { u00 -= u01; u10 -= u11; break; } } m->u[0][0] = u00; m->u[0][1] = u01; m->u[1][0] = u10; m->u[1][1] = u11; return 1; } #if TAB_BITS == 3 /* 1,a1,a0.xxx / b2,b1,b0.xxx > 1,a1,a0 / b2,b1,b0 + 1 * Need to round b up, except when a_hi == b_hi, since we always have a > b. */ unsigned char q_tab[7][4] = { /* a = 4, 5, 6, 7 / b */ { /* 1 */ 2, 2, 3, 3 }, { /* 2 */ 1, 1, 2, 2 }, { /* 3 */ 1, 1, 1, 1 }, { /* 4 */ 1, 1, 1, 1 }, { /* 5 */ 0, 1, 1, 1 }, { /* 6 */ 0, 0, 1, 1 }, { /* 7 */ 0, 0, 0, 1 }, }; #elif TAB_BITS == 4 unsigned char q_tab[15][8] = { /* a = 8, 9,10,11,12,13,14,15 / b */ { /* 1 */ 4, 4, 5, 5, 6, 6, 7, 7 }, { /* 2 */ 2, 3, 3, 3, 4, 4, 4, 5 }, { /* 3 */ 2, 2, 2, 2, 3, 3, 3, 3 }, { /* 4 */ 1, 1, 2, 2, 2, 2, 2, 3 }, { /* 5 */ 1, 1, 1, 1, 2, 2, 2, 2 }, { /* 6 */ 1, 1, 1, 1, 1, 1, 2, 2 }, { /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1 }, { /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1 }, { /* 9 */ 0, 1, 1, 1, 1, 1, 1, 1 }, { /*10 */ 0, 0, 1, 1, 1, 1, 1, 1 }, { /*11 */ 0, 0, 0, 1, 1, 1, 1, 1 }, { /*12 */ 0, 0, 0, 0, 1, 1, 1, 1 }, { /*13 */ 0, 0, 0, 0, 0, 1, 1, 1 }, { /*14 */ 0, 0, 0, 0, 0, 0, 1, 1 }, { /*15 */ 0, 0, 0, 0, 0, 0, 0, 1 }, }; #else #error unsupported table size #endif static int hgcd1_tab(struct hgcd_matrix1 *m, mp_limb_t a, mp_limb_t b) { mp_limb_t u00, u01, u10, u11; int swapped; if (a < REDUCED_LIMIT || b < REDUCED_LIMIT || ABS_DIFF(a,b) < REDUCED_LIMIT) return 0; u00 = u11 = 1; u01 = u10 = 0; swapped = 0; for (;;) { mp_limb_t a_hi; mp_limb_t b_hi; mp_limb_t q; int c; if (a < b) { MP_LIMB_T_SWAP (a,b); MP_LIMB_T_SWAP (u00, u01); MP_LIMB_T_SWAP (u10, u11); swapped ^= 1; } count_leading_zeros (c, a); assert (c + TAB_BITS <= GMP_LIMB_BITS); a_hi = a >> (GMP_LIMB_BITS - TAB_BITS - c); b_hi = (b >> (GMP_LIMB_BITS - TAB_BITS - c)); if (UNLIKELY(b_hi == 0)) { q = a / b; } else q = q_tab[b_hi - 1][a_hi - (1U << (TAB_BITS - 1))]; assert (q > 0); assert (a >= q*b); a -= q*b; u01 += q*u00; u11 += q*u10; if (a < REDUCED_LIMIT) { u01 -= u00; u11 -= u10; break; } } if (swapped) { MP_LIMB_T_SWAP (u00, u01); MP_LIMB_T_SWAP (u10, u11); } m->u[0][0] = u00; m->u[0][1] = u01; m->u[1][0] = u10; m->u[1][1] = u11; return 1; } static int hgcd1_valid (const struct hgcd_matrix1 *m, mp_limb_t a, mp_limb_t b) { mp_limb_t u00, u01, u10, u11; mp_limb_t s0, s1, t0, t1; mp_limb_t alpha, beta; u00 = m->u[0][0]; u01 = m->u[0][1]; u10 = m->u[1][0]; u11 = m->u[1][1]; if (u00 >= MATRIX_LIMIT || u01 >= MATRIX_LIMIT || u10 >= MATRIX_LIMIT || u11 >= MATRIX_LIMIT) return 0; /* M^-1 = (u11, -u01; -u10,u00) */ umul_ppmm (s1, s0, u11, a); umul_ppmm (t1, t0, u01, b); if (t1 > s1 || (t1 == s1 && t0 > s0)) return 0; sub_ddmmss (s1, alpha, s1, s0, t1, t0); if (s1 != 0) return 0; if (alpha < REDUCED_LIMIT) return 0; umul_ppmm (s1, s0, u00, b); umul_ppmm (t1, t0, u10, a); if (t1 > s1 || (t1 == s1 && t0 > s0)) return 0; sub_ddmmss (s1, beta, s1, s0, t1, t0); if (s1 != 0) return 0; if (beta < REDUCED_LIMIT) return 0; return ABS_DIFF(alpha, beta) < REDUCED_LIMIT; } int main (int argc, char **argv) { gmp_randstate_t rands; unsigned i; gmp_randinit_default (rands); gmp_randseed_ui (rands, 17); for (i = 0; i < 10000; i++) { mp_limb_t a, b; struct hgcd_matrix1 m; a = gmp_urandomb_ui (rands, GMP_NUMB_BITS); b = gmp_urandomb_ui (rands, GMP_NUMB_BITS); if (hgcd1_ref (&m, a, b) && !hgcd1_valid (&m, a, b)) { gmp_printf("hgcd1 failed: i = %u, a = %Mx b = %Mx\n" " M = (%Mx, %Mx;%Mx, %Mx)\n", i, a, b, m.u[0][0], m.u[0][1], m.u[1][0], m.u[1][1]); abort(); } if (hgcd1_tab (&m, a, b) && !hgcd1_valid (&m, a, b)) { gmp_printf("hgcd1 failed: i = %u, a = %Mx b = %Mx\n" " M = (%Mx, %Mx;%Mx, %Mx)\n", i, a, b, m.u[0][0], m.u[0][1], m.u[1][0], m.u[1][1]); abort(); } } return 0; } -- Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677. Internet email is subject to wholesale government surveillance. _______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org https://gmplib.org/mailman/listinfo/gmp-devel