ni...@lysator.liu.se (Niels Möller) writes: > I've now implemented inverse too. And now I've tried a different variant, using redc for the cofactor updates. Cofactors are now canonically reduced (which seems unexpectedly cheap). In the case that m is not normalized, so that 2m fits in n limbs, one could use a more relaxed range, 0 <= u, v < 2m, and get rid of a conditional adjustment when doing redc in the loop.
Using redc is a rather good fit, because in each iteration, we shift f, g *almost* a limb (62 bits on a 64-bit machine), except for the last iteration where we do less bits. While cofactors are multiplied by 2^{-64} mod m in each iteration. So there's a discrepancy of a few bits per iteration, which one can handle using very few extra redc calls outside of the loop (it's O(n^2) work, but with a pretty small constant). Speed is rather close to the previous version, but the only needed precomputation is binvert_limb for the redc. And the calls to mpn_sec_mul and mpn_sec_div_r are gone. I also added benchmarking of the existing mpn_sec_invert, for comparison. This is how it looks now: invert size 1, ref 0.327 (us), old 2.771 (us), djb 1.087 (us) invert size 2, ref 0.757 (us), old 5.366 (us), djb 1.571 (us) invert size 3, ref 1.082 (us), old 8.110 (us), djb 2.336 (us) [...] invert size 17, ref 5.106 (us), old 163.962 (us), djb 18.082 (us) invert size 18, ref 5.189 (us), old 170.565 (us), djb 18.608 (us) invert size 19, ref 5.715 (us), old 185.143 (us), djb 20.794 (us) One possible optimization would be to keep cofactors in signed (two's complement) representation for the first few iterations, when they are still small and don't need any reduction. And cofactor update for the very first iteration could be much more efficient. Regards, /Niels
/* Side channel silent gcd (work in progress) Copyright 2022 Niels Möller This is is free software; you can redistribute it and/or modify it under the terms of either: * the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. or * the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. or both in parallel, as here. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received copies of the GNU General Public License and the GNU Lesser General Public License along with the GNU MP Library. If not, see https://www.gnu.org/licenses/. */ #include <assert.h> #include <limits.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <gmp.h> #if GMP_NUMB_BITS < GMP_LIMB_BITS #error Nails not supported. #endif #define BITCNT_BITS (sizeof(mp_bitcnt_t) * CHAR_BIT) #define STEPS (GMP_LIMB_BITS - 2) #define MAX(a,b) ((a) >= (b) ? (a) : (b)) struct matrix { /* Matrix elements interpreted as signed two's complement. Absolute value of elements is at most 2^STEPS. */ mp_limb_t a[2][2]; }; /* Conditionally set (a, b) <-- (b, -a) */ #define CND_NEG_SWAP_LIMB(cnd, a, b) do {\ mp_limb_t __cnd_sum = (a) + (b); \ mp_limb_t __cnd_diff = (a) - (b); \ (a) -= __cnd_diff & -(cnd); \ (b) -= __cnd_sum & -(cnd); \ } while (0) /* Perform STEPS elementary reduction steps on (delta, f, g). In the least significant STEPS bits of f, g matter. Note that mp_bitcnt_t is an unsigned type, but is used as two's complement. */ static mp_bitcnt_t steps(struct matrix *M, unsigned k, mp_bitcnt_t delta, mp_limb_t f, mp_limb_t g) { mp_limb_t a00, a01, a10, a11; assert (f & 1); /* Identity matrix. */ a00 = a11 = 1; a01 = a10 = 0; /* Preserve invariant (f ; g) = 2^{-i} M (f_orig, g_orig) */ for (; k-- > 0; delta++) { mp_limb_t odd = g & 1; mp_limb_t swap = odd & ~(delta >> (BITCNT_BITS-1)); /* Swap; (f'; g') = (g; -f) = (0,1;-1,0) (g;f) */ CND_NEG_SWAP_LIMB(swap, f, g); CND_NEG_SWAP_LIMB(swap, a00, a10); CND_NEG_SWAP_LIMB(swap, a01, a11); /* Conditional negation. */ delta = (delta ^ - (mp_bitcnt_t) swap) + swap; /* Cancel low bit and shift. */ g += f & -odd; a10 += a00 & -odd; a11 += a01 & -odd; g >>= 1; a00 <<= 1; a01 <<= 1; } M->a[0][0] = a00; M->a[0][1] = a01; M->a[1][0] = a10; M->a[1][1] = a11; return delta; } /* Set R = (u * F + v * G), treating all numbers as two's complement. No overlap allowed. */ static mp_limb_t add_add_mul (mp_ptr rp, const mp_limb_t *fp, const mp_limb_t *gp, mp_size_t n, mp_limb_t u, mp_limb_t v) { mp_limb_t f_sign = fp[n-1] >> (GMP_LIMB_BITS - 1); mp_limb_t g_sign = gp[n-1] >> (GMP_LIMB_BITS - 1); mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1); mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1); mp_limb_t hi = mpn_mul_1 (rp, fp, n, u) - ((-f_sign) & u); hi -= ((-u_sign) & fp[n-1]) + mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n-1); hi += mpn_addmul_1 (rp, gp, n, v) - ((-g_sign) & v); hi -= ((-v_sign) & gp[n-1]) + mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n-1); return hi; } /* Update (f'; g') = M (f; g) / 2^{shift}, where all numbers are two's complement. */ static void matrix_vector_mul (const struct matrix *M, unsigned shift, mp_size_t n, mp_limb_t *fp, mp_limb_t *gp, mp_limb_t *tp) { mp_limb_t f_hi = add_add_mul (tp, fp, gp, n, M->a[0][0], M->a[0][1]); mp_limb_t g_hi = add_add_mul (tp + n, fp, gp, n, M->a[1][0], M->a[1][1]); mp_limb_t lo = mpn_rshift (fp, tp, n, shift); assert (lo == 0); fp[n-1] += (f_hi << (GMP_LIMB_BITS - shift)); lo = mpn_rshift (gp, tp + n, n, shift); assert (lo == 0); gp[n-1] += (g_hi << (GMP_LIMB_BITS - shift)); } static void __attribute__((noreturn)) die (const char *msg) { fprintf(stderr, "%s\n", msg); abort(); } static mp_limb_t * xalloc_limbs(mp_size_t n) { mp_limb_t *p = malloc(n * sizeof(*p)); if (!p) die("Out of memory"); mpn_zero (p, n); return p; } static mp_limb_t cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, mp_limb_t* tp) { mp_limb_t cy = mpn_lshift (tp, ap, n, 1); return (cnd & cy) + mpn_cnd_sub_n (cnd, rp, ap, tp, n); } static mp_bitcnt_t iterations (mp_size_t n) { mp_bitcnt_t bits = n * GMP_LIMB_BITS; assert (bits >= 46); return (49 * bits + 57) / 17; } static void djb_gcd (mpz_t g, const mpz_t a, const mpz_t b) { mp_size_t n = MAX(mpz_size(a), mpz_size(b)); mp_bitcnt_t count; mp_limb_t *fp, *gp, *tp; mp_bitcnt_t delta; struct matrix M; assert (mpz_odd_p (a)); count = iterations (n); /* Extra limb needed for sign bit. */ fp = xalloc_limbs (n+1); gp = xalloc_limbs (n+1); tp = xalloc_limbs (2*(n+1)); mpn_copyi (fp, mpz_limbs_read (a), mpz_size (a)); mpn_copyi (gp, mpz_limbs_read (b), mpz_size (b)); for (delta = 1;count > STEPS;count -= STEPS) { delta = steps (&M, STEPS, delta, fp[0], gp[0]); matrix_vector_mul (&M, STEPS, n+1, fp, gp, tp); } delta = steps (&M, count, delta, fp[0], gp[0]); matrix_vector_mul (&M, count, n+1, fp, gp, tp); assert (mpn_zero_p (gp, n+1)); cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), fp, fp, n+1, tp); assert (fp[n] == 0); mpn_copyi (mpz_limbs_write (g, n), fp, n); mpz_limbs_finish (g, n); free (fp); free (gp); free (tp); } static mp_limb_t binvert_limb(mp_limb_t n) { static const unsigned char inv_table[8] = { 1, 11, 13, 7, 9, 3, 5, 15 }; mp_limb_t x; unsigned bits; assert (n & 1); for (bits = 4, x = inv_table[(n/2) & 7]; bits < GMP_LIMB_BITS; bits *= 2) x = 2 * x - x*x*n; return x; } /* Set R = (u * F + v * G) (mod M), treating u, v as two's complement, but F, G, R unsigned. No overlap allowed. n limb inputs, n+1 limb output. Input can be allowed in the range 0 <= F, G < min (2 M, B^n), output always in the range 0 <= R < B M. */ static void add_add_mul_mod (mp_ptr rp, const mp_limb_t *fp, const mp_limb_t *gp, const mp_limb_t *mp, mp_size_t n, mp_limb_t u, mp_limb_t v) { mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1); mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1); mp_limb_t r_sign; rp[n] = mpn_mul_1 (rp, fp, n, u); mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n); rp[n] += mpn_addmul_1 (rp, gp, n, v); mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n); /* Row sums of the matrix have absolute value <= B/4. With inputs F, G < 2 M, at this point we have |R| < B M/2. If R < 0, then R + B M is positive negative, adding B M makes the result positive. */ r_sign = rp[n] >> (GMP_LIMB_BITS - 1); r_sign -= mpn_cnd_add_n (r_sign, rp + 1, rp + 1, mp, n); assert (r_sign == 0); } /* Input in range 0 <= T < B M, output in range 0 <= R < M. */ static void redc_1 (mp_limb_t *rp, mp_limb_t *tp, const mp_limb_t *mp, mp_size_t n, mp_limb_t m_binv) { mp_limb_t cy, hi; /* If we knew that 2M < B^n, we could allow result value in the range 0 <= R < 2M, and use mpn_add_mul_1 without any adjustment step. */ hi = mpn_submul_1 (tp, mp, n, m_binv * tp[0]); assert (tp[0] == 0); cy = tp[n] < hi; tp[n] -= hi; cy -= mpn_cnd_add_n (cy, rp, tp + 1, mp, n); assert (cy == 0); } /* Input in range 0 <= T < M, output in range 0 <= R < M. */ static void redc_bits (mp_limb_t *rp, mp_limb_t *tp, const mp_limb_t *mp, mp_size_t n, mp_limb_t m_binv, mp_bitcnt_t k) { mp_limb_t hi, cy; assert (k > 0); for (; k > GMP_NUMB_BITS; k -= GMP_NUMB_BITS) { hi = mpn_addmul_1 (tp, mp, n, -m_binv * tp[0]); assert (tp[0] == 0); mpn_copyi (tp, tp + 1, n - 1); tp[n-1] = hi; } if (k > 0) { mp_limb_t mask = ((mp_limb_t) 2 << (k-1)) - 1; mp_limb_t q = (-m_binv * tp[0]) & mask; hi = mpn_addmul_1 (tp, mp, n, q); cy = mpn_rshift (rp, tp, n, k); assert (cy == 0); rp[n-1] |= hi << (GMP_NUMB_BITS - k); } } /* Update (u'; v') = M (u; v) B^{-1} (mod m), where u, v, m are unsigned n limbs, M has signed elements, and B is the bignum base. Inputs and outputs are canonically reduced, 0 <= U, V < M, but inputs of size up to 2 M should work. */ static void matrix_vector_mul_mod (const struct matrix *M, const mp_limb_t *mp, mp_limb_t m_binv, mp_size_t n, mp_limb_t *up, mp_limb_t *vp, mp_limb_t *tp) { add_add_mul_mod (tp, up, vp, mp, n, M->a[0][0], M->a[0][1]); add_add_mul_mod (tp + n + 1, up, vp, mp, n, M->a[1][0], M->a[1][1]); /* Reduce to n limbs, by multiplying with B^-1 (mod m) */ redc_1 (up, tp, mp, n, m_binv); redc_1 (vp, tp + n + 1, mp, n, m_binv); } static int one_p (const mp_limb_t *p, mp_size_t n) { mp_limb_t diff = p[0] ^ 1; mp_size_t i; for (i = 1; i < n; i++) diff |= p[i]; return diff == 0; } /* Inverse of a modulo m, m odd. */ static int djb_invert(mpz_t r, const mpz_t a, const mpz_t m, mp_limb_t m_binv) { mp_size_t n = mpz_size(m); const mp_limb_t *mp = mpz_limbs_read (m); mp_limb_t *fp, *gp, *up, *vp, *tp; mp_limb_t cy; mp_bitcnt_t count; mp_bitcnt_t delta; mp_bitcnt_t shift; struct matrix M; int success; assert (mpz_size (a) <= n); assert (mpz_odd_p (m)); count = iterations (n); /* Extra limb needed for sign bit. */ fp = xalloc_limbs (n+1); gp = xalloc_limbs (n+1); /* Cofactors are represented as n limbs. And we can represent them canonically reduced, more or less for free. */ up = xalloc_limbs (n); vp = xalloc_limbs (n); tp = xalloc_limbs (2*(n+1)); assert (mpz_size (a) <= n); mpn_copyi (fp, mp, n); mpn_copyi (gp, mpz_limbs_read (a), mpz_size (a)); shift = GMP_NUMB_BITS*((count + STEPS - 1) / STEPS) - count; /* Premultiply a by 2^{- shift} mod m */ redc_bits (gp, gp, mp, n, m_binv, shift); /* Maintain invariant a * u = 2^{-count} B^{ceil(count/STEPS)} f (mod m) a * v = 2^{-count} B^{ceil(count/STEPS)} g (mod m) */ vp[0] = 1; for (delta = 1;count > STEPS;count -= STEPS) { delta = steps (&M, STEPS, delta, fp[0], gp[0]); matrix_vector_mul (&M, STEPS, n+1, fp, gp, tp); matrix_vector_mul_mod (&M, mp, m_binv, n, up, vp, tp); } delta = steps (&M, count, delta, fp[0], gp[0]); matrix_vector_mul (&M, count, n+1, fp, gp, tp); /* Only compute u, we don't need v. */ add_add_mul_mod (tp, up, vp, mp, n, M.a[0][0], M.a[0][1]); redc_1 (up, tp, mp, n, m_binv); assert (mpn_zero_p (gp, n+1)); /* Now f = ±1 (if the inverse exists), and a * u = f (mod m) */ cy = cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), up, up, n, tp); /* Make u non-negative */ cy -= mpn_cnd_add_n (cy, up, up, mp, n); cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), fp, fp, n + 1, tp); success = one_p (fp, n+1); /* Now a * u' = 1, thanks to the premultiply. */ mpn_copyi (mpz_limbs_write (r, n), up, n); mpz_limbs_finish (r, n); free (fp); free (gp); free (up); free (vp); free (tp); return success; } static void test_gcd (gmp_randstate_t rands) { mp_size_t n; mpz_t a, b, g, ref; mpz_inits (a, b, g, ref, NULL); for (n = 1; n < 20; n++) { unsigned count; clock_t clocks_ref = 0; clock_t clocks_djb = 0; for (count = 0; count < 1000; count++) { clock_t time_start, time_ref, time_djb; if (count & 1) { mpz_rrandomb (a, rands, GMP_LIMB_BITS * n); mpz_rrandomb (b, rands, GMP_LIMB_BITS * n); } else { mpz_urandomb (a, rands, GMP_LIMB_BITS * n); mpz_urandomb (b, rands, GMP_LIMB_BITS * n); } mpz_setbit (a, 0); time_start = clock(); mpz_gcd (ref, a, b); time_ref = clock(); djb_gcd (g, a, b); time_djb = clock(); if (mpz_cmp (g, ref) != 0) { gmp_fprintf (stderr, "size = %zd\na = 0x%Zx\nb = 0x%Zx\nref = 0x%Zx\ng = 0x%Zx\n", (size_t) n, a, b, ref, g); die("Gcd failed"); } clocks_ref += (time_ref - time_start); clocks_djb += (time_djb - time_ref); } fprintf (stderr, "gcd size %zd, ref %.3f (us), djb %.3f (us)\n", (size_t) n, clocks_ref * (1e6 / CLOCKS_PER_SEC) / count, clocks_djb * (1e6 / CLOCKS_PER_SEC) / count); } mpz_clears (a, b, g, ref, NULL); } static int old_invert(mpz_t r, const mpz_t a, const mpz_t m) { mp_size_t n = mpz_size(m); mp_limb_t *ap = xalloc_limbs (n); mp_limb_t *mp = xalloc_limbs (n); mp_limb_t *scratch = xalloc_limbs (mpn_sec_invert_itch (n)); int success; assert (mpz_size (a) <= n); mpn_copyi (ap, mpz_limbs_read (a), mpz_size (a)); mpn_copyi (mp, mpz_limbs_read (m), mpz_size (m)); success = mpn_sec_invert (mpz_limbs_write (r, n), ap, mp, n, 2*n*GMP_NUMB_BITS, scratch); mpz_limbs_finish (r, n); free (ap); free (mp); free (scratch); return success; } static void test_invert(gmp_randstate_t rands) { mp_size_t n; mpz_t a, m, x, ref, o; mpz_inits (a, m, x, ref, o, NULL); for (n = 1; n < 20; n++) { unsigned count, success_count; clock_t clocks_ref = 0; clock_t clocks_old = 0; clock_t clocks_djb = 0; for (count = 0, success_count = 0; count < 1000; count++) { mp_limb_t binv; clock_t time_start, time_ref, time_old, time_djb; int success, ref_success, old_success; if (count & 1) { mpz_rrandomb (a, rands, GMP_LIMB_BITS * n); mpz_rrandomb (m, rands, GMP_LIMB_BITS * n); } else { mpz_urandomb (a, rands, GMP_LIMB_BITS * n); mpz_urandomb (m, rands, GMP_LIMB_BITS * n); } mpz_setbit (m, 0); binv = binvert_limb (mpz_getlimbn (m, 0)); time_start = clock(); ref_success = mpz_invert (ref, a, m); time_ref = clock(); old_success = old_invert (o, a, m); time_old = clock(); success = djb_invert (x, a, m, binv); time_djb = clock(); assert (old_success == ref_success && (!ref_success || mpz_cmp (ref, o) == 0)); if (ref_success != success || (success && mpz_cmp (x, ref) != 0)) { gmp_fprintf (stderr, "size = %zd ref_success = %d, success = %d\n" "a = 0x%Zx\nm = 0x%Zx\nref = 0x%Zx\nx = 0x%Zx\n", (size_t) n, ref_success, success, a, m, ref, x); die("Invert failed"); } if (success) { success_count++; clocks_ref += (time_ref - time_start); clocks_old += (time_old - time_ref); clocks_djb += (time_djb - time_old); } } fprintf (stderr, "invert size %zd, ref %.3f (us), old %.3f (us), djb %.3f (us)\n", (size_t) n, clocks_ref * (1e6 / CLOCKS_PER_SEC) / count, clocks_old * (1e6 / CLOCKS_PER_SEC) / count, clocks_djb * (1e6 / CLOCKS_PER_SEC) / count); } mpz_clears (a, m, x, ref, o, NULL); } int main (int argc, char **argv) { gmp_randstate_t rands; gmp_randinit_default (rands); test_gcd (rands); test_invert (rands); gmp_randclear (rands); }
-- Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677. Internet email is subject to wholesale government surveillance.
_______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org https://gmplib.org/mailman/listinfo/gmp-devel