ni...@lysator.liu.se (Niels Möller) writes:

> I've now implemented inverse too.
 
And now I've tried a different variant, using redc for the cofactor
updates. Cofactors are now canonically reduced (which seems unexpectedly
cheap). In the case that m is not normalized, so that 2m fits in n
limbs, one could use the more relaxed range 0 <= u, v < 2m and get rid
of a conditional adjustment when doing redc in the loop (sketched after
redc_1 in the code below).

Using redc is a rather good fit: in each iteration, we shift f, g by
*almost* a full limb (62 bits on a 64-bit machine), except for the last
iteration, which does fewer bits, while the cofactors are multiplied by
2^{-64} (mod m) in each iteration. So there's a discrepancy of a few
bits per iteration, which can be handled with very few extra redc calls
outside of the loop (it's O(n^2) work, but with a pretty small
constant).
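
To make the discrepancy concrete, here is the bookkeeping for the
smallest 64-bit case (n = 1), read off from the code below:

  count = (49*64 + 57)/17 = 187 reduction steps
        = 3 batches of 62 steps + 1 final batch of 1 step
  cofactors: multiplied by 2^{-64} per batch, 2^{-256} in total
  f, g:      shifted down by 187 bits in total
  shift = 4*64 - 187 = 69

So u, v pick up a net extra factor of 2^{-69} relative to f, g, and
premultiplying a by 2^{-69} (one full-limb redc plus a 5-bit step in
redc_bits) compensates exactly, leaving a * u = f = ±1 at the end.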

Speed is rather close to the previous version, but the only
precomputation needed is binvert_limb for the redc. And the calls to
mpn_sec_mul and mpn_sec_div_r are gone.

I also added benchmarking of the existing mpn_sec_invert, for
comparison. This is how it looks now:

invert size 1, ref 0.327 (us), old 2.771 (us), djb 1.087 (us)
invert size 2, ref 0.757 (us), old 5.366 (us), djb 1.571 (us)
invert size 3, ref 1.082 (us), old 8.110 (us), djb 2.336 (us)
[...]
invert size 17, ref 5.106 (us), old 163.962 (us), djb 18.082 (us)
invert size 18, ref 5.189 (us), old 170.565 (us), djb 18.608 (us)
invert size 19, ref 5.715 (us), old 185.143 (us), djb 20.794 (us)

One possible optimization would be to keep the cofactors in signed
(two's complement) representation for the first few iterations, while
they are still small and don't need any reduction. And the cofactor
update for the very first iteration could be much more efficient.
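
Concretely, (u; v) starts out as (0; 1), so the first M (u; v) is just
the second column of M, one signed limb for each cofactor. A sketch
(hypothetical helper, not part of the code below; it reuses redc_1 and
cnd_neg from there, and would replace the first matrix_vector_mul_mod,
called with s = M.a[0][1] for u and s = M.a[1][1] for v):

/* Sketch: r = s B^{-1} (mod m) for a single signed limb s (a matrix
   entry, |s| <= 2^STEPS), result canonically reduced. */
static void
redc_signed_limb (mp_limb_t *rp, mp_limb_t s, const mp_limb_t *mp,
                  mp_size_t n, mp_limb_t m_binv, mp_limb_t *tp)
{
  mp_limb_t s_sign = s >> (GMP_LIMB_BITS - 1);
  mp_limb_t cy;
  /* |s| <= 2^STEPS, well below B m. */
  mpn_zero (tp, n + 1);
  tp[0] = (s ^ -s_sign) + s_sign;
  redc_1 (rp, tp, mp, n, m_binv);   /* rp = |s| B^{-1} mod m */
  /* Negate mod m when s was negative. */
  cy = cnd_neg (s_sign, rp, rp, n, tp);
  cy -= mpn_cnd_add_n (cy, rp, rp, mp, n);
}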

Regards,
/Niels

/* Side channel silent gcd (work in progress)

Copyright 2022 Niels Möller

This is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <gmp.h>

#if GMP_NUMB_BITS < GMP_LIMB_BITS
#error Nails not supported.
#endif

#define BITCNT_BITS (sizeof(mp_bitcnt_t) * CHAR_BIT)

/* Reduction steps per batch; two bits of headroom keep the signed
   matrix entries within a limb. */
#define STEPS (GMP_LIMB_BITS - 2)

#define MAX(a,b) ((a) >= (b) ? (a) : (b))

struct matrix {
  /* Matrix elements interpreted as signed two's complement. Absolute
     value of elements is at most 2^STEPS. */
  mp_limb_t a[2][2];
};

/* Conditionally set (a, b) <-- (b, -a). Works since a - (a - b) = b
   and b - (a + b) = -a, in two's complement. */
#define CND_NEG_SWAP_LIMB(cnd, a, b) do {\
    mp_limb_t __cnd_sum = (a) + (b);	 \
    mp_limb_t __cnd_diff = (a) - (b);	 \
    (a) -= __cnd_diff & -(cnd);		 \
    (b) -= __cnd_sum & -(cnd);		 \
  } while (0)

/* Perform k elementary reduction steps on (delta, f, g). Only the
   least significant k bits of f, g matter. Note that mp_bitcnt_t is
   an unsigned type, but is used as two's complement. */
static mp_bitcnt_t
steps(struct matrix *M, unsigned k, mp_bitcnt_t delta, mp_limb_t f, mp_limb_t g)
{
  mp_limb_t a00, a01, a10, a11;
  assert (f & 1);

  /* Identity matrix. */
  a00 = a11 = 1;
  a01 = a10 = 0;

  /* Preserve the invariant (f; g) = 2^{-i} M (f_orig; g_orig) after i
     steps. */
  for (; k-- > 0; delta++)
    {
      mp_limb_t odd = g & 1;
      mp_limb_t swap = odd & ~(delta >> (BITCNT_BITS-1));

      /* Swap; (f'; g') = (g; -f) = (0,1;-1,0) (g;f) */
      CND_NEG_SWAP_LIMB(swap, f, g);
      CND_NEG_SWAP_LIMB(swap, a00, a10);
      CND_NEG_SWAP_LIMB(swap, a01, a11);

      /* Conditional negation. */
      delta = (delta ^ - (mp_bitcnt_t) swap) + swap;

      /* Cancel low bit and shift. */
      g += f & -odd;
      a10 += a00 & -odd;
      a11 += a01 & -odd;

      g >>= 1;
      a00 <<= 1;
      a01 <<= 1;
    }
  M->a[0][0] = a00; M->a[0][1] = a01;
  M->a[1][0] = a10; M->a[1][1] = a11;
  return delta;
}
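
/* Written out in exact arithmetic, one pass of the loop body above is
   a divstep variant (trace the conditional swap and negation):

     (delta, f, g) -> (1 - delta, g, (g - f)/2)   if delta >= 0 and g odd
     (delta, f, g) -> (1 + delta, f, (g + f)/2)   if delta < 0 and g odd
     (delta, f, g) -> (1 + delta, f, g/2)         if g even

   with M accumulating the same row operations, scaled so that its
   entries stay integral. */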

/* Set R = (u * F + v * G), treating all numbers as two's complement.
   No overlap allowed. */
static mp_limb_t
add_add_mul (mp_ptr rp, const mp_limb_t *fp, const mp_limb_t *gp, mp_size_t n,
	     mp_limb_t u, mp_limb_t v)
{
  mp_limb_t f_sign = fp[n-1] >> (GMP_LIMB_BITS - 1);
  mp_limb_t g_sign = gp[n-1] >> (GMP_LIMB_BITS - 1);
  mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1);
  mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1);
  mp_limb_t hi = mpn_mul_1 (rp, fp, n, u) - ((-f_sign) & u);
  hi -= ((-u_sign) & fp[n-1]) + mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n-1);

  hi += mpn_addmul_1 (rp, gp, n, v) - ((-g_sign) & v);
  hi -= ((-v_sign) & gp[n-1]) + mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n-1);

  return hi;
}

/* Update (f'; g') = M (f; g) / 2^{shift}, where all numbers are two's
   complement. The shift low bits of M (f; g) are zero by construction
   (each reduction step cancels the low bit of g), so the asserts below
   hold. */
static void
matrix_vector_mul (const struct matrix *M, unsigned shift,
		   mp_size_t n, mp_limb_t *fp, mp_limb_t *gp, mp_limb_t *tp)
{
  mp_limb_t f_hi = add_add_mul (tp, fp, gp, n, M->a[0][0], M->a[0][1]);
  mp_limb_t g_hi = add_add_mul (tp + n, fp, gp, n, M->a[1][0], M->a[1][1]);
  mp_limb_t lo = mpn_rshift (fp, tp, n, shift);
  assert (lo == 0);
  fp[n-1] += (f_hi << (GMP_LIMB_BITS - shift));
  lo = mpn_rshift (gp, tp + n, n, shift);
  assert (lo == 0);
  gp[n-1] += (g_hi << (GMP_LIMB_BITS - shift));
}

static void __attribute__((noreturn))
die (const char *msg)
{
  fprintf(stderr, "%s\n", msg);
  abort();
}

static mp_limb_t *
xalloc_limbs(mp_size_t n)
{
  mp_limb_t *p = malloc(n * sizeof(*p));
  if (!p)
    die("Out of memory");

  mpn_zero (p, n);
  return p;
}

static mp_limb_t
cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n,
	 mp_limb_t* tp)
{
  mp_limb_t cy = mpn_lshift (tp, ap, n, 1);
  return (cnd & cy) + mpn_cnd_sub_n (cnd, rp, ap, tp, n);
}

/* Bound on the number of reduction steps, from the Bernstein-Yang
   paper "Fast constant-time gcd computation and modular inversion":
   for bit size d >= 46, floor((49 d + 57)/17) steps suffice. */
static mp_bitcnt_t
iterations (mp_size_t n)
{
  mp_bitcnt_t bits = n * GMP_LIMB_BITS;
  assert (bits >= 46);
  return (49 * bits + 57) / 17;
}

static void
djb_gcd (mpz_t g, const mpz_t a, const mpz_t b)
{
  mp_size_t n = MAX(mpz_size(a), mpz_size(b));
  mp_bitcnt_t count;
  mp_limb_t *fp, *gp, *tp;
  mp_bitcnt_t delta;
  struct matrix M;

  assert (mpz_odd_p (a));
  count = iterations (n);

  /* Extra limb needed for sign bit. */
  fp = xalloc_limbs (n+1);
  gp = xalloc_limbs (n+1);
  tp = xalloc_limbs (2*(n+1));

  mpn_copyi (fp, mpz_limbs_read (a), mpz_size (a));
  mpn_copyi (gp, mpz_limbs_read (b), mpz_size (b));

  for (delta = 1; count > STEPS; count -= STEPS)
    {
      delta = steps (&M, STEPS, delta, fp[0], gp[0]);
      matrix_vector_mul (&M, STEPS, n+1, fp, gp, tp);
    }
  delta = steps (&M, count, delta, fp[0], gp[0]);
  matrix_vector_mul (&M, count, n+1, fp, gp, tp);

  assert (mpn_zero_p (gp, n+1));
  cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), fp, fp, n+1, tp);
  assert (fp[n] == 0);
  mpn_copyi (mpz_limbs_write (g, n), fp, n);
  mpz_limbs_finish (g, n);

  free (fp);
  free (gp);
  free (tp);
}

static mp_limb_t
binvert_limb(mp_limb_t n)
{
  static const unsigned char inv_table[8] = { 1, 11, 13, 7, 9, 3, 5, 15 };
  mp_limb_t x;
  unsigned bits;

  assert (n & 1);
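  /* Newton/Hensel iteration: if x*n == 1 (mod 2^k), then
     x*(2 - x*n) == 1 (mod 2^(2k)); the table seeds 4 correct bits. */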
  for (bits = 4, x = inv_table[(n/2) & 7]; bits < GMP_LIMB_BITS; bits *= 2)
    x = 2 * x - x*x*n;
  return x;
}

/* Set R = (u * F + v * G) (mod M), treating u, v as two's complement,
   but F, G, R unsigned. No overlap allowed. n limb inputs, n+1 limb
   output.

   Inputs may be anywhere in the range 0 <= F, G < min (2 M, B^n);
   the output is always in the range 0 <= R < B M.
*/
static void
add_add_mul_mod (mp_ptr rp, const mp_limb_t *fp, const mp_limb_t *gp,
		 const mp_limb_t *mp, mp_size_t n,
		 mp_limb_t u, mp_limb_t v)
{
  mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1);
  mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1);
  mp_limb_t r_sign;
  rp[n] = mpn_mul_1 (rp, fp, n, u);
  mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n);

  rp[n] += mpn_addmul_1 (rp, gp, n, v);
  mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n);

  /* Row sums of the matrix have absolute value <= B/4. With inputs F,
     G < 2 M, at this point we have

       |R| < B M/2.

     So if R < 0, then R + B M lies in (B M/2, B M): adding B M makes
     the result positive. */
  r_sign = rp[n] >> (GMP_LIMB_BITS - 1);
  r_sign -= mpn_cnd_add_n (r_sign, rp + 1, rp + 1, mp, n);
  assert (r_sign == 0);
}

/* Set R = T B^{-1} (mod M). Input in range 0 <= T < B M, output in
   range 0 <= R < M. */
static void
redc_1 (mp_limb_t *rp, mp_limb_t *tp,
	const mp_limb_t *mp, mp_size_t n, mp_limb_t m_binv)
{
  mp_limb_t cy, hi;
  /* If we knew that 2M < B^n, we could allow result values in the
     range 0 <= R < 2M, and use mpn_addmul_1 without any adjustment
     step; see the sketch below. */
  hi = mpn_submul_1 (tp, mp, n, m_binv * tp[0]);
  assert (tp[0] == 0);
  cy = tp[n] < hi;
  tp[n] -= hi;

  cy -= mpn_cnd_add_n (cy, rp, tp + 1, mp, n);
  assert (cy == 0);
}
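
/* A minimal sketch of the relaxed variant described in the comment
   above (hypothetical helper, not used below): assuming 2M < B^n,
   results in the range 0 <= R < 2M are fine and the conditional
   adjustment disappears. */
static void
redc_1_relaxed (mp_limb_t *rp, mp_limb_t *tp,
		const mp_limb_t *mp, mp_size_t n, mp_limb_t m_binv)
{
  /* q = -T M^{-1} (mod B), so that T + q M is divisible by B. */
  mp_limb_t hi = mpn_addmul_1 (tp, mp, n, -(m_binv * tp[0]));
  assert (tp[0] == 0);
  /* T + q M < 2 B M <= B^{n+1}, so the high limb cannot overflow. */
  tp[n] += hi;
  /* R = (T + q M) / B < 2 M fits in n limbs when 2M < B^n. */
  mpn_copyi (rp, tp + 1, n);
}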

/* Set R = T 2^{-k} (mod M). Input in range 0 <= T < M, output in
   range 0 <= R < M. */
static void
redc_bits (mp_limb_t *rp, mp_limb_t *tp, const mp_limb_t *mp,
	   mp_size_t n, mp_limb_t m_binv, mp_bitcnt_t k)
{
  mp_limb_t hi, cy;
  assert (k > 0);

  for (; k >= GMP_NUMB_BITS; k -= GMP_NUMB_BITS)
    {
      hi = mpn_addmul_1 (tp, mp, n, -m_binv * tp[0]);
      assert (tp[0] == 0);
      mpn_copyi (tp, tp + 1, n - 1);
      tp[n-1] = hi;
    }
  /* mpn_rshift requires 0 < shift < GMP_NUMB_BITS, hence >= above, so
     that k < GMP_NUMB_BITS here. */
  if (k > 0)
    {
      mp_limb_t mask = ((mp_limb_t) 2 << (k-1)) - 1;
      mp_limb_t q = (-m_binv * tp[0]) & mask;
      hi = mpn_addmul_1 (tp, mp, n, q);
      cy = mpn_rshift (rp, tp, n, k);
      assert (cy == 0);
      rp[n-1] |= hi << (GMP_NUMB_BITS - k);
    }
  else
    /* k was a multiple of the limb size; plain copy (no-op if rp == tp). */
    mpn_copyi (rp, tp, n);
}

/* Update (u'; v') = M (u; v) B^{-1} (mod m), where u, v, m are
   unsigned n limbs, M has signed elements, and B is the bignum
   base.

   Inputs and outputs are canonically reduced, 0 <= U, V < M, but
   inputs up to 2 M should also work.
*/
static void
matrix_vector_mul_mod (const struct matrix *M, const mp_limb_t *mp,
		       mp_limb_t m_binv,
		       mp_size_t n, mp_limb_t *up, mp_limb_t *vp, mp_limb_t *tp)
{
  add_add_mul_mod (tp, up, vp, mp, n, M->a[0][0], M->a[0][1]);
  add_add_mul_mod (tp + n + 1, up, vp, mp, n, M->a[1][0], M->a[1][1]);

  /* Reduce to n limbs, by multiplying with B^-1 (mod m) */
  redc_1 (up, tp, mp, n, m_binv);
  redc_1 (vp, tp + n + 1, mp, n, m_binv);
}

static int
one_p (const mp_limb_t *p, mp_size_t n)
{
  mp_limb_t diff = p[0] ^ 1;
  mp_size_t i;
  for (i = 1; i < n; i++)
    diff |= p[i];

  return diff == 0;
}

/* Inverse of a modulo m, m odd. */
static int
djb_invert(mpz_t r, const mpz_t a, const mpz_t m, mp_limb_t m_binv)
{
  mp_size_t n = mpz_size(m);
  const mp_limb_t *mp = mpz_limbs_read (m);
  mp_limb_t *fp, *gp, *up, *vp, *tp;
  mp_limb_t cy;
  mp_bitcnt_t count;
  mp_bitcnt_t delta;
  mp_bitcnt_t shift;
  struct matrix M;
  int success;

  assert (mpz_size (a) <= n);
  assert (mpz_odd_p (m));

  count = iterations (n);

  /* Extra limb needed for sign bit. */
  fp = xalloc_limbs (n+1);
  gp = xalloc_limbs (n+1);
  /* Cofactors are represented as n limbs. And we can represent them
     canonically reduced, more or less for free. */
  up = xalloc_limbs (n);
  vp = xalloc_limbs (n);
  tp = xalloc_limbs (2*(n+1));

  mpn_copyi (fp, mp, n);
  mpn_copyi (gp, mpz_limbs_read (a), mpz_size (a));

  shift = GMP_NUMB_BITS*((count + STEPS - 1) / STEPS) - count;

  /* Premultiply a by 2^{-shift} (mod m), compensating for the
     mismatch between the full-limb cofactor reductions and the actual
     number of reduction steps (see the invariant below). */
  redc_bits (gp, gp, mp, n, m_binv, shift);

  /* Maintain invariant

       a * u = 2^{-count} B^{ceil(count/STEPS)} f (mod m)
       a * v = 2^{-count} B^{ceil(count/STEPS)} g (mod m)
  */
  vp[0] = 1;

  for (delta = 1; count > STEPS; count -= STEPS)
    {
      delta = steps (&M, STEPS, delta, fp[0], gp[0]);
      matrix_vector_mul (&M, STEPS, n+1, fp, gp, tp);
      matrix_vector_mul_mod (&M, mp, m_binv, n, up, vp, tp);
    }
  delta = steps (&M, count, delta, fp[0], gp[0]);
  matrix_vector_mul (&M, count, n+1, fp, gp, tp);
  /* Only compute u; we don't need v. */
  add_add_mul_mod (tp, up, vp, mp, n, M.a[0][0], M.a[0][1]);
  redc_1 (up, tp, mp, n, m_binv);

  assert (mpn_zero_p (gp, n+1));

  /* Now f = ±1 (if the inverse exists), and a * u = f (mod m) */
  cy = cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), up, up, n, tp);
  /* Make u non-negative */
  cy -= mpn_cnd_add_n (cy, up, up, mp, n);

  cnd_neg (fp[n] >> (GMP_LIMB_BITS - 1), fp, fp, n + 1, tp);

  success = one_p (fp, n+1);

  /* Now a * u = 1 (mod m), thanks to the premultiply. */
  mpn_copyi (mpz_limbs_write (r, n), up, n);
  mpz_limbs_finish (r, n);

  free (fp);
  free (gp);
  free (up);
  free (vp);
  free (tp);

  return success;
}

static void
test_gcd (gmp_randstate_t rands)
{
  mp_size_t n;
  mpz_t a, b, g, ref;

  mpz_inits (a, b, g, ref, NULL);

  for (n = 1; n < 20; n++)
    {
      unsigned count;
      clock_t clocks_ref = 0;
      clock_t clocks_djb = 0;
      for (count = 0; count < 1000; count++)
	{
	  clock_t time_start, time_ref, time_djb;
	  if (count & 1)
	    {
	      mpz_rrandomb (a, rands, GMP_LIMB_BITS * n);
	      mpz_rrandomb (b, rands, GMP_LIMB_BITS * n);
	    }
	  else
	    {
	      mpz_urandomb (a, rands, GMP_LIMB_BITS * n);
	      mpz_urandomb (b, rands, GMP_LIMB_BITS * n);
	    }
	  mpz_setbit (a, 0);

	  time_start = clock();
	  mpz_gcd (ref, a, b);
	  time_ref = clock();
	  djb_gcd (g, a, b);
	  time_djb = clock();

	  if (mpz_cmp (g, ref) != 0)
	    {
	      gmp_fprintf (stderr,
			   "size = %zd\na   = 0x%Zx\nb   = 0x%Zx\nref = 0x%Zx\ng   = 0x%Zx\n",
			   (size_t) n, a, b, ref, g);
	      die("Gcd failed");
	    }
	  clocks_ref += (time_ref - time_start);
	  clocks_djb += (time_djb - time_ref);
	}
      fprintf (stderr, "gcd size %zd, ref %.3f (us), djb %.3f (us)\n", (size_t) n,
	       clocks_ref * (1e6 / CLOCKS_PER_SEC) / count,
	       clocks_djb * (1e6 / CLOCKS_PER_SEC) / count);

    }
  mpz_clears (a, b, g, ref, NULL);
}

static int
old_invert(mpz_t r, const mpz_t a, const mpz_t m)
{
  mp_size_t n = mpz_size(m);

  mp_limb_t *ap = xalloc_limbs (n);
  mp_limb_t *mp = xalloc_limbs (n);
  mp_limb_t *scratch = xalloc_limbs (mpn_sec_invert_itch (n));
  int success;
  assert (mpz_size (a) <= n);
  mpn_copyi (ap, mpz_limbs_read (a), mpz_size (a));
  mpn_copyi (mp, mpz_limbs_read (m), mpz_size (m));

  success = mpn_sec_invert (mpz_limbs_write (r, n), ap, mp, n, 2*n*GMP_NUMB_BITS, scratch);
  mpz_limbs_finish (r, n);
  free (ap);
  free (mp);
  free (scratch);
  return success;
}

static void
test_invert(gmp_randstate_t rands)
{
  mp_size_t n;
  mpz_t a, m, x, ref, o;

  mpz_inits (a, m, x, ref, o, NULL);
  for (n = 1; n < 20; n++)
    {
      unsigned count, success_count;
      clock_t clocks_ref = 0;
      clock_t clocks_old = 0;
      clock_t clocks_djb = 0;
      for (count = 0, success_count = 0; count < 1000; count++)
	{
	  mp_limb_t binv;
	  clock_t time_start, time_ref, time_old, time_djb;
	  int success, ref_success, old_success;

	  if (count & 1)
	    {
	      mpz_rrandomb (a, rands, GMP_LIMB_BITS * n);
	      mpz_rrandomb (m, rands, GMP_LIMB_BITS * n);
	    }
	  else
	    {
	      mpz_urandomb (a, rands, GMP_LIMB_BITS * n);
	      mpz_urandomb (m, rands, GMP_LIMB_BITS * n);
	    }
	  mpz_setbit (m, 0);

	  binv = binvert_limb (mpz_getlimbn (m, 0));

	  time_start = clock();
	  ref_success = mpz_invert (ref, a, m);
	  time_ref = clock();
	  old_success = old_invert (o, a, m);
	  time_old = clock();
	  success = djb_invert (x, a, m, binv);
	  time_djb = clock();
	  assert (old_success == ref_success && (!ref_success || mpz_cmp (ref, o) == 0));

	  if (ref_success != success || (success && mpz_cmp (x, ref) != 0))
	    {
	      gmp_fprintf (stderr,
			   "size = %zd ref_success = %d, success = %d\n"
			   "a   = 0x%Zx\nm   = 0x%Zx\nref = 0x%Zx\nx   = 0x%Zx\n",
			   (size_t) n, ref_success, success, a, m, ref, x);
	      die("Invert failed");
	    }
	  if (success)
	    {
	      success_count++;
	      clocks_ref += (time_ref - time_start);
	      clocks_old += (time_old - time_ref);
	      clocks_djb += (time_djb - time_old);
	    }
	}
      fprintf (stderr, "invert size %zd, ref %.3f (us), old %.3f (us), djb %.3f (us)\n", (size_t) n,
	       clocks_ref * (1e6 / CLOCKS_PER_SEC) / count,
	       clocks_old * (1e6 / CLOCKS_PER_SEC) / count,
	       clocks_djb * (1e6 / CLOCKS_PER_SEC) / count);
    }

  mpz_clears (a, m, x, ref, o, NULL);
}

int
main (int argc, char **argv)
{
  gmp_randstate_t rands;
  gmp_randinit_default (rands);
  test_gcd (rands);
  test_invert (rands);
  gmp_randclear (rands);
}
-- 
Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
Internet email is subject to wholesale government surveillance.