Ciao,

Il Dom, 1 Settembre 2019 6:35 pm, Torbjörn Granlund ha scritto:
> "Marco Bodrato" <bodr...@mail.dm.unipi.it> writes:

> My code:   1.85 bits/iteration
> Your code: 2.60 bits/iteration

> It is worth remembering the table from my experiments with single-limb
> numbers:

> binary tab64                   :    479879082   0.401    2.495       40
> binary tab256                  :    456126047   0.381    2.625       40
> binary tab1024                 :    434483855   0.363    2.755       36

> The now observed 2.60 bits/iteration is very close to the 2.625
> bits/iteration from the table.

My code uses a 256 table... but extends it to a 1024 one...

What does the "binary tab###" single-limb implementation exactly do?

> (I looked briefly at your code, and don't understand it.  I haven't had
> time for a proper look yet.)

Yes, storing some unused bits in the table was not a good idea for clarity.

I send it again, I changed some variable names, added some comments and
some #ifdef to experiment the different flavours.

I get:
 -DNO_TG_TRICK -DTAB256 -DNLIMBS=10 -DSTAT=1
bits/iteration avg: 2.45

 -DTAB256 -DNLIMBS=10 -DSTAT=1
bits/iteration avg: 2.57

 -DNO_TG_TRICK -DNLIMBS=10 -DSTAT=1
bits/iteration avg: 2.53

 -DNLIMBS=10 -DSTAT=1
bits/iteration avg: 2.60


I added also an "#if 0"-ed bad idea :-)

Ĝis,
m

-- 
http://bodrato.it/papers/
#include "gmp-impl.h"
#include "longlong.h"

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define CMP_SWAP(ap,bp,n)						\
  do {									\
    mp_limb_t tmp;							\
    __asm__ ("cmp\t%5, %6"	"\n\t"					\
	     "mov\t%1, %0"	"\n\t"					\
	     "cmovae\t%2, %1"	"\n\t"					\
	     "cmovae\t%0, %2"						\
	     : "=&r,&r" (tmp), "=r,r" (ap), "=r,r" (bp)			\
	     : "1,1" (ap), "2,2" (bp), "r,rm" (ap[n-1]), "rm,r" (bp[n-1])); \
  } while (0)
#endif

#ifndef CMP_SWAP
#define CMP_SWAP(ap,bp,n)						\
  do {									\
    mp_limb_t m = -(mp_limb_t) (ap[n - 1] < bp[n - 1]);			\
    mp_limb_t *_tp = ap;						\
    ap = (mp_ptr) (((size_t) ap & ~m) | ((size_t) bp & m));		\
    bp = (mp_ptr) (((size_t) bp & ~m) | ((size_t) _tp & m));		\
  } while (0)
#endif

#ifndef CMP_SWAP
#define CMP_SWAP(ap,bp,n)						\
  do {									\
    if (ap[n - 1] < bp[n - 1])						\
      {									\
	mp_limb_t *_tp = ap; ap = bp; bp = _tp;				\
      }									\
  } while (0)
#endif

#ifdef STAT
#define IFSTAT(x) x
#else
#define IFSTAT(x)
#endif

IFSTAT(extern size_t n_iter;)

#if NO_TG_TRICK
#define NEG_MASK (mask ^ 1)
#else
#define NEG_MASK GMP_NUMB_MAX
#endif

void
mpn_gcd_n (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp, mp_size_t n)
{
  static unsigned char tabp[256] = {
  63, 21, 51,  9,  7, 29, 59, 17, 15, 37,  3, 25, 23, 45, 11, 33,
  61, 63, 25, 27, 21, 23, 49, 51, 45, 47,  9, 11,  5,  7, 33, 35,
  59, 41, 63, 45, 35, 17, 39, 21, 11, 57, 15, 61, 51, 33, 55, 37,
  57, 19, 37, 63, 49, 11, 29, 55, 41,  3, 21, 47, 33, 59, 13, 39,
  55, 61, 11, 17, 63,  5, 19, 25,  7, 13, 27, 33, 15, 21, 35, 41,
  53, 39, 49, 35, 13, 63,  9, 59, 37, 23, 33, 19, 61, 47, 57, 43,
  51, 17, 23, 53, 27, 57, 63, 29,  3, 33, 39,  5, 43,  9, 15, 45,
  49, 59, 61,  7, 41, 51, 53, 63, 33, 43, 45, 55, 25, 35, 37, 47,
  47, 37, 35, 25, 55, 45, 43, 33, 63, 53, 51, 41,  7, 61, 59, 49,
  45, 15,  9, 43,  5, 39, 33,  3, 29, 63, 57, 27, 53, 23, 17, 51,
  43, 57, 47, 61, 19, 33, 23, 37, 59,  9, 63, 13, 35, 49, 39, 53,
  41, 35, 21, 15, 33, 27, 13,  7, 25, 19,  5, 63, 17, 11, 61, 55,
  39, 13, 59, 33, 47, 21,  3, 41, 55, 29, 11, 49, 63, 37, 19, 57,
  37, 55, 33, 51, 61, 15, 57, 11, 21, 39, 17, 35, 45, 63, 41, 59,
  35, 33,  7,  5, 11,  9, 47, 45, 51, 49, 23, 21, 27, 25, 63, 61,
  33, 11, 45, 23, 25,  3, 37, 15, 17, 59, 29,  7,  9, 51, 21, 63
  };

#ifdef TAB256
  const mp_limb_t mask = (2 << 4) - 2;
#else
  const mp_limb_t mask = (2 << 5) - 2;
#endif

  MPN_FILL (rp, n, 0);

  while (n > 2)
    {
      mp_limb_t a, b, m, cy, signa, signb, maskm;
      int cnt, v2cnt, ucnt, vcnt;

      IFSTAT(n_iter++);

      CMP_SWAP (up, vp, n);

      a = up[0];
      b = vp[0];
      /* Let m = -tabp[] = GMP_NUMB_MAX ^ (tabp[] & mask) */
#ifdef TAB256
      m = NEG_MASK ^ (tabp[((a & mask) << (4 - 1)) + ((b & mask) >> 1)] & mask);
#else
      signa = -((a >> 4) & 2);
      signb = -((b >> 4) & 2);
      /* tabp[- a, b] == - tabp[a, b]; tabp[a, - b] == - tabp[a, b]; tabp[- a, - b] == - - tabp[a, b] */
      m = NEG_MASK ^ ((tabp[(((a^signa) & mask) << (4 - 1)) + (((b^signb) & mask) >> 1)]^signa^signb) & mask);
#endif

      count_leading_zeros (ucnt, up[n - 1]);
      count_leading_zeros (v2cnt, vp[n - 2] | up[n - 1]);
      ASSERT (v2cnt <= ucnt); /* We do not care if V is more than one full limb shorter than U*/
      /* if vp[n-1] == 0 we will discard the value, otherwise clz(vp[n-1] | 1) = clz(vp[n-1]) */
      count_leading_zeros (vcnt, vp[n - 1] | 1);

      /* Compute difference in bitsize, GMP_NUMB_BITS if too large. */
      cnt = (vp[n - 1] == 0 ? GMP_NUMB_BITS + v2cnt : vcnt) - ucnt;
      ASSERT (0 <= cnt && cnt <= GMP_NUMB_BITS);

      /* maskm = ((1 << cnt) - 1) | 1 */
      maskm = GMP_NUMB_MASK >> (GMP_NUMB_BITS - MAX (cnt , 1));
      m &= maskm;

#if 0 && HAVE_NATIVE_mpn_rsh1add_n
      if (UNLIKELY ((((maskm << 1) | 7) & (a+b)) == 0))
	{
	  ASSERT_NOCARRY (mpn_rsh1add_n (up, up, vp, n));
	}
      else
#endif
	{
	  cy = mpn_submul_1 (up, vp, n, m);

	  if (UNLIKELY (cy))
	    mpn_neg (up, up, n);
	}

      a = up[0];
#ifdef TAB256
      ASSERT_ALWAYS ((a & 0x1f & maskm) == 0);
#else
      ASSERT_ALWAYS ((a & 0x3f & maskm) == 0);
#endif

      if (UNLIKELY (a == 0)) /* lowz */
	{
	  mp_size_t i;

	  for (i = 1; (a = up[i]) == 0;)
	    if (++i == n) /* all zeros, we are done. */
	      {
		mpn_copyi (rp, vp, n);
		return;
	      }

	  if ((a & 1) == 0)
	    {
	      int cnt;
	      count_trailing_zeros (cnt, a);
	      mpn_rshift (up, up + i, n - i, cnt);
	    }
	  else
	    {
	      mpn_copyi (up, up + i, n - i);
	    }
	  MPN_FILL (up + n - i, i, 0);
	}
      else
	{
	  count_trailing_zeros (cnt, a);
	  mpn_rshift (up, up, n, cnt);
	}

      n -= (up[n - 1] | vp[n - 1]) == 0;
    }

 mp_double_limb_t g = mpn_gcd_22 (up[1], up[0], vp[1], vp[0]);
 rp[0] = g.d0;
 rp[1] = g.d1;
}

void
mpn_gcd_33 (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
  mpn_gcd_n (rp, up, vp, 3);
}
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to