I tried using a segmented sieve. this avoids multiplication or mod in the
inner loop.
It's an improvement and the code is easy to comprehend.

$ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime -p mpz_nextprime
$ hg import ~/Downloads/nextprime_segment.patch --no-commit
$ make -j4; make -C tune/ speed -j4
$ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime -P mpz_nextprime2

$ pr -m -t mpz_nextprime.data mpz_nextprime2.data | awk '{ print
$0,"\t"$4/$2 }'
100 3.658593750e-05    100     3.326367187e-05 0.909193
110 3.904296875e-05    110     3.598828125e-05 0.921761
121 4.675585937e-05    121     4.157031250e-05 0.889093
133 8.379882812e-05    133     7.977343750e-05 0.951964
146 9.650195312e-05    146     8.803710937e-05 0.912283
160 1.245546875e-04    160     1.032539062e-04 0.828985
176 1.269101562e-04    176     1.173007813e-04 0.924282
193 1.977187500e-04    193     1.846230469e-04 0.933766
212 2.087812500e-04    212     1.933378906e-04 0.926031
233 2.356660156e-04    233     2.218164063e-04 0.941232
256 3.030214844e-04    256     2.868691406e-04 0.946696
281 4.203593750e-04    281     3.908398437e-04 0.929775
500 2.132431641e-03    500     2.013937500e-03 0.944432
1000 2.307201758e-02    1000     2.284832031e-02 0.990304

This can undoubtedly benefit from a larger sieve also,
which I'm testing in a different patch.

On Thu, Mar 5, 2020 at 6:05 PM Seth Troisi <brain...@gmail.com> wrote:

> I tried using a segmented sieve. this avoids multiplication or mod in the
> inner loop.
> It's an improvement and the code is easy to comprehend.
>
>
>
>
> On Wed, Feb 12, 2020 at 10:25 AM Niels Möller <ni...@lysator.liu.se>
> wrote:
>
>> "Marco Bodrato" <bodr...@mail.dm.unipi.it> writes:
>>
>> > Ciao,
>> >
>> > Il Mer, 12 Febbraio 2020 7:26 am, Niels Möller ha scritto:
>> >> And for a small prime gap (common case), this cost should be the main
>> >> part of the sieving. So it would make sense to optimize it. If we also
>> >> use the ptab, we could compute modulo single-limb prime products using
>> >> precomputed constants.
>> >
>> > Yes, of course.
>>
>> Another version below, using ptab. Timing:
>>
>> $ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime
>>         mpz_nextprime
>> 100       0.000116762
>> 110       0.000126770
>> 121       0.000147203
>> 133       0.000257994
>> 146       0.000291051
>> 160       0.000343168
>> 176       0.000386605
>> 193       0.000625994
>> 212       0.000670035
>> 233       0.000763240
>> 256       0.000942154
>> 281       0.001418639
>> 500       0.006632588
>> 1000      0.073935992
>>
>> So also close to 15% speedup for 100 bits, and the same as previous
>> version, almost 25%, for 1000 bits.
>>
>> I used the very crude tuning
>>
>>   limit = (nbits < 200) ? nbits / 2 : PTAB_SIZE;
>>
>> to avoid regressing for smaller sizes.
>>
>> > And the next question will be: should we generate on the fly an even
>> > larger table of primes when the required size grows beyond the
>> > precomputed table?
>>
>> Don't know. I don't quite understand how large a table could be
>> beneficial, but I guess it might make sense to at least limit it to size
>> of L1 cache.
>>
>> Regards,
>> /Niels
>>
>> /* mpz_nextprime(p,t) - compute the next prime > t and store that in p.
>>
>> Copyright 1999-2001, 2008, 2009, 2012 Free Software Foundation, Inc.
>>
>> Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
>>
>> This file is part of the GNU MP Library.
>>
>> The GNU MP Library is free software; you can redistribute it and/or modify
>> it under the terms of either:
>>
>>   * the GNU Lesser General Public License as published by the Free
>>     Software Foundation; either version 3 of the License, or (at your
>>     option) any later version.
>>
>> or
>>
>>   * the GNU General Public License as published by the Free Software
>>     Foundation; either version 2 of the License, or (at your option) any
>>     later version.
>>
>> or both in parallel, as here.
>>
>> The GNU MP Library is distributed in the hope that it will be useful, but
>> WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
>> or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
>> for more details.
>>
>> You should have received copies of the GNU General Public License and the
>> GNU Lesser General Public License along with the GNU MP Library.  If not,
>> see https://www.gnu.org/licenses/.  */
>>
>> #include "gmp-impl.h"
>> #include "longlong.h"
>>
>> #if 1
>> struct gmp_primes_dtab {
>>   mp_limb_t p;
>>   mp_limb_t binv;
>>   mp_limb_t lim;
>> };
>>
>> struct gmp_primes_ptab {
>>   mp_limb_t ppp;        /* primes, multiplied together */
>>   mp_limb_t cps[7];     /* ppp values pre-computed for mpn_mod_1s_4p */
>>   gmp_uint_least32_t idx:24;    /* index of  first primes in dtab */
>>   gmp_uint_least32_t np :8;     /* number of primes related to this entry
>> */
>> };
>>
>> static const struct gmp_primes_dtab dtab[] =
>> {
>> #define WANT_dtab
>> #define P(p,inv,lim) {p, inv,lim}
>> #include "trialdivtab.h"
>> #undef WANT_dtab
>> #undef P
>> };
>>
>> #define DTAB_SIZE (sizeof (dtab) / sizeof (dtab[0]))
>>
>> static const struct gmp_primes_ptab ptab[] =
>> {
>> #define WANT_ptab
>> #include "trialdivtab.h"
>> #undef WANT_ptab
>> };
>>
>> #define PTAB_SIZE (sizeof (ptab) / sizeof (ptab[0]))
>>
>> void
>> mpz_nextprime (mpz_ptr p, mpz_srcptr n)
>> {
>>   mp_limb_t *moduli;
>>   unsigned long difference;
>>   int i;
>>   unsigned prime_limit;
>>   mp_bitcnt_t nbits;
>>   unsigned incr;
>>   TMP_DECL;
>>
>>   /* First handle tiny numbers */
>>   if (mpz_cmp_ui (n, 2) < 0)
>>     {
>>       mpz_set_ui (p, 2);
>>       return;
>>     }
>>   mpz_add_ui (p, n, 1);
>>   mpz_setbit (p, 0);
>>
>>   if (mpz_cmp_ui (p, 7) <= 0)
>>     return;
>>
>>   MPN_SIZEINBASE_2EXP(nbits, PTR(p), SIZ(p), 1);
>>
>>   if (nbits < 200)
>>     {
>>       prime_limit = nbits / 2;
>>       if (SIZ(p) == 1)
>>         {
>>           /* If the prime list includes any prime >= p, do plain
>>              search. */
>>           mp_limb_t limb = PTR(p)[0];
>>           unsigned last = ptab[prime_limit-1].idx +
>> ptab[prime_limit-1].np - 1;
>>           if (dtab[last].p >= limb)
>>             {
>>               while (dtab[last-1].p >= limb)
>>                 last--;
>>               PTR(p)[0] = dtab[last].p;
>>               return;
>>             }
>>         }
>>     }
>>   else
>>     prime_limit = PTAB_SIZE;
>>
>>   TMP_MARK;
>>
>>   /* Compute residues modulo small odd primes */
>>   moduli = TMP_ALLOC_TYPE (prime_limit, mp_limb_t);
>>
>>   for (;;)
>>     {
>>       for (i = 0; i < prime_limit; i++)
>>         {
>>           mp_limb_t ppp = ptab[i].ppp;
>>           const mp_limb_t *cps = ptab[i].cps;
>>           moduli[i] = mpn_mod_1s_4p (PTR(p), SIZ(p), ppp << cps[1], cps);
>>         }
>>
>> #define INCR_LIMIT 0x10000      /* deep science */
>>
>>       for (difference = incr = 0; incr < INCR_LIMIT; difference += 2)
>>         {
>>           /* First check divisibility based on prime list */
>>           for (i = 0; i < prime_limit; i++)
>>             {
>>               /* FIXME: Ensure that this addition can't overflow */
>>               mp_limb_t m = moduli[i] + incr;
>>               int idx = ptab[i].idx;
>>               int np = ptab[i].np;
>>               int j;
>>
>>               for (j = 0; j < np; j++)
>>                 if (m * dtab[idx+j].binv <= dtab[idx+j].lim)
>>                   goto next;
>>             }
>>
>>           mpz_add_ui (p, p, difference);
>>           difference = 0;
>>
>>           /* Miller-Rabin test */
>>           if (mpz_millerrabin (p, 25))
>>             goto done;
>>         next:;
>>           incr += 2;
>>         }
>>       mpz_add_ui (p, p, difference);
>>       difference = 0;
>>     }
>>  done:
>>   TMP_FREE;
>> }
>> #else
>> static const unsigned char primegap[] =
>> {
>>   2,2,4,2,4,2,4,6,2,6,4,2,4,6,6,2,6,4,2,6,4,6,8,4,2,4,2,4,14,4,6,
>>   2,10,2,6,6,4,6,6,2,10,2,4,2,12,12,4,2,4,6,2,10,6,6,6,2,6,4,2,10,14,4,2,
>>   4,14,6,10,2,4,6,8,6,6,4,6,8,4,8,10,2,10,2,6,4,6,8,4,2,4,12,8,4,8,4,6,
>>
>> 12,2,18,6,10,6,6,2,6,10,6,6,2,6,6,4,2,12,10,2,4,6,6,2,12,4,6,8,10,8,10,8,
>>
>> 6,6,4,8,6,4,8,4,14,10,12,2,10,2,4,2,10,14,4,2,4,14,4,2,4,20,4,8,10,8,4,6,
>>   6,14,4,6,6,8,6,12
>> };
>>
>> #define NUMBER_OF_PRIMES 167
>>
>> void
>> mpz_nextprime (mpz_ptr p, mpz_srcptr n)
>> {
>>   unsigned short *moduli;
>>   unsigned long difference;
>>   int i;
>>   unsigned prime_limit;
>>   unsigned long prime;
>>   mp_size_t pn;
>>   mp_bitcnt_t nbits;
>>   unsigned incr;
>>   TMP_SDECL;
>>
>>   /* First handle tiny numbers */
>>   if (mpz_cmp_ui (n, 2) < 0)
>>     {
>>       mpz_set_ui (p, 2);
>>       return;
>>     }
>>   mpz_add_ui (p, n, 1);
>>   mpz_setbit (p, 0);
>>
>>   if (mpz_cmp_ui (p, 7) <= 0)
>>     return;
>>
>>   pn = SIZ(p);
>>   MPN_SIZEINBASE_2EXP(nbits, PTR(p), pn, 1);
>>   if (nbits / 2 >= NUMBER_OF_PRIMES)
>>     prime_limit = NUMBER_OF_PRIMES - 1;
>>   else
>>     prime_limit = nbits / 2;
>>
>>   TMP_SMARK;
>>
>>   /* Compute residues modulo small odd primes */
>>   moduli = TMP_SALLOC_TYPE (prime_limit, unsigned short);
>>
>>   for (;;)
>>     {
>>       /* FIXME: Compute lazily? */
>>       prime = 3;
>>       for (i = 0; i < prime_limit; i++)
>>         {
>>           moduli[i] = mpz_tdiv_ui (p, prime);
>>           prime += primegap[i];
>>         }
>>
>> #define INCR_LIMIT 0x10000      /* deep science */
>>
>>       for (difference = incr = 0; incr < INCR_LIMIT; difference += 2)
>>         {
>>           /* First check residues */
>>           prime = 3;
>>           for (i = 0; i < prime_limit; i++)
>>             {
>>               unsigned r;
>>               /* FIXME: Reduce moduli + incr and store back, to allow for
>>                  division-free reductions.  Alternatively, table
>> primes[]'s
>>                  inverses (mod 2^16).  */
>>               r = (moduli[i] + incr) % prime;
>>               prime += primegap[i];
>>
>>               if (r == 0)
>>                 goto next;
>>             }
>>
>>           mpz_add_ui (p, p, difference);
>>           difference = 0;
>>
>>           /* Miller-Rabin test */
>>           if (mpz_millerrabin (p, 25))
>>             goto done;
>>         next:;
>>           incr += 2;
>>         }
>>       mpz_add_ui (p, p, difference);
>>       difference = 0;
>>     }
>>  done:
>>   TMP_SFREE;
>> }
>>
>> #endif
>>
>> --
>> Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
>> Internet email is subject to wholesale government surveillance.
>>
>
diff -r bcca14c8a090 mpz/nextprime.c
--- a/mpz/nextprime.c	Thu Mar 05 00:43:26 2020 +0100
+++ b/mpz/nextprime.c	Fri Mar 06 13:55:09 2020 -0800
@@ -30,6 +30,8 @@
 GNU Lesser General Public License along with the GNU MP Library.  If not,
 see https://www.gnu.org/licenses/.  */
 
+#include <string.h>
+
 #include "gmp-impl.h"
 #include "longlong.h"
 
@@ -48,11 +50,13 @@
 void
 mpz_nextprime (mpz_ptr p, mpz_srcptr n)
 {
-  unsigned short *moduli;
+  unsigned short *next_mult;
+  char *sieve;
   unsigned long difference;
-  int i;
+  int i, m;
   unsigned prime_limit;
   unsigned long prime;
+  unsigned two_prime;
   mp_size_t pn;
   mp_bitcnt_t nbits;
   unsigned incr;
@@ -79,49 +83,59 @@
 
   TMP_SMARK;
 
-  /* Compute residues modulo small odd primes */
-  moduli = TMP_SALLOC_TYPE (prime_limit, unsigned short);
+  /* Compute next multiple for small odd primes */
+  next_mult = TMP_SALLOC_TYPE (prime_limit, unsigned short);
+
+  #define INCR_LIMIT 0x400
+  sieve = TMP_SALLOC_TYPE (INCR_LIMIT, char);
+  memset(sieve, 0, INCR_LIMIT);
 
+  /* Invert p for modulo */
+  mpz_neg(p, p);
+  prime = 3;
+  for (i = 0; i < prime_limit; i++)
+    {
+      m = mpz_fdiv_ui(p, prime);
+      /* Only care about odd multiplies */
+      if (m & 1)
+        m += prime;
+
+      /* mark off any composites in sieve */
+      for (; m < INCR_LIMIT; m += 2*prime)
+        sieve[m] = 1;
+      next_mult[i] = m;
+      prime += primegap[i];
+    }
+  mpz_neg(p, p);
+
+  difference = 0;
   for (;;)
     {
-      /* FIXME: Compute lazily? */
-      prime = 3;
-      for (i = 0; i < prime_limit; i++)
-	{
-	  moduli[i] = mpz_tdiv_ui (p, prime);
-	  prime += primegap[i];
-	}
-
-#define INCR_LIMIT 0x10000	/* deep science */
-
-      for (difference = incr = 0; incr < INCR_LIMIT; difference += 2)
+      for (incr = 0; incr < INCR_LIMIT; difference += 2, incr += 2)
 	{
-	  /* First check residues */
-	  prime = 3;
-	  for (i = 0; i < prime_limit; i++)
-	    {
-	      unsigned r;
-	      /* FIXME: Reduce moduli + incr and store back, to allow for
-		 division-free reductions.  Alternatively, table primes[]'s
-		 inverses (mod 2^16).  */
-	      r = (moduli[i] + incr) % prime;
-	      prime += primegap[i];
-
-	      if (r == 0)
-		goto next;
-	    }
+          if (sieve[incr] == 1)
+            continue;
 
 	  mpz_add_ui (p, p, difference);
 	  difference = 0;
 
 	  /* Miller-Rabin test */
-	  if (mpz_millerrabin (p, 25))
+	  if (mpz_millerrabin (p, 25)) {
 	    goto done;
-	next:;
-	  incr += 2;
+          }
 	}
-      mpz_add_ui (p, p, difference);
-      difference = 0;
+
+      /* Sieve next segment */
+      two_prime = 6;
+      memset(sieve, 0, INCR_LIMIT);
+      for (i = 0; i < prime_limit; i++)
+        {
+          m = next_mult[i] - INCR_LIMIT;
+          for (; m < INCR_LIMIT; m += two_prime)
+            sieve[m] = 1;
+          next_mult[i] = m;
+          two_prime += 2 * primegap[i];
+        }
     }
  done:
   TMP_SFREE;
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to