>From Marco Bodrato's analysis I got inspired and modified nextprime to uses a much larger segmented sieve and tuned the prime limit for large inputs. You can see some of my thresholds and testing input in this google sheet[1]. The end result is sieve up ~ LOG2(N) ^ (20/7) / 3268 For 1000 bit numbers this sieves up to 114,000 (in 1e-5 seconds) and using the resulting 11k primes for trial division. For 10,000 bit numbers this sieves up to 80 million (which takes ~4 seconds but reduces nextprime from ~300seconds to ~200 seconds)
Compared with the segmented sieve version this ends up being 1.3x-2.5x faster for numbers > 500 bits. I have detailed numbers justifying my calculations in the google sheet[1] and colab python optimizer script[2] My code I used to benchmark attached in the first patch: tune.patch It prints some timing info at a number of different input sizes $ hg import ~/Downloads/tune.patch --no-commit $ cd ~/Projects/gmp/build/ && make -j8 >> /dev/null && cd tests/mpz && make clean t-nextprime-tune >> /dev/null && time ./t-nextprime-tune 32, count: 14062, gap: 302639, 0.08919 seconds = avg 0.00001 64, count: 7031, gap: 302559, 0.10212 seconds = avg 0.00001 128, count: 3515, gap: 312423, 0.20790 seconds = avg 0.00006 256, count: 1757, gap: 304433, 0.57408 seconds = avg 0.00033 1000, count: 450, gap: 316709, 11.79883 seconds = avg 0.02622 2000, count: 225, gap: 313179, 80.45503 seconds = avg 0.35758 My improvements to next_prime (with debugging print statements) are included in the 2nd patch nextprime_sieve_debug.patch $ hg import ~/Downloads/nextprime_sieve_debug.patch --force --no-commit 32 bits, 16 primes 0.0000 , mod 0.0000 | => 2 tests 64 bits, 32 primes 0.0000 , mod 0.0000 | => 4 tests 128 bits, 64 primes 0.0000 , mod 0.0000 | => 1 tests Pi(2324) = 342 | 256 bits, 342 primes 0.0000 , mod 0.0000 | => 8 tests 256, count: 1757, gap: 304433, 0.49184 seconds = avg 0.00028 Pi(114063) = 10792 | 1000 bits, 10792 primes 0.0002 , mod 0.0007 | => 59 tests 1000, count: 450, gap: 316709, 7.44536 seconds = avg 0.01655 Pi(826479) = 65910 | 2000 bits, 65910 primes 0.0012 , mod 0.0051 | => 18 tests 2000, count: 225, gap: 313179, 44.00031 seconds = avg 0.19556 I also tested with speed (thanks for helping me get that committed last year) $ hg revert mpz/nextprime.c $ hg import ~/Downloads/nextprime_sieve_clean.patch --force --no-commit $ cd build; make clean -j4; make -C tune/ speed -j4 $ ./tune/speed -s 300,400,500,750,1000 mpz_nextprime -P time_nextprime2 (run before change and save in time_nextprime) $ pr -m -t time_nextprime.data time_nextprime2.data | awk '{ print $0,"\t"$4/$2 }' 300 4.754023437e-04 300 4.111347656e-04 0.864814 400 1.221230469e-03 400 1.058787109e-03 0.866984 500 2.007132813e-03 500 1.590521484e-03 0.792435 750 8.607291016e-03 750 5.961019531e-03 0.692555 1000 2.284160547e-02 1000 1.475637695e-02 0.646031 I cleaned up the code without any of the debug printing in the 3rd patch nextprime_sieve_clean.patch which I'd love people to consider for submission [1] https://docs.google.com/spreadsheets/d/1sVm0ZGxFIASj5drxznxjrtTNMGgNXbZ4BZukReQlPVM/edit?usp=sharing [2] https://colab.research.google.com/drive/1UZyBbiRfuhFwQxM_N_ahXH0nNy9b-hXP On Fri, Mar 6, 2020 at 4:03 PM Seth Troisi <brain...@gmail.com> wrote: > Some more justification for my patch > > I wrote some code that replaces mpz_millerrabin with a static lookup and > ran it for a number of values. > > Existing Code (with real mpz_millerrabin) > 100, gap: 1533, 0.00164 seconds > 200, gap: 2547, 0.00675 seconds > 300, gap: 3757, 0.01161 seconds > 500, gap: 3765, 0.02568 seconds > 1000, gap: 20835, 0.67636 seconds > 2000, gap: 23685, 6.03032 seconds > 5000, gap: 58867, 167.65375 seconds > > With is_prime_lookup (e.g. nearly instant mpz_millerrabin) > 100, gap: 1533, 0.00018 seconds > 200, gap: 2547, 0.00045 seconds > 300, gap: 3757, 0.00087 seconds > 500, gap: 3765, 0.00085 seconds > 1000, gap: 27393, 0.00580 seconds > 2000, gap: 27711, 0.00557 seconds > 5000, gap: 74443, 0.01320 seconds > > Code in nextprime_segment.patch (see previous message) with is_prime_lookup > 100, gap: 1533, 0.00011 seconds > 200, gap: 2547, 0.00015 seconds > 300, gap: 3757, 0.00018 seconds > 500, gap: 3765, 0.00023 seconds > 1000, gap: 27393, 0.00061 seconds > 2000, gap: 27711, 0.00070 seconds > 5000, gap: 74443, 0.00198 seconds > > So the overhead from the existing code is less than > 6% at 200 bits > 3% at 500 bits > 1% at 1000 bits. > > The new code improves this to > 2% at 200 bits > 1% at 500 bits > 0.1% at 1000 bits > > I'm going to use this program to tune a new prime_limit on the range > 300-5000 bits which is harder to tune with tune/speed. > > > On Fri, Mar 6, 2020 at 1:55 PM Seth Troisi <brain...@gmail.com> wrote: > >> I tried using a segmented sieve. this avoids multiplication or mod in the >> inner loop. >> It's an improvement and the code is easy to comprehend. >> >> $ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime -p mpz_nextprime >> $ hg import ~/Downloads/nextprime_segment.patch --no-commit >> $ make -j4; make -C tune/ speed -j4 >> $ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime -P mpz_nextprime2 >> >> $ pr -m -t mpz_nextprime.data mpz_nextprime2.data | awk '{ print >> $0,"\t"$4/$2 }' >> 100 3.658593750e-05 100 3.326367187e-05 0.909193 >> 110 3.904296875e-05 110 3.598828125e-05 0.921761 >> 121 4.675585937e-05 121 4.157031250e-05 0.889093 >> 133 8.379882812e-05 133 7.977343750e-05 0.951964 >> 146 9.650195312e-05 146 8.803710937e-05 0.912283 >> 160 1.245546875e-04 160 1.032539062e-04 0.828985 >> 176 1.269101562e-04 176 1.173007813e-04 0.924282 >> 193 1.977187500e-04 193 1.846230469e-04 0.933766 >> 212 2.087812500e-04 212 1.933378906e-04 0.926031 >> 233 2.356660156e-04 233 2.218164063e-04 0.941232 >> 256 3.030214844e-04 256 2.868691406e-04 0.946696 >> 281 4.203593750e-04 281 3.908398437e-04 0.929775 >> 500 2.132431641e-03 500 2.013937500e-03 0.944432 >> 1000 2.307201758e-02 1000 2.284832031e-02 0.990304 >> >> This can undoubtedly benefit from a larger sieve also, >> which I'm testing in a different patch. >> >> On Thu, Mar 5, 2020 at 6:05 PM Seth Troisi <brain...@gmail.com> wrote: >> >>> I tried using a segmented sieve. this avoids multiplication or mod in >>> the inner loop. >>> It's an improvement and the code is easy to comprehend. >>> >>> >>> >>> >>> On Wed, Feb 12, 2020 at 10:25 AM Niels Möller <ni...@lysator.liu.se> >>> wrote: >>> >>>> "Marco Bodrato" <bodr...@mail.dm.unipi.it> writes: >>>> >>>> > Ciao, >>>> > >>>> > Il Mer, 12 Febbraio 2020 7:26 am, Niels Möller ha scritto: >>>> >> And for a small prime gap (common case), this cost should be the main >>>> >> part of the sieving. So it would make sense to optimize it. If we >>>> also >>>> >> use the ptab, we could compute modulo single-limb prime products >>>> using >>>> >> precomputed constants. >>>> > >>>> > Yes, of course. >>>> >>>> Another version below, using ptab. Timing: >>>> >>>> $ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime >>>> mpz_nextprime >>>> 100 0.000116762 >>>> 110 0.000126770 >>>> 121 0.000147203 >>>> 133 0.000257994 >>>> 146 0.000291051 >>>> 160 0.000343168 >>>> 176 0.000386605 >>>> 193 0.000625994 >>>> 212 0.000670035 >>>> 233 0.000763240 >>>> 256 0.000942154 >>>> 281 0.001418639 >>>> 500 0.006632588 >>>> 1000 0.073935992 >>>> >>>> So also close to 15% speedup for 100 bits, and the same as previous >>>> version, almost 25%, for 1000 bits. >>>> >>>> I used the very crude tuning >>>> >>>> limit = (nbits < 200) ? nbits / 2 : PTAB_SIZE; >>>> >>>> to avoid regressing for smaller sizes. >>>> >>>> > And the next question will be: should we generate on the fly an even >>>> > larger table of primes when the required size grows beyond the >>>> > precomputed table? >>>> >>>> Don't know. I don't quite understand how large a table could be >>>> beneficial, but I guess it might make sense to at least limit it to size >>>> of L1 cache. >>>> >>>> Regards, >>>> /Niels >>>> >>>> /* mpz_nextprime(p,t) - compute the next prime > t and store that in p. >>>> >>>> Copyright 1999-2001, 2008, 2009, 2012 Free Software Foundation, Inc. >>>> >>>> Contributed to the GNU project by Niels Möller and Torbjorn Granlund. >>>> >>>> This file is part of the GNU MP Library. >>>> >>>> The GNU MP Library is free software; you can redistribute it and/or >>>> modify >>>> it under the terms of either: >>>> >>>> * the GNU Lesser General Public License as published by the Free >>>> Software Foundation; either version 3 of the License, or (at your >>>> option) any later version. >>>> >>>> or >>>> >>>> * the GNU General Public License as published by the Free Software >>>> Foundation; either version 2 of the License, or (at your option) any >>>> later version. >>>> >>>> or both in parallel, as here. >>>> >>>> The GNU MP Library is distributed in the hope that it will be useful, >>>> but >>>> WITHOUT ANY WARRANTY; without even the implied warranty of >>>> MERCHANTABILITY >>>> or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License >>>> for more details. >>>> >>>> You should have received copies of the GNU General Public License and >>>> the >>>> GNU Lesser General Public License along with the GNU MP Library. If >>>> not, >>>> see https://www.gnu.org/licenses/. */ >>>> >>>> #include "gmp-impl.h" >>>> #include "longlong.h" >>>> >>>> #if 1 >>>> struct gmp_primes_dtab { >>>> mp_limb_t p; >>>> mp_limb_t binv; >>>> mp_limb_t lim; >>>> }; >>>> >>>> struct gmp_primes_ptab { >>>> mp_limb_t ppp; /* primes, multiplied together */ >>>> mp_limb_t cps[7]; /* ppp values pre-computed for mpn_mod_1s_4p */ >>>> gmp_uint_least32_t idx:24; /* index of first primes in dtab */ >>>> gmp_uint_least32_t np :8; /* number of primes related to this >>>> entry */ >>>> }; >>>> >>>> static const struct gmp_primes_dtab dtab[] = >>>> { >>>> #define WANT_dtab >>>> #define P(p,inv,lim) {p, inv,lim} >>>> #include "trialdivtab.h" >>>> #undef WANT_dtab >>>> #undef P >>>> }; >>>> >>>> #define DTAB_SIZE (sizeof (dtab) / sizeof (dtab[0])) >>>> >>>> static const struct gmp_primes_ptab ptab[] = >>>> { >>>> #define WANT_ptab >>>> #include "trialdivtab.h" >>>> #undef WANT_ptab >>>> }; >>>> >>>> #define PTAB_SIZE (sizeof (ptab) / sizeof (ptab[0])) >>>> >>>> void >>>> mpz_nextprime (mpz_ptr p, mpz_srcptr n) >>>> { >>>> mp_limb_t *moduli; >>>> unsigned long difference; >>>> int i; >>>> unsigned prime_limit; >>>> mp_bitcnt_t nbits; >>>> unsigned incr; >>>> TMP_DECL; >>>> >>>> /* First handle tiny numbers */ >>>> if (mpz_cmp_ui (n, 2) < 0) >>>> { >>>> mpz_set_ui (p, 2); >>>> return; >>>> } >>>> mpz_add_ui (p, n, 1); >>>> mpz_setbit (p, 0); >>>> >>>> if (mpz_cmp_ui (p, 7) <= 0) >>>> return; >>>> >>>> MPN_SIZEINBASE_2EXP(nbits, PTR(p), SIZ(p), 1); >>>> >>>> if (nbits < 200) >>>> { >>>> prime_limit = nbits / 2; >>>> if (SIZ(p) == 1) >>>> { >>>> /* If the prime list includes any prime >= p, do plain >>>> search. */ >>>> mp_limb_t limb = PTR(p)[0]; >>>> unsigned last = ptab[prime_limit-1].idx + >>>> ptab[prime_limit-1].np - 1; >>>> if (dtab[last].p >= limb) >>>> { >>>> while (dtab[last-1].p >= limb) >>>> last--; >>>> PTR(p)[0] = dtab[last].p; >>>> return; >>>> } >>>> } >>>> } >>>> else >>>> prime_limit = PTAB_SIZE; >>>> >>>> TMP_MARK; >>>> >>>> /* Compute residues modulo small odd primes */ >>>> moduli = TMP_ALLOC_TYPE (prime_limit, mp_limb_t); >>>> >>>> for (;;) >>>> { >>>> for (i = 0; i < prime_limit; i++) >>>> { >>>> mp_limb_t ppp = ptab[i].ppp; >>>> const mp_limb_t *cps = ptab[i].cps; >>>> moduli[i] = mpn_mod_1s_4p (PTR(p), SIZ(p), ppp << cps[1], >>>> cps); >>>> } >>>> >>>> #define INCR_LIMIT 0x10000 /* deep science */ >>>> >>>> for (difference = incr = 0; incr < INCR_LIMIT; difference += 2) >>>> { >>>> /* First check divisibility based on prime list */ >>>> for (i = 0; i < prime_limit; i++) >>>> { >>>> /* FIXME: Ensure that this addition can't overflow */ >>>> mp_limb_t m = moduli[i] + incr; >>>> int idx = ptab[i].idx; >>>> int np = ptab[i].np; >>>> int j; >>>> >>>> for (j = 0; j < np; j++) >>>> if (m * dtab[idx+j].binv <= dtab[idx+j].lim) >>>> goto next; >>>> } >>>> >>>> mpz_add_ui (p, p, difference); >>>> difference = 0; >>>> >>>> /* Miller-Rabin test */ >>>> if (mpz_millerrabin (p, 25)) >>>> goto done; >>>> next:; >>>> incr += 2; >>>> } >>>> mpz_add_ui (p, p, difference); >>>> difference = 0; >>>> } >>>> done: >>>> TMP_FREE; >>>> } >>>> #else >>>> static const unsigned char primegap[] = >>>> { >>>> 2,2,4,2,4,2,4,6,2,6,4,2,4,6,6,2,6,4,2,6,4,6,8,4,2,4,2,4,14,4,6, >>>> >>>> 2,10,2,6,6,4,6,6,2,10,2,4,2,12,12,4,2,4,6,2,10,6,6,6,2,6,4,2,10,14,4,2, >>>> 4,14,6,10,2,4,6,8,6,6,4,6,8,4,8,10,2,10,2,6,4,6,8,4,2,4,12,8,4,8,4,6, >>>> >>>> 12,2,18,6,10,6,6,2,6,10,6,6,2,6,6,4,2,12,10,2,4,6,6,2,12,4,6,8,10,8,10,8, >>>> >>>> 6,6,4,8,6,4,8,4,14,10,12,2,10,2,4,2,10,14,4,2,4,14,4,2,4,20,4,8,10,8,4,6, >>>> 6,14,4,6,6,8,6,12 >>>> }; >>>> >>>> #define NUMBER_OF_PRIMES 167 >>>> >>>> void >>>> mpz_nextprime (mpz_ptr p, mpz_srcptr n) >>>> { >>>> unsigned short *moduli; >>>> unsigned long difference; >>>> int i; >>>> unsigned prime_limit; >>>> unsigned long prime; >>>> mp_size_t pn; >>>> mp_bitcnt_t nbits; >>>> unsigned incr; >>>> TMP_SDECL; >>>> >>>> /* First handle tiny numbers */ >>>> if (mpz_cmp_ui (n, 2) < 0) >>>> { >>>> mpz_set_ui (p, 2); >>>> return; >>>> } >>>> mpz_add_ui (p, n, 1); >>>> mpz_setbit (p, 0); >>>> >>>> if (mpz_cmp_ui (p, 7) <= 0) >>>> return; >>>> >>>> pn = SIZ(p); >>>> MPN_SIZEINBASE_2EXP(nbits, PTR(p), pn, 1); >>>> if (nbits / 2 >= NUMBER_OF_PRIMES) >>>> prime_limit = NUMBER_OF_PRIMES - 1; >>>> else >>>> prime_limit = nbits / 2; >>>> >>>> TMP_SMARK; >>>> >>>> /* Compute residues modulo small odd primes */ >>>> moduli = TMP_SALLOC_TYPE (prime_limit, unsigned short); >>>> >>>> for (;;) >>>> { >>>> /* FIXME: Compute lazily? */ >>>> prime = 3; >>>> for (i = 0; i < prime_limit; i++) >>>> { >>>> moduli[i] = mpz_tdiv_ui (p, prime); >>>> prime += primegap[i]; >>>> } >>>> >>>> #define INCR_LIMIT 0x10000 /* deep science */ >>>> >>>> for (difference = incr = 0; incr < INCR_LIMIT; difference += 2) >>>> { >>>> /* First check residues */ >>>> prime = 3; >>>> for (i = 0; i < prime_limit; i++) >>>> { >>>> unsigned r; >>>> /* FIXME: Reduce moduli + incr and store back, to allow >>>> for >>>> division-free reductions. Alternatively, table >>>> primes[]'s >>>> inverses (mod 2^16). */ >>>> r = (moduli[i] + incr) % prime; >>>> prime += primegap[i]; >>>> >>>> if (r == 0) >>>> goto next; >>>> } >>>> >>>> mpz_add_ui (p, p, difference); >>>> difference = 0; >>>> >>>> /* Miller-Rabin test */ >>>> if (mpz_millerrabin (p, 25)) >>>> goto done; >>>> next:; >>>> incr += 2; >>>> } >>>> mpz_add_ui (p, p, difference); >>>> difference = 0; >>>> } >>>> done: >>>> TMP_SFREE; >>>> } >>>> >>>> #endif >>>> >>>> -- >>>> Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677. >>>> Internet email is subject to wholesale government surveillance. >>>> >>>
diff -r bcca14c8a090 tests/mpz/Makefile.am --- a/tests/mpz/Makefile.am Thu Mar 05 00:43:26 2020 +0100 +++ b/tests/mpz/Makefile.am Sun Mar 08 17:26:51 2020 -0700 @@ -30,7 +30,8 @@ t-fac_ui t-mfac_uiui t-primorial_ui t-fib_ui t-lucnum_ui t-scan t-fits \ t-divis t-divis_2exp t-cong t-cong_2exp t-sizeinbase t-set_str \ t-aorsmul t-cmp_d t-cmp_si t-hamdist t-oddeven t-popcount t-set_f \ - t-io_raw t-import t-export t-pprime_p t-nextprime t-remove t-limbs + t-io_raw t-import t-export t-pprime_p t-nextprime t-remove t-limbs \ + t-nextprime-tune TESTS = $(check_PROGRAMS) diff -r bcca14c8a090 tests/mpz/t-nextprime-tune.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/mpz/t-nextprime-tune.c Sun Mar 08 17:26:51 2020 -0700 @@ -0,0 +1,83 @@ +/* Test mpz_nextprime. + +Copyright 2009, 2015, 2018, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + + +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> + +#include "gmp-impl.h" +#include "tests.h" + +int +main (int argc, char **argv) +{ + gmp_randstate_ptr rands; + int reps = 20; + + tests_start(); + + struct timeval tval_before, tval_after, tval_result; + + mpz_t n, m; + mpz_init(n); + mpz_init(m); + + //int sizes[] = {100, 200, 300, 500, 1000, 2000, 5000}; + //int sizes[] = {300, 500, 1000, 2000}; + //int sizes[] = {3000, 5000}; + int sizes[] = {32, 64, 128, 256, +// 180, 200, 220, +// 400, 500, 600, 700, 800, 900, + 1000, 1250, 1500, 1750, 2000, 2500, 3000, 4000, + 5000, 7500, + 10000, 12500, 15000 + }; + + for (int i = 0; i < sizeof(sizes)/sizeof(sizes[0]); i++) { + int size = sizes[i]; + + mpz_ui_pow_ui(n, 2, size-1); + mpz_set(m, n); + + gettimeofday(&tval_before, NULL); + +// int counts = 5; +// int counts = 50; + int counts = 30 * 15000 / size; + for (int t = 0; t < counts; t++) { + mpz_nextprime(n, n); + } + + gettimeofday(&tval_after, NULL); + timersub(&tval_after, &tval_before, &tval_result); + float seconds = tval_result.tv_sec; + seconds += tval_result.tv_usec / 1e6; + + mpz_sub(m, n, m); + gmp_printf("%d, count: %d, gap: %Zd, %.5f seconds = avg %.5f\n\n\n", + size, counts, m, seconds, seconds / counts); + } + + mpz_clear(n); + mpz_clear(m); + + tests_end (); + return 0; +}
diff -r bcca14c8a090 mpz/nextprime.c --- a/mpz/nextprime.c Thu Mar 05 00:43:26 2020 +0100 +++ b/mpz/nextprime.c Sun Mar 08 17:30:44 2020 -0700 @@ -30,10 +30,64 @@ GNU Lesser General Public License along with the GNU MP Library. If not, see https://www.gnu.org/licenses/. */ +#include <string.h> +#include <sys/time.h> + #include "gmp-impl.h" #include "longlong.h" -static const unsigned char primegap[] = +/*********************************************************/ +/* Section sieve: sieving functions and tools for primes */ +/*********************************************************/ + +static mp_limb_t +id_to_n (mp_limb_t id) { return id*3+1+(id&1); } + +static mp_limb_t +n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; } + +static mp_size_t +primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; } + + +/*************************************************************/ +/* Section macros: common macros, for swing/fac/bin (&sieve) */ +/*************************************************************/ + +#define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) \ + __max_i = (end); \ + \ + do { \ + ++__i; \ + if (((sieve)[__index] & __mask) == 0) \ + { \ + mp_limb_t prime; \ + prime = id_to_n(__i) + +#define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve) \ + do { \ + mp_limb_t __mask, __index, __max_i, __i; \ + \ + __i = (start)-(off); \ + __index = __i / GMP_LIMB_BITS; \ + __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS); \ + __i += (off); \ + \ + LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) + +#define LOOP_ON_SIEVE_STOP \ + } \ + __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1); \ + __index += __mask & 1; \ + } while (__i <= __max_i) + +#define LOOP_ON_SIEVE_END \ + LOOP_ON_SIEVE_STOP; \ + } while (0) + + + +static const unsigned char primegap_small[] = { 2,2,4,2,4,2,4,6,2,6,4,2,4,6,6,2,6,4,2,6,4,6,8,4,2,4,2,4,14,4,6, 2,10,2,6,6,4,6,6,2,10,2,4,2,12,12,4,2,4,6,2,10,6,6,6,2,6,4,2,10,14,4,2, @@ -48,15 +102,17 @@ void mpz_nextprime (mpz_ptr p, mpz_srcptr n) { - unsigned short *moduli; + unsigned int *next_mult; + char *sieve; + unsigned char *primegap; unsigned long difference; - int i; + int i, m; unsigned prime_limit; unsigned long prime; + unsigned two_prime; mp_size_t pn; mp_bitcnt_t nbits; unsigned incr; - TMP_SDECL; /* First handle tiny numbers */ if (mpz_cmp_ui (n, 2) < 0) @@ -70,59 +126,157 @@ if (mpz_cmp_ui (p, 7) <= 0) return; + TMP_DECL; + TMP_SDECL; + + // Question: Is this needed. + TMP_MARK; + TMP_SMARK; + + // Temp code for benchmarking. + int print = mpz_fdiv_ui(p, 65536) <= 1; + struct timeval tval_before, tval_after, tval_result; + if (print) + gettimeofday(&tval_before, NULL); + pn = SIZ(p); MPN_SIZEINBASE_2EXP(nbits, PTR(p), pn, 1); - if (nbits / 2 >= NUMBER_OF_PRIMES) - prime_limit = NUMBER_OF_PRIMES - 1; + if (nbits <= 199) + { + prime_limit = nbits / 2; + primegap = primegap_small; + } else - prime_limit = nbits / 2; + { + // Estimate a good sieve bound. Based on derivative of + // Merten's 3rd theorem * avg gap * cost of mod + // vs + // Cost of PRP test O(N^2.55) + + // 3.06e-4 * nbits ^ 2.86 ~= nbits ^ (20/7) / 1880 + mp_limb_t *sieve; + mpz_t tmp; + + mpz_init (tmp); + mpz_ui_pow_ui (tmp, nbits, 20); + mpz_root (tmp, tmp, 7); + mpz_tdiv_q_ui(tmp, tmp, 3268); + + // A reasonable bound if something goes wrong + long sieve_limit = 50000; + // avoid having a primegap > 255, first occurence: 436,273,009 + // TODO: break the rare gap larger than this into gaps <= 255 + if (mpz_cmp_ui(tmp, 430000000) <= 0) + sieve_limit = mpz_get_ui(tmp); + + mpz_clear (tmp); + ASSERT( 0 < sieve_limit && sieve_limit < 436000000 ); + + sieve = TMP_ALLOC_LIMBS (primesieve_size (sieve_limit)); + int prime_limit_pre = gmp_primesieve(sieve, sieve_limit); + primegap = TMP_ALLOC_TYPE (prime_limit_pre, unsigned char); + + prime_limit = 0; + int last = 3; + LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), n_to_bit (sieve_limit), 0, sieve); + primegap[prime_limit++] = prime - last; + last = prime; + LOOP_ON_SIEVE_END; + + ASSERT(prime_limit == prime_limit_pre); - TMP_SMARK; + if (print) { + gmp_printf("Pi(%d) = %d | ", sieve_limit, prime_limit); + } +//*/ + } + + // Temp code for benchmarking. + if (print) { + gettimeofday(&tval_after, NULL); + timersub(&tval_after, &tval_before, &tval_result); + float seconds = tval_result.tv_sec; + seconds += tval_result.tv_usec / 1e6; + gmp_printf("%d bits, %d primes %.4f ", nbits, prime_limit, seconds); + } + + /* Compute next multiple for small odd primes */ + next_mult = TMP_ALLOC_TYPE (prime_limit, unsigned int); + + // TODO set this based on nbits. + #define INCR_LIMIT 0x400 + sieve = TMP_SALLOC_TYPE (INCR_LIMIT, char); + memset(sieve, 0, INCR_LIMIT); - /* Compute residues modulo small odd primes */ - moduli = TMP_SALLOC_TYPE (prime_limit, unsigned short); + /* Invert p for modulo */ + mpz_neg(p, p); + prime = 3; + for (i = 0; i < prime_limit; i++) + { + m = mpz_fdiv_ui(p, prime); + /* Only care about odd multiplies */ + if (m & 1) + m += prime; + /* mark off any composites in sieve */ + for (; m < INCR_LIMIT; m += 2*prime) + sieve[m] = 1; + next_mult[i] = m; + prime += primegap[i]; + } + mpz_neg(p, p); + + // Temp code for benchmarking. + if (print) { + gettimeofday(&tval_before, NULL); + timersub(&tval_before, &tval_after, &tval_result); + float seconds = tval_result.tv_sec; + seconds += tval_result.tv_usec / 1e6; + gmp_printf(", mod %.4f | ", seconds); + } + + int tests = 0; + //goto done; + + difference = 0; for (;;) { - /* FIXME: Compute lazily? */ - prime = 3; - for (i = 0; i < prime_limit; i++) - { - moduli[i] = mpz_tdiv_ui (p, prime); - prime += primegap[i]; - } + for (incr = 0; incr < INCR_LIMIT; difference += 2, incr += 2) + { + if (sieve[incr] == 1) + continue; -#define INCR_LIMIT 0x10000 /* deep science */ + mpz_add_ui (p, p, difference); + difference = 0; + tests += 1; - for (difference = incr = 0; incr < INCR_LIMIT; difference += 2) - { - /* First check residues */ - prime = 3; - for (i = 0; i < prime_limit; i++) - { - unsigned r; - /* FIXME: Reduce moduli + incr and store back, to allow for - division-free reductions. Alternatively, table primes[]'s - inverses (mod 2^16). */ - r = (moduli[i] + incr) % prime; - prime += primegap[i]; + /* Miller-Rabin test */ + if (mpz_millerrabin (p, 25)) + goto done; + } - if (r == 0) - goto next; - } - - mpz_add_ui (p, p, difference); - difference = 0; - - /* Miller-Rabin test */ - if (mpz_millerrabin (p, 25)) - goto done; - next:; - incr += 2; - } - mpz_add_ui (p, p, difference); - difference = 0; + /* Sieve next segment */ + two_prime = 6; + memset(sieve, 0, INCR_LIMIT); + for (i = 0; i < prime_limit; i++) + { + m = next_mult[i] - INCR_LIMIT; + for (; m < INCR_LIMIT; m += two_prime) + sieve[m] = 1; + next_mult[i] = m; + two_prime += 2 * primegap[i]; + } } done: + // Temp code for benchmarking. + if (print) { + gmp_printf("=> %d\n", tests); + } + TMP_FREE; TMP_SFREE; } + +#undef LOOP_ON_SIEVE_END +#undef LOOP_ON_SIEVE_STOP +#undef LOOP_ON_SIEVE_BEGIN +#undef LOOP_ON_SIEVE_CONTINUE
diff -r bcca14c8a090 mpz/nextprime.c --- a/mpz/nextprime.c Thu Mar 05 00:43:26 2020 +0100 +++ b/mpz/nextprime.c Sun Mar 08 18:36:12 2020 -0700 @@ -30,10 +30,63 @@ GNU Lesser General Public License along with the GNU MP Library. If not, see https://www.gnu.org/licenses/. */ +#include <string.h> + #include "gmp-impl.h" #include "longlong.h" -static const unsigned char primegap[] = +/*********************************************************/ +/* Section sieve: sieving functions and tools for primes */ +/*********************************************************/ + +static mp_limb_t +id_to_n (mp_limb_t id) { return id*3+1+(id&1); } + +static mp_limb_t +n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; } + +static mp_size_t +primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; } + + +/*************************************************************/ +/* Section macros: common macros, for swing/fac/bin (&sieve) */ +/*************************************************************/ + +#define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) \ + __max_i = (end); \ + \ + do { \ + ++__i; \ + if (((sieve)[__index] & __mask) == 0) \ + { \ + mp_limb_t prime; \ + prime = id_to_n(__i) + +#define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve) \ + do { \ + mp_limb_t __mask, __index, __max_i, __i; \ + \ + __i = (start)-(off); \ + __index = __i / GMP_LIMB_BITS; \ + __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS); \ + __i += (off); \ + \ + LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) + +#define LOOP_ON_SIEVE_STOP \ + } \ + __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1); \ + __index += __mask & 1; \ + } while (__i <= __max_i) + +#define LOOP_ON_SIEVE_END \ + LOOP_ON_SIEVE_STOP; \ + } while (0) + + + +static const unsigned char primegap_small[] = { 2,2,4,2,4,2,4,6,2,6,4,2,4,6,6,2,6,4,2,6,4,6,8,4,2,4,2,4,14,4,6, 2,10,2,6,6,4,6,6,2,10,2,4,2,12,12,4,2,4,6,2,10,6,6,6,2,6,4,2,10,14,4,2, @@ -48,15 +101,17 @@ void mpz_nextprime (mpz_ptr p, mpz_srcptr n) { - unsigned short *moduli; - unsigned long difference; - int i; + unsigned int *next_mult; + char *sieve; + const unsigned char *primegap; unsigned prime_limit; unsigned long prime; + unsigned two_prime; mp_size_t pn; mp_bitcnt_t nbits; unsigned incr; - TMP_SDECL; + unsigned long difference; + int i, m; /* First handle tiny numbers */ if (mpz_cmp_ui (n, 2) < 0) @@ -70,59 +125,129 @@ if (mpz_cmp_ui (p, 7) <= 0) return; + TMP_DECL; + TMP_SDECL; + + // Question: Is this needed. + TMP_MARK; + TMP_SMARK; + pn = SIZ(p); MPN_SIZEINBASE_2EXP(nbits, PTR(p), pn, 1); - if (nbits / 2 >= NUMBER_OF_PRIMES) - prime_limit = NUMBER_OF_PRIMES - 1; + if (nbits <= 199) + { + prime_limit = nbits / 2; + primegap = primegap_small; + } else - prime_limit = nbits / 2; + { + // Estimates a better sieve bound. Based on derivative of + // Merten's 3rd theorem * avg gap * cost of mod + // vs + // Cost of PRP test O(N^2.55) + + mp_limb_t *sieve; + unsigned char *primegap_tmp; + int prime_limit_pre; + long sieve_limit; + int last_prime; + mpz_t tmp; + + // 3.06e-4 * nbits ^ 2.86 ~= nbits ^ (20/7) / 1880 + mpz_init (tmp); + mpz_ui_pow_ui (tmp, nbits, 20); + mpz_root (tmp, tmp, 7); + mpz_tdiv_q_ui(tmp, tmp, 3268); + + /* Avoid having a primegap > 255, first occurence: 436,273,009 + * For really large numbers (50,000 bits) limitting to 436M is + * 1-4% slower with reduced memory, code complexity. + */ + if (mpz_cmp_ui(tmp, 430000000) <= 0) + sieve_limit = mpz_get_ui(tmp); + else + sieve_limit = 430000000; + + mpz_clear (tmp); + ASSERT( 1000 < sieve_limit && sieve_limit < 436000000 ); - TMP_SMARK; + /* sieve numbers up to sieve_limit and save prime count */ + sieve = TMP_ALLOC_LIMBS (primesieve_size (sieve_limit)); + prime_limit_pre = gmp_primesieve(sieve, sieve_limit); + primegap_tmp = TMP_ALLOC_TYPE (prime_limit_pre, unsigned char); + primegap = primegap_tmp; + + prime_limit = 0; + last_prime = 3; + LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), n_to_bit (sieve_limit), 0, sieve); + primegap_tmp[prime_limit++] = prime - last_prime; + last_prime = prime; + LOOP_ON_SIEVE_END; + + /* Both primesieve and prime_limit ignore the first two primes. */ + ASSERT(prime_limit == prime_limit_pre); + } + + /* Compute next multiple for small odd primes */ + next_mult = TMP_ALLOC_TYPE (prime_limit, unsigned int); - /* Compute residues modulo small odd primes */ - moduli = TMP_SALLOC_TYPE (prime_limit, unsigned short); + // TODO set this based on nbits. + #define INCR_LIMIT 0x400 + sieve = TMP_SALLOC_TYPE (INCR_LIMIT, char); + memset(sieve, 0, INCR_LIMIT); + /* Invert p for modulo */ + mpz_neg(p, p); + prime = 3; + for (i = 0; i < prime_limit; i++) + { + m = mpz_fdiv_ui(p, prime); + /* Only care about odd multiplies (even indexes) */ + if (m & 1) + m += prime; + + /* mark off any composites in sieve */ + for (; m < INCR_LIMIT; m += 2*prime) + sieve[m] = 1; + next_mult[i] = m; + prime += primegap[i]; + } + mpz_neg(p, p); + + difference = 0; for (;;) { - /* FIXME: Compute lazily? */ - prime = 3; - for (i = 0; i < prime_limit; i++) - { - moduli[i] = mpz_tdiv_ui (p, prime); - prime += primegap[i]; - } + for (incr = 0; incr < INCR_LIMIT; difference += 2, incr += 2) + { + if (sieve[incr] == 1) + continue; -#define INCR_LIMIT 0x10000 /* deep science */ + mpz_add_ui (p, p, difference); + difference = 0; - for (difference = incr = 0; incr < INCR_LIMIT; difference += 2) - { - /* First check residues */ - prime = 3; - for (i = 0; i < prime_limit; i++) - { - unsigned r; - /* FIXME: Reduce moduli + incr and store back, to allow for - division-free reductions. Alternatively, table primes[]'s - inverses (mod 2^16). */ - r = (moduli[i] + incr) % prime; - prime += primegap[i]; + /* Miller-Rabin test */ + if (mpz_millerrabin (p, 25)) + goto done; + } - if (r == 0) - goto next; - } - - mpz_add_ui (p, p, difference); - difference = 0; - - /* Miller-Rabin test */ - if (mpz_millerrabin (p, 25)) - goto done; - next:; - incr += 2; - } - mpz_add_ui (p, p, difference); - difference = 0; + /* Sieve next segment */ + two_prime = 6; + memset(sieve, 0, INCR_LIMIT); + for (i = 0; i < prime_limit; i++) + { + m = next_mult[i] - INCR_LIMIT; + for (; m < INCR_LIMIT; m += two_prime) + sieve[m] = 1; + next_mult[i] = m; + two_prime += 2 * primegap[i]; + } } done: + TMP_FREE; TMP_SFREE; } + +#undef LOOP_ON_SIEVE_END +#undef LOOP_ON_SIEVE_STOP +#undef LOOP_ON_SIEVE_BEGIN +#undef LOOP_ON_SIEVE_CONTINUE
_______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org https://gmplib.org/mailman/listinfo/gmp-devel