I tried using a segmented sieve. this avoids multiplication or mod in the inner loop. It's an improvement and the code is easy to comprehend.
$ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime -p mpz_nextprime $ hg import ~/Downloads/nextprime_segment.patch --no-commit $ make -j4; make -C tune/ speed -j4 $ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime -P mpz_nextprime2 $ pr -m -t mpz_nextprime.data mpz_nextprime2.data | awk '{ print $0,"\t"$4/$2 }' 100 3.658593750e-05 100 3.326367187e-05 0.909193 110 3.904296875e-05 110 3.598828125e-05 0.921761 121 4.675585937e-05 121 4.157031250e-05 0.889093 133 8.379882812e-05 133 7.977343750e-05 0.951964 146 9.650195312e-05 146 8.803710937e-05 0.912283 160 1.245546875e-04 160 1.032539062e-04 0.828985 176 1.269101562e-04 176 1.173007813e-04 0.924282 193 1.977187500e-04 193 1.846230469e-04 0.933766 212 2.087812500e-04 212 1.933378906e-04 0.926031 233 2.356660156e-04 233 2.218164063e-04 0.941232 256 3.030214844e-04 256 2.868691406e-04 0.946696 281 4.203593750e-04 281 3.908398437e-04 0.929775 500 2.132431641e-03 500 2.013937500e-03 0.944432 1000 2.307201758e-02 1000 2.284832031e-02 0.990304 This can undoubtedly benefit from a larger sieve also, which I'm testing in a different patch. On Thu, Mar 5, 2020 at 6:05 PM Seth Troisi <brain...@gmail.com> wrote: > I tried using a segmented sieve. this avoids multiplication or mod in the > inner loop. > It's an improvement and the code is easy to comprehend. > > > > > On Wed, Feb 12, 2020 at 10:25 AM Niels Möller <ni...@lysator.liu.se> > wrote: > >> "Marco Bodrato" <bodr...@mail.dm.unipi.it> writes: >> >> > Ciao, >> > >> > Il Mer, 12 Febbraio 2020 7:26 am, Niels Möller ha scritto: >> >> And for a small prime gap (common case), this cost should be the main >> >> part of the sieving. So it would make sense to optimize it. If we also >> >> use the ptab, we could compute modulo single-limb prime products using >> >> precomputed constants. >> > >> > Yes, of course. >> >> Another version below, using ptab. Timing: >> >> $ ./tune/speed -s 100-300,500,1000 -f 1.1 mpz_nextprime >> mpz_nextprime >> 100 0.000116762 >> 110 0.000126770 >> 121 0.000147203 >> 133 0.000257994 >> 146 0.000291051 >> 160 0.000343168 >> 176 0.000386605 >> 193 0.000625994 >> 212 0.000670035 >> 233 0.000763240 >> 256 0.000942154 >> 281 0.001418639 >> 500 0.006632588 >> 1000 0.073935992 >> >> So also close to 15% speedup for 100 bits, and the same as previous >> version, almost 25%, for 1000 bits. >> >> I used the very crude tuning >> >> limit = (nbits < 200) ? nbits / 2 : PTAB_SIZE; >> >> to avoid regressing for smaller sizes. >> >> > And the next question will be: should we generate on the fly an even >> > larger table of primes when the required size grows beyond the >> > precomputed table? >> >> Don't know. I don't quite understand how large a table could be >> beneficial, but I guess it might make sense to at least limit it to size >> of L1 cache. >> >> Regards, >> /Niels >> >> /* mpz_nextprime(p,t) - compute the next prime > t and store that in p. >> >> Copyright 1999-2001, 2008, 2009, 2012 Free Software Foundation, Inc. >> >> Contributed to the GNU project by Niels Möller and Torbjorn Granlund. >> >> This file is part of the GNU MP Library. >> >> The GNU MP Library is free software; you can redistribute it and/or modify >> it under the terms of either: >> >> * the GNU Lesser General Public License as published by the Free >> Software Foundation; either version 3 of the License, or (at your >> option) any later version. >> >> or >> >> * the GNU General Public License as published by the Free Software >> Foundation; either version 2 of the License, or (at your option) any >> later version. >> >> or both in parallel, as here. >> >> The GNU MP Library is distributed in the hope that it will be useful, but >> WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY >> or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License >> for more details. >> >> You should have received copies of the GNU General Public License and the >> GNU Lesser General Public License along with the GNU MP Library. If not, >> see https://www.gnu.org/licenses/. */ >> >> #include "gmp-impl.h" >> #include "longlong.h" >> >> #if 1 >> struct gmp_primes_dtab { >> mp_limb_t p; >> mp_limb_t binv; >> mp_limb_t lim; >> }; >> >> struct gmp_primes_ptab { >> mp_limb_t ppp; /* primes, multiplied together */ >> mp_limb_t cps[7]; /* ppp values pre-computed for mpn_mod_1s_4p */ >> gmp_uint_least32_t idx:24; /* index of first primes in dtab */ >> gmp_uint_least32_t np :8; /* number of primes related to this entry >> */ >> }; >> >> static const struct gmp_primes_dtab dtab[] = >> { >> #define WANT_dtab >> #define P(p,inv,lim) {p, inv,lim} >> #include "trialdivtab.h" >> #undef WANT_dtab >> #undef P >> }; >> >> #define DTAB_SIZE (sizeof (dtab) / sizeof (dtab[0])) >> >> static const struct gmp_primes_ptab ptab[] = >> { >> #define WANT_ptab >> #include "trialdivtab.h" >> #undef WANT_ptab >> }; >> >> #define PTAB_SIZE (sizeof (ptab) / sizeof (ptab[0])) >> >> void >> mpz_nextprime (mpz_ptr p, mpz_srcptr n) >> { >> mp_limb_t *moduli; >> unsigned long difference; >> int i; >> unsigned prime_limit; >> mp_bitcnt_t nbits; >> unsigned incr; >> TMP_DECL; >> >> /* First handle tiny numbers */ >> if (mpz_cmp_ui (n, 2) < 0) >> { >> mpz_set_ui (p, 2); >> return; >> } >> mpz_add_ui (p, n, 1); >> mpz_setbit (p, 0); >> >> if (mpz_cmp_ui (p, 7) <= 0) >> return; >> >> MPN_SIZEINBASE_2EXP(nbits, PTR(p), SIZ(p), 1); >> >> if (nbits < 200) >> { >> prime_limit = nbits / 2; >> if (SIZ(p) == 1) >> { >> /* If the prime list includes any prime >= p, do plain >> search. */ >> mp_limb_t limb = PTR(p)[0]; >> unsigned last = ptab[prime_limit-1].idx + >> ptab[prime_limit-1].np - 1; >> if (dtab[last].p >= limb) >> { >> while (dtab[last-1].p >= limb) >> last--; >> PTR(p)[0] = dtab[last].p; >> return; >> } >> } >> } >> else >> prime_limit = PTAB_SIZE; >> >> TMP_MARK; >> >> /* Compute residues modulo small odd primes */ >> moduli = TMP_ALLOC_TYPE (prime_limit, mp_limb_t); >> >> for (;;) >> { >> for (i = 0; i < prime_limit; i++) >> { >> mp_limb_t ppp = ptab[i].ppp; >> const mp_limb_t *cps = ptab[i].cps; >> moduli[i] = mpn_mod_1s_4p (PTR(p), SIZ(p), ppp << cps[1], cps); >> } >> >> #define INCR_LIMIT 0x10000 /* deep science */ >> >> for (difference = incr = 0; incr < INCR_LIMIT; difference += 2) >> { >> /* First check divisibility based on prime list */ >> for (i = 0; i < prime_limit; i++) >> { >> /* FIXME: Ensure that this addition can't overflow */ >> mp_limb_t m = moduli[i] + incr; >> int idx = ptab[i].idx; >> int np = ptab[i].np; >> int j; >> >> for (j = 0; j < np; j++) >> if (m * dtab[idx+j].binv <= dtab[idx+j].lim) >> goto next; >> } >> >> mpz_add_ui (p, p, difference); >> difference = 0; >> >> /* Miller-Rabin test */ >> if (mpz_millerrabin (p, 25)) >> goto done; >> next:; >> incr += 2; >> } >> mpz_add_ui (p, p, difference); >> difference = 0; >> } >> done: >> TMP_FREE; >> } >> #else >> static const unsigned char primegap[] = >> { >> 2,2,4,2,4,2,4,6,2,6,4,2,4,6,6,2,6,4,2,6,4,6,8,4,2,4,2,4,14,4,6, >> 2,10,2,6,6,4,6,6,2,10,2,4,2,12,12,4,2,4,6,2,10,6,6,6,2,6,4,2,10,14,4,2, >> 4,14,6,10,2,4,6,8,6,6,4,6,8,4,8,10,2,10,2,6,4,6,8,4,2,4,12,8,4,8,4,6, >> >> 12,2,18,6,10,6,6,2,6,10,6,6,2,6,6,4,2,12,10,2,4,6,6,2,12,4,6,8,10,8,10,8, >> >> 6,6,4,8,6,4,8,4,14,10,12,2,10,2,4,2,10,14,4,2,4,14,4,2,4,20,4,8,10,8,4,6, >> 6,14,4,6,6,8,6,12 >> }; >> >> #define NUMBER_OF_PRIMES 167 >> >> void >> mpz_nextprime (mpz_ptr p, mpz_srcptr n) >> { >> unsigned short *moduli; >> unsigned long difference; >> int i; >> unsigned prime_limit; >> unsigned long prime; >> mp_size_t pn; >> mp_bitcnt_t nbits; >> unsigned incr; >> TMP_SDECL; >> >> /* First handle tiny numbers */ >> if (mpz_cmp_ui (n, 2) < 0) >> { >> mpz_set_ui (p, 2); >> return; >> } >> mpz_add_ui (p, n, 1); >> mpz_setbit (p, 0); >> >> if (mpz_cmp_ui (p, 7) <= 0) >> return; >> >> pn = SIZ(p); >> MPN_SIZEINBASE_2EXP(nbits, PTR(p), pn, 1); >> if (nbits / 2 >= NUMBER_OF_PRIMES) >> prime_limit = NUMBER_OF_PRIMES - 1; >> else >> prime_limit = nbits / 2; >> >> TMP_SMARK; >> >> /* Compute residues modulo small odd primes */ >> moduli = TMP_SALLOC_TYPE (prime_limit, unsigned short); >> >> for (;;) >> { >> /* FIXME: Compute lazily? */ >> prime = 3; >> for (i = 0; i < prime_limit; i++) >> { >> moduli[i] = mpz_tdiv_ui (p, prime); >> prime += primegap[i]; >> } >> >> #define INCR_LIMIT 0x10000 /* deep science */ >> >> for (difference = incr = 0; incr < INCR_LIMIT; difference += 2) >> { >> /* First check residues */ >> prime = 3; >> for (i = 0; i < prime_limit; i++) >> { >> unsigned r; >> /* FIXME: Reduce moduli + incr and store back, to allow for >> division-free reductions. Alternatively, table >> primes[]'s >> inverses (mod 2^16). */ >> r = (moduli[i] + incr) % prime; >> prime += primegap[i]; >> >> if (r == 0) >> goto next; >> } >> >> mpz_add_ui (p, p, difference); >> difference = 0; >> >> /* Miller-Rabin test */ >> if (mpz_millerrabin (p, 25)) >> goto done; >> next:; >> incr += 2; >> } >> mpz_add_ui (p, p, difference); >> difference = 0; >> } >> done: >> TMP_SFREE; >> } >> >> #endif >> >> -- >> Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677. >> Internet email is subject to wholesale government surveillance. >> >
diff -r bcca14c8a090 mpz/nextprime.c --- a/mpz/nextprime.c Thu Mar 05 00:43:26 2020 +0100 +++ b/mpz/nextprime.c Fri Mar 06 13:55:09 2020 -0800 @@ -30,6 +30,8 @@ GNU Lesser General Public License along with the GNU MP Library. If not, see https://www.gnu.org/licenses/. */ +#include <string.h> + #include "gmp-impl.h" #include "longlong.h" @@ -48,11 +50,13 @@ void mpz_nextprime (mpz_ptr p, mpz_srcptr n) { - unsigned short *moduli; + unsigned short *next_mult; + char *sieve; unsigned long difference; - int i; + int i, m; unsigned prime_limit; unsigned long prime; + unsigned two_prime; mp_size_t pn; mp_bitcnt_t nbits; unsigned incr; @@ -79,49 +83,59 @@ TMP_SMARK; - /* Compute residues modulo small odd primes */ - moduli = TMP_SALLOC_TYPE (prime_limit, unsigned short); + /* Compute next multiple for small odd primes */ + next_mult = TMP_SALLOC_TYPE (prime_limit, unsigned short); + + #define INCR_LIMIT 0x400 + sieve = TMP_SALLOC_TYPE (INCR_LIMIT, char); + memset(sieve, 0, INCR_LIMIT); + /* Invert p for modulo */ + mpz_neg(p, p); + prime = 3; + for (i = 0; i < prime_limit; i++) + { + m = mpz_fdiv_ui(p, prime); + /* Only care about odd multiplies */ + if (m & 1) + m += prime; + + /* mark off any composites in sieve */ + for (; m < INCR_LIMIT; m += 2*prime) + sieve[m] = 1; + next_mult[i] = m; + prime += primegap[i]; + } + mpz_neg(p, p); + + difference = 0; for (;;) { - /* FIXME: Compute lazily? */ - prime = 3; - for (i = 0; i < prime_limit; i++) - { - moduli[i] = mpz_tdiv_ui (p, prime); - prime += primegap[i]; - } - -#define INCR_LIMIT 0x10000 /* deep science */ - - for (difference = incr = 0; incr < INCR_LIMIT; difference += 2) + for (incr = 0; incr < INCR_LIMIT; difference += 2, incr += 2) { - /* First check residues */ - prime = 3; - for (i = 0; i < prime_limit; i++) - { - unsigned r; - /* FIXME: Reduce moduli + incr and store back, to allow for - division-free reductions. Alternatively, table primes[]'s - inverses (mod 2^16). */ - r = (moduli[i] + incr) % prime; - prime += primegap[i]; - - if (r == 0) - goto next; - } + if (sieve[incr] == 1) + continue; mpz_add_ui (p, p, difference); difference = 0; /* Miller-Rabin test */ - if (mpz_millerrabin (p, 25)) + if (mpz_millerrabin (p, 25)) { goto done; - next:; - incr += 2; + } } - mpz_add_ui (p, p, difference); - difference = 0; + + /* Sieve next segment */ + two_prime = 6; + memset(sieve, 0, INCR_LIMIT); + for (i = 0; i < prime_limit; i++) + { + m = next_mult[i] - INCR_LIMIT; + for (; m < INCR_LIMIT; m += two_prime) + sieve[m] = 1; + next_mult[i] = m; + two_prime += 2 * primegap[i]; + } } done: TMP_SFREE;
_______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org https://gmplib.org/mailman/listinfo/gmp-devel