`New and Improved` (current) version, with significantly reduced memory footprint as numbers get larger.
Now I create|initialize `nextp` arrays of 1st prime multiples in each thread, which gets gc'd (garbage collected) at end of thread, thus using a much lower constant runtime memory. (I could also generate|use the segment memory on a per thread basis too, but at the end I need full last segment memory to find last twinprime and correct count.) For this version I need to compile using gc, so compile as below: $ nim c --cc:gcc --d:release --threads:on twinprimes_ssoz.nim Hey @mratsim, I started converting code to C++, and hit a roadblock in passing multiple outputs from a function to multiple inputs, needed to do `const paramterspxx = genPGparameters(xx)` and in `selectPG`. And not sure how to compile to deallocate memory from a thread, as C++ doesn't use GC. Also, the information you provided on using OpenMP is way over my head. Do you feel like trying to do an OpenMP implementation in Nim? Anyway, here's the updated gist file, and source code: [https://gist.github.com/jzakiya/6c7e1868bd749a6b1add62e3e3b2341e](https://gist.github.com/jzakiya/6c7e1868bd749a6b1add62e3e3b2341e) #[ This Nim source file is a multple threaded implementation to perform an extremely fast Segmented Sieve of Zakiya (SSoZ) to find Twin Primes <= N. Based on the size of input value N, it dynamically selects the best PG to use from PGs P5, P7, P11, and P13, and then sets the optimum segment size. This code was developed on a System76 laptop with an Intel I7 6700HQ cpu, 2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. I suspect parameter tuning may have to be done on other hadware systems (ARM, PowerPC, etc) to achieve optimum performance on them. It was tested on various Linux 64 bit distros, native and in Virtual Box, using 8 or 4 threads, or 16|4GB of mem. At time of writing, verified using Nim: 0.17.2, 0.18.0 The code was compiled using these compiler directives|flags. Not usng GC produces smaller executable, and maybe little faster. Try w/wo to see difference on your system. For optimum performance use gcc over clang. $ nim c --cc:gcc --d:release --threads:on twinprimes_ssoz.nim Then run executable: $ ./twinprimes_ssoz <cr>, and enter value for N. As coded, input values cover the range: 13 -- 2^64-1 This source file, and updates, will be available here: https://gist.github.com/jzakiya/6c7e1868bd749a6b1add62e3e3b2341e Related code, papers, and tutorials, are available here: https://www.scribd.com/doc/228155369/The-Segmented-Sieve-of-Zakiya-SSoZ https://www.scribd.com/document/266461408/Primes-Utils-Handbook Use of this code is free subject to acknowledgment of copyright. Copyright (c) 2017 Jabari Zakiya -- jzakiya at gmail dot com Version Date: 2018/04/03 This code is provided under the terms of the GNU General Public License Version 3, GPLv3, or greater. License copy/terms are here: http://www.gnu.org/licenses/ ]# import math # for sqrt, gcd, mod functions import strutils, typetraits # for number input import times, os # for timing code execution import osproc # for getting threads count import threadpool # for parallel processing {.experimental.} # required to use 'parallel' (<= 0.18.x) # Compute modular inverse a^(-1) of a to base b, e.g. a*(a^-1) mod b = 1. proc modinv(a0, b0: int): int = var (a, b, x0) = (a0, b0, 0) result = 1 if b == 1: return while a > 1: let q = a div b a = a mod b swap a, b result -= q * x0 swap x0, result if result < 0: result += b0 # Create constant parameters for given PG at compile time. proc genPGparameters(prime: int): (int, int, int, seq[int], seq[int], seq[int]) = echo("generating parameters for P", prime) let primes = [2, 3, 5, 7, 11, 13, 17, 19, 23] var modpg = 1 for prm in primes: (modpg *= prm; if prm == prime: break) # PG's mdoulus var residues: seq[int] = @[] # generate PG's residue values here var (pc, inc) = (1, 4) while pc < modpg: (pc += inc; inc = inc xor 0b110; if gcd(modpg, pc) == 1: residues.add(pc)) let rescnt = residues.len # PG's residues count var restwins: seq[int] = @[] # extract twinpair residues here var j = 0 while j < rescnt-1: if residues[j]+2 == residues[j+1]: restwins.add residues[j]; restwins.add residues[j+1]; j += 1 j += 1 let rescntp = restwins.len # twinpair residues count var inverses: seq[int] = @[] # create PG's residues inverses here for res in residues: inverses.add(modinv(res,modpg)) result = (modpg, rescnt, rescntp, residues, restwins, inverses) # Generate at compile time the parameters for PGs P5-P13. const parametersp5 = genPGparameters(5) const parametersp7 = genPGparameters(7) const parametersp11 = genPGparameters(11) const parametersp13 = genPGparameters(13) # Global parameters var pcnt = 0 # number of primes from r1..sqrt(N) num = 0'u64 # adjusted (odd) input value twinscnt = 0'u64 # number of twinprimes <= N primes: seq[int] # list of primes r1..sqrt(N) seg: seq[uint8] # segment byte array to perform ssoz KB = 0 # segment size for each seg restrack cnts: seq[uint] # hold twinprime counts for seg bytes pos: seq[int] # convert residue val to its residues index val # faster than `residues.find(residue)` modpg: int # PG's modulus value rescnt: int # PG's residues count rescntp: int # PG's twinpairs residues count residues: seq[int] # PG's list of residues restwins: seq[int] # PG's list of twinpair residues resinvrs: seq[int] # PG's list of residues inverses Bn: int # segment size factor for PG and input number # Select at runtime best PG and segment size factor to use for input value. # These are good estimates derived from PG data profiling. Can be improved. proc selectPG(num: uint) = if num < 10_000_000: (modpg, rescnt, rescntp, residues, restwins, resinvrs) = parametersp5 Bn = 16 elif num < 1_100_000_000'u: (modpg, rescnt, rescntp, residues, restwins, resinvrs) = parametersp7 Bn = 32 elif num < 35_500_000_000'u: (modpg, rescnt, rescntp, residues, restwins, resinvrs) = parametersp11 Bn = 64 else: (modpg, rescnt, rescntp, residues, restwins, resinvrs) = parametersp13 if num > 7_000_000_000_000'u: Bn = 384 elif num > 2_500_000_000_000'u: Bn = 320 elif num > 250_000_000_000'u: Bn = 196 else: Bn = 96 cnts = newSeq[uint](rescntp div 2) # twinprime sums for seg bytes pos = newSeq[int](modpg) # create modpg size array to for i in 0..rescnt-1: pos[residues[i]-2] = i # convert residue val -> indx # Compute the list of primes r1..sqrt(input_num), and store in global # 'primes' array, and store their count in global var 'pcnt'. # Any algorithm (fast|small) is usable. Here the SoZ for the PG is used. proc sozpg(val: int | int64) = # Use PG to finds primes upto val let md = modpg # PG's modulus value let rscnt = rescnt # PG's residue count let res = residues # PG's residues list let num = (val-1) or 1 # if val even then subtract 1 var k = num div md # compute its residue group value var modk = md * k # compute the resgroup base value var r = 0 # from 1st residue in num's resgroup while num >= modk+res[r]: r += 1 # find last pc val|position <= num let maxpcs = k * rscnt + r # max number of prime candidates <= num var prms = newSeq[bool](maxpcs) # array of prime candidates set False let sqrtN =int(sqrt float64(num)) # compute integer sqrt of input num modk = 0; r = -1; k = 0 # initialize sieve parameters # mark the multiples of the primes r1..sqrtN in 'prms' for prm in prms: # from list of pc positions in prms r += 1; if r == rscnt: (r = 0; modk += md; k += 1) if prm: continue # if pc not prime go to next location let prm_r = res[r] # if prime save its residue value let prime = modk + prm_r # numerate the prime value if prime > sqrtN: break # we're finished if it's > sqrtN let prmstep = prime * rscnt # prime's stepsize to mark its mults for ri in res: # mark prime's multiples in prms let prod = prm_r * ri - 2 # compute cross-product for r|ri pair # compute resgroup val of 1st prime multiple for each restrack # starting there, mark all prime multiples on restrack upto end of prms var prm_mult = (k*(prime + ri) + prod div md)*rscnt + pos[prod mod md] while prm_mult < maxpcs: prms[prm_mult] = true; prm_mult += prmstep # prms now contains the nonprime positions for the prime candidates r1..N # extract primes into global var 'primes' and count into global var 'pcnt' primes = @[] # create empty dynamic array for primes modk = 0; r = -1 # initialize loop parameters for prm in prms: # numerate|store primes from pcs list r += 1; if r == rscnt: (r = 0; modk += md) if not prm: primes.add(modk + res[r]) # put prime in global 'primes' list pcnt = primes.len # set global count of primes # Print twinprimes for given twinpair for given segment slice. # Primes will not be displayed in sorted order, collect|sort later for that. proc printprms(Kn: int, Ki: uint64, indx: int)= # starting at seg resgroup Ki var modk = Ki * modpg.uint # compute its base num let s_row = indx * KB # set seg row index for given twinpair let res = restwins[indx * 2 + 1] # get upper twinpair residue value for k in 0..Kn-1: # then for each of Kn seg resgroups if int(seg[s_row + k]) == 0: # if both lsb bits in byte are prime if modk + res.uint <= num: # and upper twinprime <= num echo(modk+res.uint-1) # print one twinprime mid val per line modk += modpg.uint # For 'nextp' array for given twinpair thread, for twinpair restracks 'i|i+1', # init each col w/1st prime multiple resgroup, for the primes r1..sqrt(N). proc resinit(i: int, nextp: seq[uint64]): seq[uint64] = var nextp = nextp # 1st mults array for this twinpair for indx in 0..1: # for both twinpair residues let row = indx * pcnt # along each restrack row in 'nextp' let res = restwins[i + indx] # for this twinpair residue for j, prime in primes: # for each primes r1..sqrt(N) let k = (prime-2) div modpg # find the resgroup it's in let r = (prime-2) mod modpg + 2 # and its residue value let ri = (res*resinvrs[pos[r-2]]-2) mod modpg + 2 # compute the ri for r let prod = r * ri - 2 # compute residues cross-product # compute|store 1st prime mult resgroup at col j for prime, for 'res' nextp[row + j] = uint(k*(prime + ri) + prod div modpg) result = nextp # Perform in a thread, the ssoz for a given twinpair, along its seg byte row, # for Kmax resgroups, and max segsize of Ks regroups, for twinpairs at 'indx'. # First create|init 'nextp' array of 1st prime mults for given twinpair, which # at end of thread will be gc'd. For sieve, set 2 lsbs in seg byte to '1' for # primes mults resgroups and update 'nextp' restrack seg slices acccordingly. # Then compute twinprimes count for segment, store in 'cnts' array for row. # Can optionally compile to print mid twinprime values generated by twinpair. proc twins_sieve(Kmax: uint, indx, Ks: int) {.gcsafe.} = {.gcsafe.}: let i = indx * 2 # lower lsb seg row index for twinpair var nextp = newSeq[uint64](pcnt * 2) # create 1st mults array for twinpair nextp = resinit(i, nextp) # init w/1st prime mults for twinpair var sum = 0'u # init primes cnt for this seg byte row var Ki = 0'u # 1st resgroup seg val for each slice let s_row = indx * KB # set seg row address for this twinpair while Ki < Kmax: # for Ks resgroup size slices upto Kmax let Kn = min(Ks, int(Kmax-Ki)) # set segment slice resgroup length for b in s_row..s_row+KB-1: seg[b]=0 # set all seg restrack bits to prime for r in 0..1: # for 2 lsbs for twinpair bits in byte let row = r * pcnt # set address to its restrack in 'nextp' let biti = uint8(1 shl r) # set residue track bit mask for j in 0..pcnt-1: # for each prime index r1..sqrt(N) if nextp[row + j] < Kn.uint: # if 1st mult resgroup is within 'seg' var k = int(nextp[row + j]) # starting from this resgroup in 'seg' let prime = primes[j] # for this prime while k < Kn: # for each primenth byte to end of 'seg' seg[s_row+k] = seg[s_row+k] or biti # mark restrack bit nonprime k += prime # compute next prime multiple resgroup nextp[row + j] = uint(k-Kn) # save 1st resgroup in next eligible seg else: nextp[row+j] -= Kn.uint # do if 1st mult resgroup not within seg for k in 0..Kn-1: (if seg[s_row+k] == 0: sum += 1) # sum bytes with twins #printprms(Kn, Ki, indx) # display twinprimes for this twinpair Ki += Ks.uint # set 1st resgroup val of next seg slice cnts[indx] = sum # save twins count for this seg twinpair # Perform sieve on twinpair residues for segment of Kn resgroups in parallel. # Then add seg twinprimes count for each seg tp restrack total to 'twinscnt'. proc segsieve(Kmax: uint) = # perform sieve on all segments let Ks = KB # make default seg size immutable let twinpairs = rescntp div 2 # for number of seg twinpair byte rows parallel: # perform in parallel for indx in 0..twinpairs-1: # for nextp|seg twinpair row indexes spawn twins_sieve(Kmax,indx,Ks)# sieve the selected twinpair restracks sync() # then count the twinprimes in the seg for sum in cnts: twinscnt += sum # update 'twinscnt' w/seg twinprime cnts proc twinprimes_ssoz() = # Use selected Pn prime generator for SSoZ stdout.write "Enter integer number: " let val = stdin.readline.parseBiggestUInt # find primes <= val (13..2^64-1) echo("threads = ", countProcessors()) let ts = epochTime() # start timing sieve setup execution num = uint64((val-1) or 1) # if val even subtract 1 selectPG(num.uint) # select PG and seg factor for input number let modpg = modpg.uint # to reduce casting hell noise var k = num div modpg # compute its resgroup var modk = modpg * k.uint # compute its base num let Kmax = (num-2) div modpg + 1 # maximum number of resgroups for num let B = Bn * 1024 # set seg size to optimize for selected PG KB = if Kmax.int < B: (Kmax.int + 1) else: B # min num of segment resgroups seg = newSeq[uint8]((rescntp div 2)*KB) # create [twinpairs x KB] bytes array echo("segment is [", (rescntp div 2), " x ", KB, "] bytes array") # This is not necessary for running the program but provides information # to determine the 'efficiency' of the used PG: (num of primes)/(num of pcs) # The closer the ratio is to '1' the higher the PG's 'efficiency'. var r = 0 # starting with first residue while num.uint >= modk+restwins[r].uint: r += 1 # find last tp index <= num let maxpcs = k*rescntp.uint + r.uint # maximum number of twinprime pcs let maxpairs = maxpcs div 2 # maximum number of twinprime candidates echo("twinprime candidates = ", maxpairs, "; resgroups = ", Kmax) let sqrtN=int(sqrt float64(num)) # compute integer sqrt of input num sozpg(sqrtN) # compute pcnt and primes <= sqrt(num) let te = (epochTime()-ts) # sieve setup time echo("setup time = ", te.formatFloat(ffDecimal, 3), " secs") twinscnt = if modpg > 210'u: 3 else: 2 # for 1st tps (3,5)|(5,7)|(11,13) echo("perform twinprimes ssoz sieve") # start doing ssoz now let t1 = epochTime() # start timing ssoz sieve execution segsieve(Kmax.uint) # perform total twinprime ssoz sieve let Kn = min(KB, (Kmax.int mod KB)) # set number of resgroups in last slice var ltwin = 0'u64 # to store last twinprime value <= num modk = (Kmax-1).uint * modpg # set mod for last resgroup in last segment k = uint(Kn-1) # set val for last resgroup in last segment let lasti = rescntp div 2 - 1 # set val for last seg twinpair row index r = lasti # starting from last resgroup twinpair byte while true: # step backwards from end of last resgroup var row_i = r * KB # set 'seg' byte row for twinpair restrack if int(seg[row_i + k.int]) == 0: # if both twinpair bits in byte are prime ltwin = modk + restwins[r*2 + 1].uint # numerate the upper twinprime val if ltwin <= num: break # if its <= num its the last prime, so exit twinscnt -= 1 # else reduce twinscnt, keep backtracking # reduce restrack, next resgroup if needed r -= 1; if r < 0: (r = lasti; modk -= modpg; k -= 1) let t2 = (epochTime()-t1) # sieve execution time echo("sieve time = ", t2.formatFloat(ffDecimal, 3), " secs") echo("last segment = ", Kn, " resgroups; segment slices = ", Kmax div KB.uint+1) echo("total twins = ", twinscnt, "; last twin = ", ltwin-1, "+/-1") echo("total time = ", (t2+te).formatFloat(ffDecimal, 3), " secs\n") twinprimes_ssoz()