There is actually no need to benchmark.
You shouldn't put 1000 elements in an array, 1000 elements of int32 or float32
will take 4000 bytes and 1000 elements of int64 or float64 will take 8000 bytes
(int32 is 32 bits = 4 bytes).
1\. Stack space is very limited, 1MB to 8MB in general. So with 2 arrays of
1000 elements you are taking 1%~16% of the stack space which is used to store
local variables, function pointers, code segments, ...
> * Note that contrary to most languages, Nim will not copy by value stack
> variables when passing them to other functions if the copy is too costly.
>
2\. The costly functions of a sequence compared to a stack-based array are:
> * memory allocation, but you allocate only once
> * appending, with add operator, because the sequence has to check the
> current reserved memory and possibly reallocate.
>
If you only use indexing a[i] in sequences, like with an array, the cost are
the same: `optional bound-checking` \+ `pointer dereference`. Bound-checking is
not done on release builds and pointer dereference is automatically cached by
the CPU if done multiple times in a tight loop.
If you are really really worried about the total performance allocate your
elements in a sequence and then access them via a ptr UncheckedArray[T] which
will work like a C pointer.
In summary, if you want the best performance for your algorithm:
* use seqs
* avoid temporary sequence memory allocation if you can
* use the indexing scheme `a[i] = value`
* do not use the appending proc `a.add value`
Now here is the benchmark, note that you might run out of stack space for the
arrays:
# ##########################################
# Benchmarking tools
import random, times, stats, strformat, math, sequtils
proc warmup() =
# Warmup - make sure cpu is on max perf
let start = cpuTime()
var foo = 123
for i in 0 ..< 300_000_000:
foo += i*i mod 456
foo = foo mod 789
# Compiler shouldn't optimize away the results as cpuTime rely on
sideeffects
let stop = cpuTime()
echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid
compiler optimizing warmup away)"
template printStats(name: string, output: openarray) {.dirty.} =
echo "\n" & name
echo &"Collected {stats.n} samples in {global_stop - global_start:>4.3f}
seconds"
echo &"Average time: {stats.mean * 1000 :>4.3f} ms"
echo &"Stddev time: {stats.standardDeviationS * 1000 :>4.3f} ms"
echo &"Min time: {stats.min * 1000 :>4.3f} ms"
echo &"Max time: {stats.max * 1000 :>4.3f} ms"
echo &"Theoretical perf: {a.len.float / (float(10^6) * stats.mean):>4.3f}
MFLOP/s"
echo "\nDisplay output[0] to make sure it's not optimized away"
echo output[0] # Prevents compiler from optimizing stuff away
template bench(name: string, output: openarray, body: untyped) {.dirty.}=
block: # Actual bench
var stats: RunningStat
let global_start = cpuTime()
for _ in 0 ..< nb_samples:
let start = cpuTime()
body
let stop = cpuTime()
stats.push stop - start
let global_stop = cpuTime()
printStats(name, output)
# #############################################
proc benchArray(a, b: array[1000, int], nb_samples: int) =
var output: array[1000, int]
bench("Array ifelse", output):
for i in 0 ..< a.len:
if a[i] == b[i]:
output[i] = a[i] * b[i]
else:
output[i] = a[i] + b[i]
proc benchSeq(a, b: array[1000, int], nb_samples: int) =
let a_seq = @a
let b_seq = @b
var output = newSeq[int](1000)
bench("Seq ifelse", output):
for i in 0 ..< a.len:
if a[i] == b[i]:
output[i] = a[i] * b[i]
else:
output[i] = a[i] + b[i]
# ###########################################
when defined(fast_math):
{.passC:"-ffast-math".}
when defined(march_native):
{.passC:"-march=native".}
when isMainModule:
randomize(42) # For reproducibility
warmup()
block:
var a, b: array[1000, int]
for i in 0 ..< 1000:
a[i] = rand(0..100)
b[i] = rand(0..100)
benchArray(a, b, nb_samples = 1_000_000)
benchSeq(a, b, nb_samples = 1_000_000)
Run
And the result on my machine:
Warmup: 1.3207 s, result 224 (displayed to avoid compiler optimizing warmup
away)
Array ifelse
Collected 1000000 samples in 2.685 seconds
Average time: 0.002 ms
Stddev time: 0.001 ms
Min time: 0.001 ms
Max time: 0.087 ms
Theoretical perf: 565.750 MFLOP/s
Display output[0] to make sure it's not optimized away
18
Seq ifelse
Collected 1000000 samples in 2.611 seconds
Average time: 0.002 ms
Stddev time: 0.001 ms
Min time: 0.001 ms
Max time: 0.219 ms
Theoretical perf: 589.900 MFLOP/s
Display output[0] to make sure it's not optimized away
18
Run
Note that MFLOP/s isn't technically true, first of all, we are using integers
and second it consider that the following is a single operation, while there is
a comparison + mul/add.
if a[i] == b[i]:
output[i] = a[i] * b[i]
else:
output[i] = a[i] + b[i]
Run