Re: R-style logical vector operations in Nim?

mratsim Tue, 06 Nov 2018 03:30:53 -0800

There is actually no need to benchmark.

You shouldn't put 1000 elements in an array, 1000 elements of int32 or float32 
will take 4000 bytes and 1000 elements of int64 or float64 will take 8000 bytes 
(int32 is 32 bits = 4 bytes).


1\. Stack space is very limited, 1MB to 8MB in general. So with 2 arrays of 
1000 elements you are taking 1%~16% of the stack space which is used to store 
local variables, function pointers, code segments, ...

>   * Note that contrary to most languages, Nim will not copy by value stack 
> variables when passing them to other functions if the copy is too costly.
> 


2\. The costly functions of a sequence compared to a stack-based array are:

>   * memory allocation, but you allocate only once
>   * appending, with add operator, because the sequence has to check the 
> current reserved memory and possibly reallocate.
> 


If you only use indexing a[i] in sequences, like with an array, the cost are 
the same: `optional bound-checking` \+ `pointer dereference`. Bound-checking is 
not done on release builds and pointer dereference is automatically cached by 
the CPU if done multiple times in a tight loop.

If you are really really worried about the total performance allocate your 
elements in a sequence and then access them via a ptr UncheckedArray[T] which 
will work like a C pointer.

In summary, if you want the best performance for your algorithm:
    

  * use seqs
  * avoid temporary sequence memory allocation if you can
  * use the indexing scheme `a[i] = value`
  * do not use the appending proc `a.add value`



Now here is the benchmark, note that you might run out of stack space for the 
arrays:
    
    
    # ##########################################
    # Benchmarking tools
    import random, times, stats, strformat, math, sequtils
    
    proc warmup() =
      # Warmup - make sure cpu is on max perf
      let start = cpuTime()
      var foo = 123
      for i in 0 ..< 300_000_000:
        foo += i*i mod 456
        foo = foo mod 789
      
      # Compiler shouldn't optimize away the results as cpuTime rely on 
sideeffects
      let stop = cpuTime()
      echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid 
compiler optimizing warmup away)"
    
    template printStats(name: string, output: openarray) {.dirty.} =
      echo "\n" & name
      echo &"Collected {stats.n} samples in {global_stop - global_start:>4.3f} 
seconds"
      echo &"Average time: {stats.mean * 1000 :>4.3f} ms"
      echo &"Stddev  time: {stats.standardDeviationS * 1000 :>4.3f} ms"
      echo &"Min     time: {stats.min * 1000 :>4.3f} ms"
      echo &"Max     time: {stats.max * 1000 :>4.3f} ms"
      echo &"Theoretical perf: {a.len.float / (float(10^6) * stats.mean):>4.3f} 
MFLOP/s"
      echo "\nDisplay output[0] to make sure it's not optimized away"
      echo output[0] # Prevents compiler from optimizing stuff away
    
    template bench(name: string, output: openarray, body: untyped) {.dirty.}=
      block: # Actual bench
        var stats: RunningStat
        let global_start = cpuTime()
        for _ in 0 ..< nb_samples:
          let start = cpuTime()
          body
          let stop = cpuTime()
          stats.push stop - start
        let global_stop = cpuTime()
        printStats(name, output)
    
    # #############################################
    
    proc benchArray(a, b: array[1000, int], nb_samples: int) =
      var output: array[1000, int]
      bench("Array ifelse", output):
        for i in 0 ..< a.len:
          if a[i] == b[i]:
            output[i] = a[i] * b[i]
          else:
            output[i] = a[i] + b[i]
    
    proc benchSeq(a, b: array[1000, int], nb_samples: int) =
      let a_seq = @a
      let b_seq = @b
      var output = newSeq[int](1000)
      bench("Seq ifelse", output):
        for i in 0 ..< a.len:
          if a[i] == b[i]:
            output[i] = a[i] * b[i]
          else:
            output[i] = a[i] + b[i]
    
    # ###########################################
    
    when defined(fast_math):
      {.passC:"-ffast-math".}
    
    when defined(march_native):
      {.passC:"-march=native".}
    
    when isMainModule:
      randomize(42) # For reproducibility
      warmup()
      block:
        var a, b: array[1000, int]
        for i in 0 ..< 1000:
          a[i] = rand(0..100)
          b[i] = rand(0..100)
        
        benchArray(a, b, nb_samples = 1_000_000)
        benchSeq(a, b, nb_samples = 1_000_000)
    
    
    Run

And the result on my machine:
    
    
    Warmup: 1.3207 s, result 224 (displayed to avoid compiler optimizing warmup 
away)
    
    Array ifelse
    Collected 1000000 samples in 2.685 seconds
    Average time: 0.002 ms
    Stddev  time: 0.001 ms
    Min     time: 0.001 ms
    Max     time: 0.087 ms
    Theoretical perf: 565.750 MFLOP/s
    
    Display output[0] to make sure it's not optimized away
    18
    
    Seq ifelse
    Collected 1000000 samples in 2.611 seconds
    Average time: 0.002 ms
    Stddev  time: 0.001 ms
    Min     time: 0.001 ms
    Max     time: 0.219 ms
    Theoretical perf: 589.900 MFLOP/s
    
    Display output[0] to make sure it's not optimized away
    18
    
    
    Run

Note that MFLOP/s isn't technically true, first of all, we are using integers 
and second it consider that the following is a single operation, while there is 
a comparison + mul/add.
    
    
    if a[i] == b[i]:
            output[i] = a[i] * b[i]
          else:
            output[i] = a[i] + b[i]
    
    
    Run

Re: R-style logical vector operations in Nim?

Reply via email to