Background:
I have a 3,000,000 entry unsorted **word2vec** model (_each word has a 300
floating point vector_). I have to sort this wordlist in alphabetic order and
also reorder the associated vectors to allow binary search lookups of _word - >
vector_
The problem is that I can't seem to write **generic** Nim sorts which handles
these dynamically **UncheckedArray** _mmap based_ data structures.
The following is a simplified **failing** exemplar [ _Sorry I couldn 't make it
shorter :-(_ ]
Any assistance would be greatly appreciated.
I am particularly interested in using large **mmap structures** (_whose
dimensionality is not known at compile time_) as to avoid unnecessary string
handling overhead
import algorithm, strutils
# Used to reorder original data inplace once sort is performed
proc reorderInPlace[T](
elements: var openArray[T];
index: var openArray[int]) =
for i in 0 ..< elements.len:
while index[i] != i:
let targetIndex = index[i]
swap elements[i], elements[targetIndex]
swap index[i], index[targetIndex]
# Want to dynamically allocate ordinal data structure at run-time
proc createUncheckedArray*(n: int): ptr UncheckedArray[int] =
let rawMem = cast[pointer](alloc(n * sizeof(int)))
cast[ptr UncheckedArray[int]](rawMem)
# Future: return sorted index orderings to reorder additional associated
datastores
proc sortInPlace[T](elements: var openArray[T]) =
let length = elements.len
# Ideally want non-hardwired index orderings
discard """
var index = createUncheckedArray length
for i in 0 ..< length: index[i] = i
# Sort doesn't seem to like UncheckedArrays :-(
index.sort do (a, b: int) -> int: elements[a].cmpIgnoreCase elements[b]
generates compile time error:
Expression: sort(index)do (a, b: int) -> int:
elements[a].cmpIgnoreCase elements[b]
[1] index: ptr UncheckedArray[int]
[2] do (a, b: int) -> int:
elements[a].cmpIgnoreCase elements[b]: void
Expected one of (first mismatch at [position]):
[1] func sort[T](a: var openArray[T]; cmp: proc (x, y: T): int
{.closure.};
order = SortOrder.Ascending)
[1] proc sort[T](a: var openArray[T]; order = SortOrder.Ascending)
"""
# Even when I kludge a hardwired 0..n array I get a runtime error
var index: array[4, int]
for i in 0 ..< length: index[i] = i
index.sort do (a, b: int) -> int: elements[a].cmpIgnoreCase elements[b]
discard """
Genererates runtime error:
... Error: 'elements' is of type <var seq[string]>
which cannot be captured as it would violate memory safety,
declared here:
/Users/dennismisener/work/Nim/test_reorder_in_place.nim(17, 21);
using '-d:nimNoLentIterators' helps in some cases.
Consider using a <ref var seq[string]> which can be captured.
"""
elements.reorderInPlace index
# dealloc index # for UncheckedArray
# Example usage
when isMainModule:
# Test reorder array - works!
var
elements1 = ['a', 'b', 'c', 'd']
index1 = [3, 0, 1, 2]
reorderInPlace(elements1, index1)
assert elements1 == ['b', 'c', 'd', 'a']
# Test reorder seq - works!
var
elements2 = @['a', 'b', 'c', 'd']
index2 = [3, 2, 1, 0]
reorderInPlace(elements2, index2)
assert elements2 == @['d', 'c', 'b', 'a']
# Now Test sortInplace - Fails
var words = @["dog", "cat", "apple", "bob"]
words.sortInPlace
Run