@markebbert Here is a rough version of the working code. Unfortunately, it is 
quite a bit slower, and it does indeed look like most of the time is spent in 
the split function: without split, this code runs in ~39 seconds; with it, it 
takes about 7 minutes on my machine.
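
(If you want to confirm where the time goes, Nim's built-in profiler is one 
option. A minimal setup, assuming the source file is named readgeno.nim: add 
the import below and the program writes a profile_results.txt on exit.)

    # compile with: nim c --profiler:on --stackTrace:on readgeno.nim
    # the resulting profile_results.txt lists the hot call stacks
    import nimprof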
    
    
    import zip/gzipfiles
    import streams
    import strutils
    
    # This is from lib/pure/streams.nim, adapted to reuse a caller-supplied buffer
    proc readStr*(s: Stream, length: int, res: var TaintedString): bool =
      ## Reads up to `length` bytes from the stream `s` into `res`, reusing
      ## `res`'s storage instead of allocating a new string each call.
      ## Returns false when nothing could be read. Raises `IOError` if an
      ## error occurred.
      res.string.setLen(length)
      let L = readData(s, cstring(res), length)
      if L != length: res.string.setLen(L)
      return L != 0
    
    proc c_memchr(s: pointer, c: cint, n: csize): pointer {.
      importc: "memchr", header: "<string.h>".}
    
    iterator lines*(s: GzFileStream, cap = 100_000_000): TaintedString {.tags: [ReadIOEffect].} =
      ## An iterator over the lines of a GzFileStream that reads `cap` bytes
      ## of decompressed data at a time into a reusable buffer.
      var
        bufSize = 4096
        data = TaintedString(newStringOfCap(cap))
        buffer = TaintedString(newStringOfCap(bufSize))
      
      var
        pos = 0
        last = 0
      
      while not s.atEnd():
        if not readStr(s, cap, data): break

        pos = 0
        last = 0

        while pos < data.len:
          # Only search as far as the data actually read; searching a fixed
          # bufSize here could read past the end of the buffer
          let window = min(bufSize, data.len - pos)
          let m = c_memchr(addr data.string[pos], '\L'.ord, window)

          if m != nil:
            # \l found: everything from `pos` up to it completes a line
            last = cast[ByteAddress](m) - cast[ByteAddress](addr data.string[0])

            if buffer.len > 0:
              # prepend the partial line carried over from an earlier chunk
              yield buffer & data[pos ..< last]
              buffer.setLen(0)
            else:
              yield data[pos ..< last]

            pos = last + 1
          else:
            # no newline in this window: stash the partial line and keep
            # scanning from the end of the window
            buffer.add data[pos ..< pos + window]
            pos += window

      if buffer.len > 0:
        # the stream ended without a trailing newline; emit the last line
        yield buffer
    
    
    proc main() =
      let vcf = newGzFileStream("genotypes.vcf.gz")
      # let vcf = newGzFileStream("readgeno.nim.gz")

      var genotype: string

      # This is the magic: reading 100MB of decompressed data at a time
      # speeds things up. Change it as needed for however much RAM you have.
      for line in vcf.lines(100_000_000):

        # skip empty lines and header lines
        if line.len == 0 or line[0] == '#':
          continue

        # count columns per line, not across the whole file, so the
        # `i >= 9` test actually picks out the sample columns
        var i = 0
        for col in split(line, '\t'):

          # Only need sample columns
          if i >= 9:
            for fmt in split(col, ':'):
              genotype = fmt

              # Only need the first element (GT)
              break

          inc(i)

    main()
    
    

Maybe there is a way to optimize split further, but it looks like it has 
already been optimized quite a bit. Or perhaps there is a more efficient 
version of my code; one split-free idea is sketched below. This is a rough 
copy due to time constraints, but I'll try to play with it later.
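
For instance, the inner loop could scan for the tab positions itself and copy 
out only the GT field, so that no seq of column strings is ever allocated. A 
rough, untested sketch (firstGenotypes is a made-up helper name; the sample 
columns starting at index 9 match the code above):

    proc firstGenotypes(line: string): seq[string] =
      ## Collects the first ':'-separated element (GT) of every sample column.
      var col = 0    # current column index
      var start = 0  # start offset of the current column
      for i in 0 .. line.len:
        if i == line.len or line[i] == '\t':
          if col >= 9:  # only sample columns carry genotypes
            var stop = start
            while stop < i and line[stop] != ':': inc stop
            result.add line[start ..< stop]
          inc col
          start = i + 1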

@cblake's solution is really good if decompressing the files is an option for 
you. 
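
(For completeness, once a file is decompressed, the stdlib's plain lines 
iterator over a filename already does the buffered line splitting; a sketch, 
with genotypes.vcf standing in for the decompressed file:)

    for line in lines("genotypes.vcf"):
      # same column parsing as above, minus the gzip layer
      discard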
