With that version, the run time on my file ([enwik8](http://mattmahoney.net/dc/enwik8.zip))
drops to 53 s, versus 64 s for the Python version. But I can't see the results with that version,
and if I save the string slice I will be back to the same problem.

The Python version:
    
    
    from timeit import default_timer as timer
    
    # Length, in characters, of each substring; passed to counter() as `size`.
    WORD_SIZE = 12
    # Upper bound on the number of substrings tracked at once; passed to
    # counter() as `k`.
    K = 10000
    
    def window(line, size):
      """Yield each contiguous `size`-character slice of `line`, left to right.

      Consecutive slices overlap, advancing by one character. Nothing is
      yielded when `line` is shorter than `size`.
      """
      # Walk the exclusive end index of each slice; the start is derived from it.
      for stop in range(size, len(line) + 1):
        yield line[stop - size : stop]
    
    def counter(file, size, k):
      """Count frequent substrings of a text file using a bounded table.

      The whole file is read as text and a window of `size` characters is slid
      over it one character at a time (line breaks are part of the text). At
      most `k` distinct substrings are tracked. When the table is full and an
      untracked substring arrives, every tracked count is decremented, entries
      that reach zero are dropped, and the newcomer itself is not inserted.
      The returned counts are therefore lower bounds, not exact frequencies.

      Args:
        file: Path of the text file to read.
        size: Length, in characters, of each substring.
        k: Maximum number of substrings tracked at once.

      Returns:
        A dict mapping each surviving substring to its remaining count.
      """
      # Read in a single call inside `with`: the handle is closed
      # deterministically and the text is not rebuilt by `+=` on every line.
      with open(file) as source:
        text = source.read()

      counts = {}
      for word in window(text, size):
        if word in counts:
          counts[word] += 1
        elif len(counts) < k:
          counts[word] = 1
        else:
          # Table full: charge the newcomer against every tracked entry.
          # Survivors keep their relative insertion order.
          counts = {key: seen - 1 for key, seen in counts.items() if seen > 1}

      return counts
    
    def printTop(table, top):
      """Print the `top` highest-count entries of `table`, one per line.

      Lines are numbered from 1 and ordered by descending count; entries with
      equal counts keep the order in which they appear in `table`. A line
      break inside a key is printed as a backslash followed by the letter n.
      """
      ranked = sorted(table.items(), key=lambda entry: entry[1], reverse=True)

      for rank, (key, count) in enumerate(ranked, start=1):
        if rank > top:
          break
        shown = key.replace('\n', '\\n')
        print("{}: '{}' -> {}".format(rank, shown, count))
    
    # Time only the counting pass; the table is printed after the elapsed
    # time has been reported.
    t0 = timer()
    res = counter("enwik8", WORD_SIZE, K)
    # NOTE(review): `timer` is timeit.default_timer, which presumably measures
    # elapsed wall-clock time rather than CPU time -- confirm the label.
    print("CPU time [s] ", timer() - t0)
    
    # Show the 30 entries with the highest counts.
    printTop(res, 30)
    

Reply via email to