Here's a working version of the n-gram counter built on a nested dict;
I wonder how it can be improved!

# Count word n-grams in a nested dict: orig[w1][w2]...[wN] -> occurrences.
# Only the first N words of every input line are considered.

lines = ["abra ca dabra",
         "abra ca shvabra",
         "abra movich roman",
         "abra ca dabra",
         "a bra cadadra"]

# One list of whitespace-separated words per input line.
ngrams = [x.split() for x in lines]

N = 3       # n-gram length; the code below assumes N >= 1
N1 = N - 1  # index of the last (leaf) word of an n-gram

orig = {}

for ngram in ngrams:
    # A line with fewer than N words holds no complete n-gram; skip it
    # rather than raising IndexError half-way down a partially built path.
    if len(ngram) < N:
        continue

    h = orig
    # Vivify the N-1 intermediate levels: setdefault returns the existing
    # child dict or installs a fresh one, so no membership test is needed.
    for i, word in enumerate(ngram[:N1]):
        h = h.setdefault(word, {})  # (*)
        # %-formatting keeps the trace text identical on Python 2 and 3.
        print("%s %s" % (i, h))

    # The last word is the leaf: it maps to a counter, not to another dict.
    # It is initialised before the trace line so the trace shows the count
    # as it was before this occurrence is added.
    word = ngram[N1]
    h.setdefault(word, 0)
    print("%s %s" % (N1, h))

    h[word] += 1

print(orig)


-- e.g., perhaps the vivification at (*) could be short-circuited, so that
all the remaining levels down to the end are created in one step?

Cheers,
Alexy

-- 
http://mail.python.org/mailman/listinfo/python-list

Reply via email to