Hi, my name is heba ibrahim abukaff from jordan ,iam a computer information system student at university of jordan . i have a trouble using the tokenizer to find the frequency list for URL using arabic text.and iam using python 2.7.2 on winXP,I tried this code but every time i run the code appears error with first line COULD YOU HELP ME. WITH REGARDS. :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: python 2.7.2 (default, Jun 12 2011, 15:08:59) [MSC v.1500 32 bit (Intel)] on win32 > import re, codecs import nltk
from urllib import request url = "http://ar.wikipedia.org/wiki/%D9%85%D9%88%D9%82%D8%B9_%D9%88%D9%8A%D8%A8" response = request.urlopen(url) raw = response.read().decode('utf8') print(raw) import re, codecs import nltk from nltk.probability import * def construct_frequency_list(words): outfile = codecs.open(r'raw.txt' , 'w') fd = nltk.probability.FreqDist() for w in words: fd.inc(w) print "Total number of words: %d Vocbulary size : %d" % (fd.N(), fd.B()) print "word with highest count: %s" % (fd.max()) tokenlist = fd.iteritems() for (key, value) in tokenlist: print>>outfile, "%s\t%d" % (key, value) outfile.close() def read_textfile(raw ): lines = codecs.open(raw ,'r','utf_8').readlines() outfile = codecs.open(r'raw.txt' ,'w','utf_8') counter = 0 wordlist = [] for line in lines: tokens = line.rstrip().lstrip().split() for t in tokens: wordlist.append(t) print>>outfile, '%s\t%d' % (t, len(t)) counter += len(t) outfile.close() return wordlist if __name__ == "__main__": words = read_textfile(r'raw .txt') construct_frequency_list(words) print "Done!" -- https://mail.python.org/mailman/listinfo/python-list