I guess this may help you
--
import operator
from string import whitespace as space
from string import punctuation as punc
class TextProcessing(object):
    """Count word occurrences in a text file and report the most frequent.

    Both result attributes are filled in by ``parse_file``:

    * ``words_and_occurence`` maps each word to the number of times it
      was seen.
    * ``sorted_list`` holds ``(word, count)`` pairs, highest count first.
    """

    def __init__(self):
        """Start with empty results; nothing is read until parse_file()."""
        self.file = None  # never read by this class; kept for compatibility
        self.sorted_list = []
        self.words_and_occurence = {}

    def __sort_dict_by_value(self):
        """Rebuild ``sorted_list`` from the counts, highest count first.

        The sort is stable, so words with equal counts stay in the
        dictionary's iteration order.
        """
        self.sorted_list = sorted(self.words_and_occurence.items(),
                                  key=operator.itemgetter(1), reverse=True)

    def __validate_words(self, word):
        """Record one more occurrence of ``word``."""
        counts = self.words_and_occurence
        counts[word] = counts.get(word, 0) + 1

    def __parse_file(self, file_name):
        """Read ``file_name`` line by line and count every word in it.

        A word is a whitespace-separated token with leading and trailing
        punctuation removed; tokens made only of punctuation are skipped.
        """
        strip_chars = punc + space
        # ``with`` closes the file even if reading fails part-way through.
        with open(file_name, 'r') as fp:
            for line in fp:
                for token in line.split():
                    word = token.strip(strip_chars)
                    if word:
                        self.__validate_words(word)

    def parse_file(self, file_name=None):
        """Count the words in ``file_name`` and refresh ``sorted_list``.

        Counts are never reset, so calling this more than once on the same
        instance accumulates the totals of every file parsed.

        Raises:
            Exception: if ``file_name`` is None or does not end in ".txt".
        """
        if file_name is None:
            raise Exception("Please pass the file to be parsed")
        if not file_name.endswith(r".txt"):
            raise Exception("*** Error *** Not a valid text file")
        self.__parse_file(file_name)
        self.__sort_dict_by_value()

    def print_top_n(self, n):
        """Print the ``n`` most frequent words.

        If fewer than ``n`` distinct words were counted, all of them are
        printed instead of failing; a negative ``n`` prints an empty list.
        """
        # Slicing (unlike indexing 0..n-1) tolerates n > len(sorted_list).
        top_words = [word for word, _ in self.sorted_list[:max(n, 0)]]
        # A single formatted string prints the same on Python 2 and 3.
        print("Top {0} words: {1}".format(n, top_words))

    def print_unique_words(self):
        """Print every distinct word, most frequent first."""
        unique_words = [word for word, _ in self.sorted_list]
        print("Unique words: {0}".format(unique_words))
if __name__ == "__main__":
    # Demo run: count the words in the sample file, then print the four
    # most frequent words followed by the full list of distinct words.
    processor = TextProcessing()
    processor.parse_file(r'test_input.txt')
    processor.print_top_n(4)
    processor.print_unique_words()
*-- Regards --*
*
*
* Siva Cn*
*Python Developer*
*
*
*+91 9620339598*
*http://www.cnsiva.com*
-
On Thu, Oct 17, 2013 at 7:58 PM, wrote:
> Send Tutor mailing list submissions to
> tutor@python.org
>
> To subscribe or unsubscribe via the World Wide Web, visit
> https://mail.python.org/mailman/listinfo/tutor
> or, via email, send a message with subject or body 'help' to
> tutor-requ...@python.org
>
> You can reach the person managing the list at
> tutor-ow...@python.org
>
> When replying, please edit your Subject line so it is more specific
> than "Re: Contents of Tutor digest..."
>
>
> Today's Topics:
>
>1. Re: Help please (Alan Gauld)
>2. Re: Help please (Peter Otten)
>3. Re: Help please (Dominik George)
>4. Re: Help please (Kengesbayev, Askar)
>
>
> --
>
> Message: 1
> Date: Thu, 17 Oct 2013 14:13:07 +0100
> From: Alan Gauld
> To: tutor@python.org
> Subject: Re: [Tutor] Help please
> Message-ID:
> Content-Type: text/plain; charset=ISO-8859-1; format=flowed
>
> On 16/10/13 19:49, Pinedo, Ruben A wrote:
> > I was given this code and I need to modify it so that it will:
> >
> > #1. Error handling for the files to ensure reading only .txt file
>
> I'm not sure what is meant here since your code only ever opens
> 'emma.txt', so it is presumably a text file... Or are you
> supposed to make the filename a user-provided value
> (using raw_input, maybe?)
>
> > #2. Print a range of top words... ex: print top 10-20 words
>
> I assume 'top' here means the most common? Whoever is writing the
> specification for this problem needs to be a bit more specific
> in their definitions.
>
> If so you need to fix the bugs in process_line() and
> process_file(). I don't know if these are deliberate bugs
> or somebody is just sloppy. But neither works as expected
> right now. (Hint: Consider the return values of each)
>
> Once you've done that you can figure out how to extract
> the required number of words from your (unsorted) dictionary,
> and put that in a reporting function and print the output.
> You might be able to use the two common words functions,
> although watch out because they don't do exactly what
> you want and one of them is basically broken...
>
> > #3. Print only the words with > 3 characters
>
> Modify the above to discard words of 3 letters or less.
>
> > #4. Modify the printing function to print top 1 or