Sorry, I did not see the other thread in which this approach had already been covered. The point Kent raised about going into an infinite loop on a file with a single line is very true.
Following is the corrected version (for completeness' sake) - import os,random def getrandfromMem(filename) : fd = file(filename,'rb') l = fd.readlines() pos = random.randint(0,len(l)) fd.close() return (pos,l[pos]) def getrandomline2(filename) : filesize = os.stat(filename)[6] if filesize < 4096 : # Seek may not be very useful return getrandfromMem(filename) fd = file(filename,'rb') for _ in range(10) : # Try 10 times pos = random.randint(0,filesize) fd.seek(pos) fd.readline() # Read and ignore line = fd.readline() if line != '' : break if line != '' : return (pos,line) else : getrandfromMem(filename) getrandomline2("shaks12.txt") Caveat : It will still skip the 1st line during random selection if the file's size exceeds 4096 bytes !! --- Aditya Lal <[EMAIL PROTECTED]> wrote: > An alternative approach (I found the Yorick's code > to > be too slow for large # of calls) : > > We can use file size to pick a random point in the > file. We can read and ignore text till next new > line. > This will avoid outputting partial lines. Return the > next line (which I guess is still random :)). > > Indicative code - > > import os,random > > def getrandomline(filename) : > offset = random.randint(0,os.stat(filename)[6]) > fd = file(filename,'rb') > fd.seek(offset) > fd.readline() # Read and ignore > return fd.readline() > > getrandomline("shaks12.txt") > > Caveat: The above code will never choose 1st line > and > will return '' for last line. Other than the > boundary > conditions it will work well (even for large files). > > > Interestingly : > > On modifying this code to take in file object rather > than filename, the performance improved by ~50%. On > wrapping it in a class, it further improved by ~25%. > > On executing the get random line 100,000 times on > large file (size 2707519 with 9427 lines), the class > version finished < 5 seconds. > > Platform : 2GHz Intel Core 2 Duo macBook (2GB RAM) > running Mac OSX (10.4.10). 
> > Output using python 2.5.1 (stackless) > > Approach using enum approach : 9.55798196793 : for > [100] iterations > Approach using filename : 11.552863121 : for > [100000] > iterations > Approach using file descriptor : 5.97015094757 : for > [100000] iterations > Approach using class : 4.46039891243 : for [100000] > iterations > > Output using python 2.3.5 (default python on OSX) > > Approach using enum approach : 12.2886080742 : for > [100] iterations > Approach using filename : 12.5682640076 : for > [100000] > iterations > Approach using file descriptor : 6.55952501297 : for > [100000] iterations > Approach using class : 5.35413718224 : for [100000] > iterations > > I am attaching test program FYI. > > -- > Aditya > > --- Nathan Coulter > <[EMAIL PROTECTED]> wrote: > > > > -------Original Message------- > > > From: Tiger12506 <[EMAIL PROTECTED]> > > > > > Yuck. Talk about a one shot function! Of course > > it only reads through the > > > file once! You only call the function once. Put > a > > second print randline(f) > > > at the bottom of your script and see what > happens > > :-) > > > > > > JS > > > > > > > *sigh* > > > > #!/bin/env python > > > > import os > > import random > > > > text = 'shaks12.txt' > > if not os.path.exists(text): > > os.system('wget > > > http://www.gutenberg.org/dirs/etext94/shaks12.txt') > > > > def randline(f): > > for i,j in enumerate(file(f, 'rb')): > > if random.randint(0,i) == i: > > line = j > > return line > > > > print randline(text) > > print randline(text) > > print randline(text) > > > > -- > > Yorick > > _______________________________________________ > > Tutor maillist - Tutor@python.org > > http://mail.python.org/mailman/listinfo/tutor > > > > > > > ____________________________________________________________________________________ > Sucker-punch spam with award-winning protection. > Try the free Yahoo! Mail Beta. 
> http://advision.webevents.yahoo.com/mailbeta/features_spam.html> import os > import random > > class randomline : > > def __init__(self, filename="largefile.txt") : > self.filesize = os.stat(filename)[6] > self.fd = file(filename, 'rb') > > def getline(self) : > offset = random.randint(0,self.filesize) > self.fd.seek(offset) > self.fd.readline() > line = self.fd.readline() > return (offset,line) > > def close(self) : > self.fd.close() > > # Uses file name > def getrandomline(filename) : > offset = random.randint(0,os.stat(filename)[6]) > fd = file(filename, 'rb') > fd.seek(offset) > ret = (offset,fd.readline()) > fd.close() > return ret > > # Uses file descriptor > def getrandline(fd) : > offset = random.randint(0,os.fstat(fd.fileno())[6]) > fd.seek(offset) > line = fd.readline() > return (offset,fd.readline()) > > # Uses enumeration > def randline(fd): > for i,j in enumerate(fd) : > if random.randint(0,i) == i: > line = j > fd.seek(0) > return line > > > if __name__ == '__main__' : > > # Substitute your file name > filename = "largefile.txt" > > # Class > rd = randomline(filename) > print rd.getline() > rd.close() > > # file name > print getrandomline(filename) > > # file descriptor > fd = file(filename,'rb') > print getrandline(fd) > fd.close() > > # Using enum approach > fd = file(filename,'rb') > print randline(fd) > fd.close() > > from timeit import Timer > t_class = Timer('rd.getline()', 'from __main__ > import randomline ; rd = > randomline("'+filename+'")') > t_filename = Timer('getrandomline("'+filename+'")', > 'from __main__ import getrandomline') > t_fd = Timer('getrandline(fd)', 'from __main__ > import getrandline ; fd = file("'+filename+'")') > t_enum = Timer('randline(fd)', 'from __main__ > import randline ; fd = file("'+filename+'")') > > print 'Approach using enum approach : %s : for [%d] > iterations' % (str(t_enum.timeit(100)),100) > print 'Approach using filename : %s : for [%d] > iterations' % > (str(t_filename.timeit(100000)),100000) > print 
'Approach using file descriptor : %s : for > [%d] iterations' % (str(t_fd.timeit(100000)),100000) > print 'Approach using class : %s : for [%d] > iterations' % (str(t_class.timeit(100000)),100000) > > > _______________________________________________ > Tutor maillist - Tutor@python.org > http://mail.python.org/mailman/listinfo/tutor > ____________________________________________________________________________________ Sick sense of humor? Visit Yahoo! TV's Comedy with an Edge to see what's on, when. http://tv.yahoo.com/collections/222 _______________________________________________ Tutor maillist - Tutor@python.org http://mail.python.org/mailman/listinfo/tutor