Sorry, I did not see the other thread in which this
approach had already been covered. The point Kent
raised about going into an infinite loop on a file with
a single line is quite right.

The following is the corrected version (for
completeness' sake):

import os,random

def getrandfromMem(filename) :
  """Pick a uniformly random line of `filename` by loading it into memory.

  The whole file is read with readlines(), so this is meant for small
  files only.

  Returns a (pos, line) tuple: `pos` is the zero-based index of the chosen
  line and `line` is its raw content (the file is opened in binary mode,
  so the trailing newline is kept).

  Raises IndexError if the file contains no lines.
  """
  fd = open(filename,'rb')
  try :
    lines = fd.readlines()
  finally :
    fd.close()  # release the handle even if reading fails

  if not lines :
    raise IndexError("cannot pick a random line from an empty file: %r" % (filename,))

  # randrange() excludes its upper bound.  randint(0, len(lines)) could
  # return len(lines) itself and index one past the end of the list.
  pos = random.randrange(len(lines))
  return (pos,lines[pos])

def getrandomline2(filename) :
  """Return a random line of `filename` as a (pos, line) tuple.

  Files smaller than 4096 bytes are handed to getrandfromMem(), in which
  case `pos` is a line index.  For larger files a random byte offset is
  chosen, the (possibly partial) line at that offset is discarded, and the
  line after it is returned together with the offset.

  An offset that lands inside the last line has no following line, so up
  to 10 offsets are tried; if every try fails the function falls back to
  getrandfromMem().

  Note: on the seek path the first line of the file is never selected,
  because the line containing the chosen offset is always skipped.
  """
  filesize = os.path.getsize(filename)
  if filesize < 4096 :  # Seek may not be very useful
    return getrandfromMem(filename)

  fd = open(filename,'rb')
  try :
    for _ in range(10) : # Try 10 times
      # randrange() excludes `filesize`: seeking to EOF could never
      # yield a line and would only waste a try.
      pos = random.randrange(filesize)
      fd.seek(pos)
      fd.readline()  # Read and ignore the (partial) current line
      line = fd.readline()
      if line :  # empty result means we landed in the last line
        return (pos,line)
  finally :
    fd.close()  # always release the handle, whichever path returns

  # All tries landed in the last line; the result must be returned,
  # otherwise the caller receives None.
  return getrandfromMem(filename)

# Example call on the sample text file used in this thread.
getrandomline2("shaks12.txt")

Caveat: it will still never select the 1st line during
random selection if the file size is 4096 bytes or more!


--- Aditya Lal <[EMAIL PROTECTED]> wrote:

> An alternative approach (I found the Yorick's code
> to
> be too slow for large # of calls) :
> 
> We can use file size to pick a random point in the
> file. We can read and ignore text till next new
> line.
> This will avoid outputting partial lines. Return the
> next line (which I guess is still random :)). 
> 
> Indicative code -
> 
> import os,random
> 
> def getrandomline(filename) :
>   offset = random.randint(0,os.stat(filename)[6])
>   fd = file(filename,'rb')
>   fd.seek(offset)
>   fd.readline()  # Read and ignore
>   return fd.readline()
> 
> getrandomline("shaks12.txt")
> 
> Caveat: The above code will never choose 1st line
> and
> will return '' for last line. Other than the
> boundary
> conditions it will work well (even for large files).
> 
> 
> Interestingly :
> 
> On modifying this code to take in file object rather
> than filename, the performance improved by ~50%. On
> wrapping it in a class, it further improved by ~25%.
> 
> On executing the get random line 100,000 times on
> large file (size 2707519 with 9427 lines), the class
> version finished < 5 seconds.
> 
> Platform : 2GHz Intel Core 2 Duo macBook (2GB RAM)
> running Mac OSX (10.4.10).
> 
> Output using python 2.5.1 (stackless)
> 
> Approach using enum approach : 9.55798196793 : for
> [100] iterations
> Approach using filename : 11.552863121 : for
> [100000]
> iterations
> Approach using file descriptor : 5.97015094757 : for
> [100000] iterations
> Approach using class : 4.46039891243 : for [100000]
> iterations
> 
> Output using python 2.3.5 (default python on OSX)
> 
> Approach using enum approach : 12.2886080742 : for
> [100] iterations
> Approach using filename : 12.5682640076 : for
> [100000]
> iterations
> Approach using file descriptor : 6.55952501297 : for
> [100000] iterations
> Approach using class : 5.35413718224 : for [100000]
> iterations
> 
> I am attaching test program FYI.
> 
> --
> Aditya
> 
> --- Nathan Coulter
> <[EMAIL PROTECTED]> wrote:
> 
> > >  -------Original Message-------
> > >  From: Tiger12506 <[EMAIL PROTECTED]>
> > 
> > >  Yuck. Talk about a one shot function! Of course
> > it only reads through the
> > >  file once! You only call the function once. Put
> a
> > second print randline(f)
> > >  at the bottom of your script and see what
> happens
> > :-)
> > >  
> > >  JS
> > >  
> > 
> > *sigh*
> > 
> > #!/bin/env python
> > 
> > import os
> > import random
> > 
> > text = 'shaks12.txt'
> > if not os.path.exists(text):
> >   os.system('wget
> >
> http://www.gutenberg.org/dirs/etext94/shaks12.txt')
> > 
> > def randline(f):
> >     for i,j in enumerate(file(f, 'rb')):
> >         if random.randint(0,i) == i:
> >             line = j
> >     return line
> > 
> > print randline(text)
> > print randline(text)
> > print randline(text)
> > 
> > -- 
> > Yorick
> > _______________________________________________
> > Tutor maillist  -  Tutor@python.org
> > http://mail.python.org/mailman/listinfo/tutor
> > 
> 
> 
> 
>  
>
____________________________________________________________________________________
> Sucker-punch spam with award-winning protection. 
> Try the free Yahoo! Mail Beta.
>
http://advision.webevents.yahoo.com/mailbeta/features_spam.html>
import os
> import random
> 
> class randomline :
>       
>       def __init__(self, filename="largefile.txt") :
>               self.filesize = os.stat(filename)[6]
>               self.fd = file(filename, 'rb')
> 
>       def getline(self) :
>               offset = random.randint(0,self.filesize)
>               self.fd.seek(offset)
>               self.fd.readline()
>               line = self.fd.readline()
>               return (offset,line)
>       
>       def close(self) :
>               self.fd.close()
> 
> # Uses file name
> def getrandomline(filename) :
>       offset = random.randint(0,os.stat(filename)[6])
>       fd = file(filename, 'rb')
>       fd.seek(offset)
>       ret = (offset,fd.readline())
>       fd.close()
>       return ret
> 
> # Uses file descriptor
> def getrandline(fd) :
>       offset = random.randint(0,os.fstat(fd.fileno())[6])
>       fd.seek(offset)
>       line = fd.readline()
>       return (offset,fd.readline())
> 
> # Uses enumeration
> def randline(fd):
>       for i,j in enumerate(fd) :
>               if random.randint(0,i) == i:
>                       line = j
>       fd.seek(0)
>       return line
> 
> 
> if __name__ == '__main__' :
> 
>       # Substitute your file name
>       filename = "largefile.txt"
> 
>       # Class
>       rd = randomline(filename)
>       print rd.getline()
>       rd.close()
> 
>       # file name
>       print getrandomline(filename)
> 
>       # file descriptor
>       fd = file(filename,'rb')
>       print getrandline(fd)
>       fd.close()
> 
>       # Using enum approach
>       fd = file(filename,'rb')
>       print randline(fd)
>       fd.close()
> 
>       from timeit import Timer 
>       t_class = Timer('rd.getline()', 'from __main__
> import randomline ; rd =
> randomline("'+filename+'")')
>       t_filename = Timer('getrandomline("'+filename+'")',
> 'from __main__ import getrandomline')
>       t_fd = Timer('getrandline(fd)', 'from __main__
> import getrandline ; fd = file("'+filename+'")')
>       t_enum = Timer('randline(fd)', 'from __main__
> import randline ; fd = file("'+filename+'")')
> 
>       print 'Approach using enum approach : %s : for [%d]
> iterations' % (str(t_enum.timeit(100)),100)
>       print 'Approach using filename : %s : for [%d]
> iterations' %
> (str(t_filename.timeit(100000)),100000)
>       print 'Approach using file descriptor : %s : for
> [%d] iterations' % (str(t_fd.timeit(100000)),100000)
>       print 'Approach using class : %s : for [%d]
> iterations' % (str(t_class.timeit(100000)),100000)
> 
> > _______________________________________________
> Tutor maillist  -  Tutor@python.org
> http://mail.python.org/mailman/listinfo/tutor
> 



       
____________________________________________________________________________________
Sick sense of humor? Visit Yahoo! TV's 
Comedy with an Edge to see what's on, when. 
http://tv.yahoo.com/collections/222
_______________________________________________
Tutor maillist  -  Tutor@python.org
http://mail.python.org/mailman/listinfo/tutor

Reply via email to