//A CRAWLER IMPLEMENTATION
Please run this program from the shell, and also under the control of the debugger.
When the program is run normally, it does not terminate. It
never gets out of the condition `if c<5:`, so the program continues
indefinitely.
But if the program is run under the control of the debugger, it
terminates as soon as the condition `if c<5:` becomes false.
I think this problem may be due to multithreading. Please help.


from sgmllib import SGMLParser
import threading
import re
import urllib
import pdb
import time
class urlist(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.list=[]

    def start_a(self,attr):
        href=[v for k,v in attr if k=="href"]
        if href:
            self.list.extend(href)
# Shared crawl state, updated by the worker threads.
c = 0    # number of pages fetched and saved so far
mid = 2  # id handed to the next spawned worker (the first worker is given id 1)
class mythread(threading.Thread):
         stdmutex=threading.Lock()
         global threads
         threads=[]
         def __init__(self,u,myid):
                self.u=u
                self.myid=myid
                threading.Thread.__init__(self)
         def run(self):
                global c
                global mid
                if c<5:
                        self.stdmutex.acquire()
                        self.usock=urllib.urlopen(self.u)
                        self.p=urlist()
                        self.s=self.usock.read()
                        self.p.feed(self.s)
                        self.usock.close()
                        self.p.close()
                        c=c+1
                        fname="/root/" + str(c) + ".txt"
                        self.f=open(fname,"w")
                        self.f.write(self.s)
                        self.f.close()
                        print c
                        print self.p.list
                        print self.u
                        print self.myid
                        for j in self.p.list:
                                k=re.search("^https?:",j)
                                if k:
                                   i=mythread(j,mid)
                                   i.start()
                                   threads.append(i)
                                   mid=mid+1
                        self.stdmutex.release()






if __name__=="__main__":
    thread=mythread("http://www.google.co.in/",1)
    thread.start()
    threads.append(thread)
    for thread in threads:
          thread.join()
    print "main thread exits"

































































































































































































































































-- 
http://mail.python.org/mailman/listinfo/python-list

Reply via email to