//A CRAWLER IMPLEMENTATION please run this prog. on the shell and under the control of debugger when this prog. is run normally the prog. does not terminate .It doesn't come out of the cond. if c<5: so this prog. continues infinitely but if this prog is run under the control of debugger the prog terminates when the cond. if c<5: becomes false i think this prob. may be due to multithreading pls help.
from sgmllib import SGMLParser import threading import re import urllib import pdb import time class urlist(SGMLParser): def reset(self): SGMLParser.reset(self) self.list=[] def start_a(self,attr): href=[v for k,v in attr if k=="href"] if href: self.list.extend(href) mid=2 c=0 class mythread(threading.Thread): stdmutex=threading.Lock() global threads threads=[] def __init__(self,u,myid): self.u=u self.myid=myid threading.Thread.__init__(self) def run(self): global c global mid if c<5: self.stdmutex.acquire() self.usock=urllib.urlopen(self.u) self.p=urlist() self.s=self.usock.read() self.p.feed(self.s) self.usock.close() self.p.close() c=c+1 fname="/root/" + str(c) + ".txt" self.f=open(fname,"w") self.f.write(self.s) self.f.close() print c print self.p.list print self.u print self.myid for j in self.p.list: k=re.search("^https?:",j) if k: i=mythread(j,mid) i.start() threads.append(i) mid=mid+1 self.stdmutex.release() if __name__=="__main__": thread=mythread("http://www.google.co.in/",1) thread.start() threads.append(thread) for thread in threads: thread.join() print "main thread exits" -- http://mail.python.org/mailman/listinfo/python-list