What do you find strange about the results?
On Aug 21, 7:32 am, Stef Mientki <stef.mien...@gmail.com> wrote:
> > Graphical representation of links or pages that don't get linked to.
>
> I tried to test the links (with 2 algorithms, code below) in a generated
> webpage, but the results I get are very weird.
> Probably one of you knows a better way?
>
> cheers,
> Stef
>
> from BeautifulSoup import BeautifulSoup
> from urllib import urlopen
> from httplib import HTTP
> from urlparse import urlparse
>
> def Check_URL_1 ( URL ) :
>     # Algorithm 1: fetch the page and test the HTTP status code.
>     try :
>         fh = urlopen ( URL )
>         return fh.code == 200
>     except :
>         return False
>
> def Check_URL_2 ( URL ) :
>     # Algorithm 2: send only a HEAD request and test the reply status.
>     p = urlparse ( URL )
>     h = HTTP ( p[1] )
>     h.putrequest ( 'HEAD', p[2] )
>     h.endheaders ()
>     return h.getreply ()[0] == 200
>
> def Verify_Links ( URL ) :
>     Parts   = URL.split ( '/' )
>     Site    = '/'.join ( Parts [:3] )    # scheme plus host, e.g. http://127.0.0.1:8000
>     Current = '/'.join ( Parts [:-1] )   # URL minus its last path segment
>
>     fh = urlopen ( URL )
>     lines = fh.read ()
>     fh.close ()
>
>     Soup  = BeautifulSoup ( lines )
>     hrefs = Soup.findAll ( 'a' )
>
>     for href in hrefs :
>         href = href [ 'href' ]  #[:-1]  ## <== remove "#" to generate all errors
>
>         if href.startswith ( '/' ) :
>             href = Site + href
>         elif href.startswith ( '#' ) :
>             href = URL + href
>         elif href.startswith ( 'http' ) :
>             pass
>         else :
>             href = Current + '/' + href   # a '/' is needed between the two parts
>
>         try :
>             fh = urlopen ( href )   # plain urlopen: only urlopen was imported, not urllib
>         except :
>             pass
>         print Check_URL_1 ( href ), Check_URL_2 ( href ), href
>
> URL = 'http://127.0.0.1:8000/welcome/default/index'
> Verify_Links ( URL )
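
For what it's worth, here is a minimal sketch of the same check, not the quoted code: urlparse.urljoin handles all four prefix cases ('/', '#', 'http', plain relative) in one call, and httplib.HTTPConnection issues the HEAD request; both come from the Python 2 standard library. The base URL is the one from the quoted post, but the sample hrefs are made up for illustration. One thing worth knowing: the '#fragment' part of a URL is never sent to the server, so HEAD-checking URL + '#anchor' just re-fetches the page itself and always reports the same status, which may be part of what looks strange.

from urlparse import urljoin, urldefrag, urlparse
import httplib

def head_ok ( url ) :
    # Strip the fragment first: servers never see the '#...' part.
    url, _fragment = urldefrag ( url )
    parts = urlparse ( url )
    conn = httplib.HTTPConnection ( parts.netloc )   # netloc may include ':port'
    try :
        conn.request ( 'HEAD', parts.path or '/' )
        return conn.getresponse ().status < 400      # treat 2xx/3xx as "alive"
    except Exception :
        return False
    finally :
        conn.close ()

base = 'http://127.0.0.1:8000/welcome/default/index'
# Hypothetical hrefs, one per branch of the quoted if/elif chain:
for raw in ( '/welcome/static/base.css', '#top', 'edit', 'http://example.com/' ) :
    full = urljoin ( base, raw )
    print head_ok ( full ), full

Also be aware that some servers answer HEAD with 405 Method Not Allowed even when the page exists; falling back to a GET for that status code is the usual workaround.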