Hello, I have this code:
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
"""Scrape the association directory and print one line per listing row.

For every result page, each <tr class="menu2"> row is reduced to three
values printed on one line: the association name (text of the row's last
<td> that holds a single string), the raw ``onclick`` attribute of the
row's first <a>, and the theme (text of the row's last <u> that holds a
single string).

Written for Python 2 (urllib2, BeautifulSoup 3).
"""
import re
import sys
import urllib2

import BeautifulSoup

origin_site = 'http://DOMAIN.TLD/index.php?id=annuaire_assos&theme=0&rech=&num_page='
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
pages = range(1, 3)


def _last_text(row, tag_name):
    """Return the string of the last <tag_name> in row that has one, else None."""
    text = None
    # findAll matches the tag name exactly; the earlier test
    # `tag.name in ('td')` was a substring check against the string 'td'.
    for tag in row.findAll(tag_name):
        if tag.string is not None:
            text = tag.string
    return text


for page_no in pages:
    print('====== %s' % page_no)

    # Wrap the URL in a Request so the User-Agent header is actually sent.
    req = urllib2.Request('%s%s' % (origin_site, page_no), headers=headers)
    try:
        # Fetch once and keep the response instead of opening the URL twice.
        doc = urllib2.urlopen(req)
    except urllib2.URLError as e:
        # Still best-effort, but report the page that was skipped.
        sys.stderr.write('skipping page %s: %s\n' % (page_no, e))
        continue

    soup = BeautifulSoup.BeautifulSoup(doc)
    for row in soup.findAll('tr', {"class": "menu2"}):
        # Looked up fresh for every row, so a row lacking a name or theme
        # prints None instead of silently reusing the previous row's value.
        assoc_name = _last_text(row, 'td')
        assoc_theme = _last_text(row, 'u')
        try:
            get_onclick = row('a')[0]['onclick']  # get the 'onclick' attribute
        except (IndexError, KeyError):
            # No link, or a link without onclick: nothing to report here.
            continue
        # A single pre-formatted argument keeps the output identical to
        # `print a, b, c` while remaining valid syntax on newer Pythons.
        print('%s %s %s' % (assoc_name, get_onclick, assoc_theme))

# ---------------------------------------------------------------------------
# Remainder of the original mailing-list message, preserved as comments:
#
# this returns the following:
#
#   Amiral window.open('http://DOMAIN.TLD/extranet/associations/detail-assos.php?id=3815','','toolbar=0,menubar=0,location=0,scrollbars=1,top=80,left=400,width=500,height=400');return false Culture
#
# how do i extract from the get_onclick the
# 'http://DOMAIN.TLD/extranet/associations/detail-assos.php?id=3815' correctly?
# Any advise much appreciated.
#
# --
# %>>> "".join( [ {'*':'@','^':'.'}.get(c,None) or chr(97+(ord(c)-83)%26) for c in ",adym,*)&uzq^zqf" ] )
# _______________________________________________
# Tutor maillist - Tutor@python.org
# To unsubscribe or change subscription options:
# http://mail.python.org/mailman/listinfo/tutor