Please ignore my question, I got it: get_url = re.compile(r"""window.open\('(.*)','','toolbar=0,""", re.DOTALL).findall
... get_onclick = str(soup('a')[0]['onclick']) # get the 'onclick' attribute urls = get_url(get_onclick) print assoc_name, urls, assoc_theme returns Amiral ['http://DOMAIN.TLD/extranet/associations/detail-assos.php?id=3815'] Culture On Sun, Oct 14, 2012 at 7:05 PM, Norman Khine <nor...@khine.net> wrote: > hello, i have this code: > > > #!/usr/local/bin/python > # -*- coding: utf-8 -*- > > import re > import urllib2 > import BeautifulSoup > > origin_site = > 'http://DOMAIN.TLD/index.php?id=annuaire_assos&theme=0&rech=&num_page=' > > pages = range(1,3) > > for page_no in pages: > print '====== %s' % page_no > req = ('%s%s' % (origin_site, page_no)) > user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' > headers = { 'User-Agent' : user_agent } > items = [] > try: > urllib2.urlopen(req) > except urllib2.URLError, e: > pass > else: > # do something with the page > doc = urllib2.urlopen(req) > soup = BeautifulSoup.BeautifulSoup(doc) > infoblock = soup.findAll('tr', { "class" : "menu2" }) > for item in infoblock: > soup = BeautifulSoup.BeautifulSoup(str(item)) > for tag in soup.recursiveChildGenerator(): > if isinstance(tag,BeautifulSoup.Tag) and > tag.name in ('td'): > if tag.string is not None: > assoc_name = (tag.string) > if isinstance(tag,BeautifulSoup.Tag) and > tag.name in ('u'): > if tag.string is not None: > assoc_theme = (tag.string) > > get_onclick = soup('a')[0]['onclick'] # get the > 'onclick' attribute > print assoc_name, get_onclick, assoc_theme > > > this returns the following: > > Amiral > window.open('http://DOMAIN.TLD/extranet/associations/detail-assos.php?id=3815','','toolbar=0,menubar=0,location=0,scrollbars=1,top=80,left=400,width=500,height=400');return > false Culture > > how do i extract from the get_onclick the > 'http://DOMAIN.TLD/extranet/associations/detail-assos.php?id=3815' > correctly? > > Any advice much appreciated. 
> > > > -- > %>>> "".join( [ {'*':'@','^':'.'}.get(c,None) or > chr(97+(ord(c)-83)%26) for c in ",adym,*)&uzq^zqf" ] ) -- %>>> "".join( [ {'*':'@','^':'.'}.get(c,None) or chr(97+(ord(c)-83)%26) for c in ",adym,*)&uzq^zqf" ] ) _______________________________________________ Tutor maillist - Tutor@python.org To unsubscribe or change subscription options: http://mail.python.org/mailman/listinfo/tutor