issue

bruce Mon, 25 Aug 2008 13:04:40 -0700

Hi.

I've got a test web page that basically has two "<html>" tags in it. Examining
the page via the Firefox DOM Inspector, I can create a test XPath query
"/html/body/form" which gets the target form for the test.


The issue comes when I examine the page's source html. It looks like:
<html>
<body>
</body>
</html>

<html>
<body>
.
.
.
</body>
</html>

I've simplified things a bit... but basically, the 1st "html/body" is empty,
with the 2nd containing the data/nodes I need.

When using xpath("/html/body/form"), the app returns nothing or crashes. I've
tried something like xpath("/html[position()=0]") as well, with no
luck. It's as if XPath only looks at the first html element that it sees in a
given page. I can't seem to find any XPath docs on how to work around this. I'm
using libxml2dom with Python 2.5.1.

Any thoughts/comments...

If I comment out the 1st html section, things work as they should. The test
code is below...

thanks

------------------------------------------
#!/usr/bin/python
#
# test.py
#
# scrapes/extracts the basic data for the college
#
#
# the app gets/stores
#  name
#  url
#  address (street/city/state)
#  phone
#
######################################################################3
#test python script
import re
import libxml2dom
import urllib
import urllib2
import sys, string
from  mechanize import Browser
import mechanize
#import tidy
import os.path
import cookielib
from libxml2dom import Node
from libxml2dom import NodeList
import subprocess
import time

########################
#
# Parse schedule.psu.edu (the page named by `url` below)
########################
##cj = "p"
##COOKIEFILE = 'cookies.lwp'
#cookielib = 1


# Short aliases for the two urllib2 entry points.
urlopen, Request = urllib2.urlopen, urllib2.Request

# Two mechanize Browser instances; the visible script only drives `br`.
br, br2 = Browser(), Browser()

# Request metadata: a browser-like User-Agent string, sample form values,
# and the header dict that carries the User-Agent.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values1 = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'Python',
}
headers = {'User-Agent': user_agent}

# Page under test.
url = "http://schedule.psu.edu/"
#=======================================


if __name__ == "__main__":
# main app

        txdata = None

#----------------------------

        ##br.set_cookiejar(cj)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        br.addheaders = [('User-Agent', 'Firefox')]

        print "url =",url
        #br.open(url)
        ##cj.save(COOKIEFILE)    # resave cookies

        #res = br.response()  # this is a copy of response
        #s = res.read()
        #print "slen=",len(s)
        tfile = open("/college/psu1.dat")
        s = tfile.read()
        print s


        # s contains HTML not XML text
        d=[]
        d = libxml2dom.parseString(s, html=1)
        print "d",d

        name_=[]
        len_=0

        br.open(url)
        ##cj.save(COOKIEFILE)    # resave cookies

        #res = br.response()  # this is a copy of response
        #s = res.read()
        print "slen=",len(s)

        # s contains HTML not XML text
        #d=[]
        #d = libxml2dom.parseString(s, html=1)
        #print "d",d

        #name_ = d.xpath("//form")
        name_ = d.xpath("/html/body/form")
        len_ = len(name_)
        print "len=",len_

        print "name1",name_
        print "len",len(name_)
        #print "sdlfs"
        sys.exit()
#       else:
#               print "err in form_ID"


        print "here..."


--
http://mail.python.org/mailman/listinfo/python-list

python - firefox dom/xpath question/issue

Reply via email to