RE: xpath questions...

bruce Sat, 23 Aug 2008 06:35:33 -0700

valid point...!!

here's the test python script.. ugly as it is!!


#!/usr/bin/python
#
# test.py
#
# scrapes/extracts the basic data for the college
#
#
# the app gets/stores
#  name
#  url
#  address (street/city/state)
#  phone
#
######################################################################3
#test python script
import re
import libxml2dom
import urllib
import urllib2
import sys, string
from  mechanize import Browser
import mechanize
#import tidy
import os.path
import cookielib
from libxml2dom import Node
from libxml2dom import NodeList
import subprocess
import MySQLdb
#import mysql_config
import time


########################
#
# Parse pricegrabber.com
########################


# Short module-level aliases for the urllib2 entry points.
Request = urllib2.Request
urlopen = urllib2.urlopen
# Cookie persistence through an LWPCookieJar is currently switched off.
##cj = urllib2.cookielib.LWPCookieJar()

# Two separate mechanize Browser instances; the main section below
# only drives `br`.
br, br2 = Browser(), Browser()


# Sample request data: form values plus a User-Agent header mapping.
# Neither is referenced by the main section visible in this script.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values1 = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'Python',
}
headers = {'User-Agent': user_agent}

# Candidate schedule sites. The name is rebound immediately, so the
# last assignment is the URL the script actually fetches.
url = "http://schedule.berkeley.edu/"
url = "http://schedule.psu.edu/"
#=======================================


if __name__ == "__main__":
    # Main app: fetch the page at `url` with the mechanize browser `br`,
    # parse the response body as HTML with libxml2dom, run one XPath query
    # against the document and print the first matching node in several
    # forms, then exit.
    #
    # print(...) is called with a single pre-formatted string so that the
    # output is the same whether print is a statement (Python 2, which the
    # imports of this script require) or a function.

    # Browser configuration: follow redirects, send the Referer header,
    # do not apply robots.txt handling, and present a fixed User-Agent.
    #br.set_cookiejar(cj)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.addheaders = [('User-Agent', 'Firefox')]

    # murl is the master url that gets fetched
    murl = url

    print("url = %s" % murl)
    br.open(murl)
    #cj.save(COOKIEFILE)    # resave cookies

    res = br.response()  # this is a copy of response
    s = res.read()

    # s contains HTML not XML text
    d = libxml2dom.parseString(s, html=1)

    # Earlier query experiments, kept for reference only.  The first one
    # was hard-wrapped across two lines in the posted message, which left
    # its second half as bare (invalid) code; it is rejoined here.
    #tn1 = "//[EMAIL PROTECTED]'main_content']/form[1]/input[position()=1]/@name"
    #q = "//img/parent::*/attribute::href"

    # NOTE(review): "[EMAIL PROTECTED]" is what the list archive's address
    # obfuscation left behind; the original element/attribute predicate is
    # not recoverable from this text and must be restored before this
    # query can be a valid XPath expression.
    q = "//[EMAIL PROTECTED]'cos_search1']/@action"

    t1 = d.xpath(q)
    print("href =  %s" % (t1,))

    # Guard against an empty result: indexing t1[0] would otherwise raise
    # IndexError instead of saying what went wrong.
    if not t1:
        print("no nodes matched: %s" % q)
        sys.exit(1)

    first = t1[0]
    print("hnode = %s" % first.nodeValue)
    print("htest = %s" % first.textContent)
    print("htesttt = %s" % first.toString())

    sys.exit()

thanks!!


-----Original Message-----
From: [EMAIL PROTECTED]
[mailto:[EMAIL PROTECTED] Behalf
Of Fredrik Lundh
Sent: Saturday, August 23, 2008 5:58 AM
To: [email protected]
Subject: Re: xpath questions...


bruce wrote:

> Regarding the xpath question I've posed, some have said that it shouldn't be
> here on the mailing list. Given that I'm writing the test scripts/apps in
> python, using the python libs, where else should it be posted?
>
> I mean, I could post the entire sample script so you can see that it's using
> python, but I simplified the issue.

there was zero Python content left after the simplification.  maybe you
should at least mention what library you're using to "play around with
xpath and the html dom" ?

</F>

--
http://mail.python.org/mailman/listinfo/python-list

--
http://mail.python.org/mailman/listinfo/python-list

RE: xpath questions...

Reply via email to