libxml questions...

bruce Fri, 04 Jun 2004 16:36:01 -0700

hi...

starting to explore libxml.. i have a question (the 1st of many, i'm sure!!)


I'm trying to parse the following HTML and get the text that comes right
after each <strong></strong> element (for example, the "STASNY M" that
follows <strong>Instructor:</strong>), but I'm not sure how to access it.
I can already get the text inside the <strong></strong>.

any help would be useful... thanks...

the sample html is:


                <tr>
                    <td valign="top" colspan=4 class="sectionheading"
bgcolor="#99ffcc"> ACCT 209 - 501 SURVEY OF ACCT PRIN     <br /></td>
                </tr>
                <tr>
                    <td valign="top" bgcolor="#99ffcc">
                        <strong>Instructor:</strong> STASNY M       <br />
                        <strong>Total Seats:</strong> 450<br />
                        <strong>Available Seats</strong> 066<p />
                    </td>
                    <td valign="top" bgcolor="#99ffcc">
                        NON-BUSINESS, NON-AGRIBUSINESS MAJORS ONLY<br /><br
/>



                    </td>
                    <td valign="top" bgcolor="#99ffcc">
                         <a
href="http://www.tamu.edu/map/building/overview/WCBA.html"> MWF
09:10AM-10:00AM WCBA 159</a><br />


                    </td>
                    <td  valign="top" bgcolor="#99ffcc">
                        CR 3
                    </td>
                </tr>


                <tr>
                    <td colspan=4 bgcolor="#e2e2e2" valign="top"
class="sectionheading">
                        ACCT 209 - 502
                        SURVEY OF ACCT PRIN
                    </td>
                </tr>
                <tr>
                    <td  bgcolor="#e2e2e2" valign="top">
                        <strong>Instructor:</strong> STASNY M
                        <br />
                        <strong>Total Seats:</strong> 325
                        <br />
                        <strong>Available Seats</strong> 001<p />
                    </td>
                    <td bgcolor="#e2e2e2" valign="top">
                        NON-BUSINESS, NON-AGRIBUSINESS MAJORS ONLY<br /><br
/>



                    </td>
                    <td bgcolor="#e2e2e2" valign="top">
                                                 <a 
href="http://www.tamu.edu/map/building/overview/KLCT.html"> MWF
12:40PM-01:30PM KLCT 115</a><br />



                    </td>
                    <td bgcolor="#e2e2e2" valign="top">
                        CR 3
                    </td>
                </tr>

                <tr>
                    <td valign="top" colspan=4 class="sectionheading"
bgcolor="#99ffcc"> ACCT 209 - 503 SURVEY OF ACCT PRIN     <br /></td>
                </tr>
                <tr>
                    <td valign="top" bgcolor="#99ffcc">
                        <strong>Instructor:</strong> STRAWSER R     <br />
                        <strong>Total Seats:</strong> 450<br />
                        <strong>Available Seats</strong> 194<p />
                    </td>
                    <td valign="top" bgcolor="#99ffcc">
                        NON-BUSINESS, NON-AGRIBUSINESS MAJORS ONLY<br /><br
/>


                    </td>
                    <td valign="top" bgcolor="#99ffcc">
                         <a
href="http://www.tamu.edu/map/building/overview/WCBA.html"> TR
08:00AM-09:15AM WCBA 159</a><br />

                    </td>
                    <td  valign="top" bgcolor="#99ffcc">
                        CR 3
                    </td>
                </tr>




the sample perl that i'm using is:
---------------------------------------------------------------
#!/usr/bin/perl -w
#
# Fetch the course-section listing page, extract one table from it with
# HTML::TreeBuilder, re-parse that table with XML::LibXML, and print the
# section-heading cells and the <strong> labels found inside it.

use strict;
use warnings;

use HTML::TreeBuilder;
use LWP::UserAgent;
use WWW::Mechanize;
use XML::LibXML;

# Built by concatenation so that no line break ends up inside the URL
# (a newline inside the quoted string would become part of the request).
my $base_url = 'http://courses.tamu.edu/ViewSections.aspx'
    . '?department=ACCT&term=C&course=209&year=2004&activity=00';

my $ua = LWP::UserAgent->new;
$ua->timeout(30);
$ua->agent('AgentName/0.1 ' . $ua->agent);

my $section_url = $base_url;    # testing for now...
my $req = HTTP::Request->new(GET => $section_url);
my $res = $ua->request($req);

# Stop early with the HTTP status instead of parsing an error page.
die "GET $section_url failed: " . $res->status_line . "\n"
    unless $res->is_success;

# print $res->content;

# Parse the fetched page into an HTML::Element tree.
my $am_tree = HTML::TreeBuilder->new_from_content($res->content);

# Collect every <table cellpadding="0">; the script works on the second
# match (index 1), so make sure it actually exists.
my @section_tbl = $am_tree->look_down('_tag' => 'table', 'cellpadding' => '0');
die 'expected at least 2 matching tables, found ' . scalar(@section_tbl) . "\n"
    if @section_tbl < 2;

# print "html = " . $section_tbl[1]->dump() . "\n";
# die;

my $test_html = $section_tbl[1]->as_HTML();

my $doc = XML::LibXML
    ->new({ recover => 1 })
    ->parse_html_string($test_html);    # or parse_html_file

# print $doc->findvalue('//text()');

# Heading cells such as "ACCT 209 - 501 SURVEY OF ACCT PRIN".
my @nodes = $doc->findnodes('//td[@class="sectionheading"]');

print "cnt = " . scalar(@nodes) . "\n";

# Loop over whatever matched rather than indexing fixed positions, so a
# page with fewer headings does not call a method on undef.
for my $heading (@nodes) {
    print $heading->findvalue('./text()') . "\n";
}

#<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#
#           this is the area i'm having issues/questions about...
#
#<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

my @nodes2 = $doc->findnodes('//td/descendant::strong');

print "cnt2 = " . scalar(@nodes2) . "\n";

# Prints the text inside each <strong> (e.g. "Instructor:").
# NOTE(review): the earlier version mixed findvalue('./text()'),
# textContent and findvalue('.') here; for a <strong> that holds only
# text these are expected to give the same string -- confirm.
for my $label (@nodes2) {
    print $label->findvalue('./text()') . "\n";
}

# Kept from the original: halts the script here with "Died at ...".
die;

thanks...

libxml questions...

Reply via email to