hi...
I'm starting to explore XML::LibXML and have a question (the first of many, I'm sure!).
I'm trying to parse the following HTML and get the text that comes after each
<strong></strong> element, but I'm not sure how to access it. I can already get the
information inside the <strong></strong> element itself.
Any help would be useful. Thanks.
the sample html is:
<tr>
<td valign="top" colspan=4 class="sectionheading"
bgcolor="#99ffcc"> ACCT 209 - 501 SURVEY OF ACCT PRIN <br /></td>
</tr>
<tr>
<td valign="top" bgcolor="#99ffcc">
<strong>Instructor:</strong> STASNY M <br />
<strong>Total Seats:</strong> 450<br />
<strong>Available Seats</strong> 066<p />
</td>
<td valign="top" bgcolor="#99ffcc">
NON-BUSINESS, NON-AGRIBUSINESS MAJORS ONLY<br /><br
/>
</td>
<td valign="top" bgcolor="#99ffcc">
<a
href="http://www.tamu.edu/map/building/overview/WCBA.html"> MWF
09:10AM-10:00AM WCBA 159</a><br />
</td>
<td valign="top" bgcolor="#99ffcc">
CR 3
</td>
</tr>
<tr>
<td colspan=4 bgcolor="#e2e2e2" valign="top"
class="sectionheading">
ACCT 209 - 502
SURVEY OF ACCT PRIN
</td>
</tr>
<tr>
<td bgcolor="#e2e2e2" valign="top">
<strong>Instructor:</strong> STASNY M
<br />
<strong>Total Seats:</strong> 325
<br />
<strong>Available Seats</strong> 001<p />
</td>
<td bgcolor="#e2e2e2" valign="top">
NON-BUSINESS, NON-AGRIBUSINESS MAJORS ONLY<br /><br
/>
</td>
<td bgcolor="#e2e2e2" valign="top">
<a
href="http://www.tamu.edu/map/building/overview/KLCT.html"> MWF
12:40PM-01:30PM KLCT 115</a><br />
</td>
<td bgcolor="#e2e2e2" valign="top">
CR 3
</td>
</tr>
<tr>
<td valign="top" colspan=4 class="sectionheading"
bgcolor="#99ffcc"> ACCT 209 - 503 SURVEY OF ACCT PRIN <br /></td>
</tr>
<tr>
<td valign="top" bgcolor="#99ffcc">
<strong>Instructor:</strong> STRAWSER R <br />
<strong>Total Seats:</strong> 450<br />
<strong>Available Seats</strong> 194<p />
</td>
<td valign="top" bgcolor="#99ffcc">
NON-BUSINESS, NON-AGRIBUSINESS MAJORS ONLY<br /><br
/>
</td>
<td valign="top" bgcolor="#99ffcc">
<a
href="http://www.tamu.edu/map/building/overview/WCBA.html"> TR
08:00AM-09:15AM WCBA 159</a><br />
</td>
<td valign="top" bgcolor="#99ffcc">
CR 3
</td>
</tr>
the sample perl that i'm using is:
---------------------------------------------------------------
#!/usr/bin/perl -w
# Fetch a course-section listing, isolate the section table with
# HTML::TreeBuilder, re-parse that fragment with XML::LibXML, and print:
#   * the text of every <td class="sectionheading"> cell, and
#   * for every <strong> label inside a <td>, the label together with the
#     text that immediately follows it (e.g. "Instructor: STASNY M").
use strict;
use warnings;

use HTML::TreeBuilder;
use LWP::UserAgent;
use WWW::Mechanize;
use XML::LibXML;

# Built by concatenation so the query string contains no embedded newline.
my $base_url = 'http://courses.tamu.edu/ViewSections.aspx'
    . '?department=ACCT&term=C&course=209&year=2004&activity=00';

my $ua = LWP::UserAgent->new;
$ua->timeout(30);
$ua->agent("AgentName/0.1 " . $ua->agent);

my $section_url = $base_url;    # testing for now...
my $res         = $ua->get($section_url);
die "GET $section_url failed: " . $res->status_line . "\n"
    unless $res->is_success;

# print $res->content;
my $am_tree = HTML::TreeBuilder->new_from_content($res->content);

# Locate the tables with cellpadding="0"; the script works on the second one.
my @section_tbl = $am_tree->look_down('_tag' => 'table', 'cellpadding' => '0');
#print "html = " . $section_tbl[1]->dump() . "\n";
die "expected at least two tables with cellpadding=0, found "
    . scalar(@section_tbl) . "\n"
    unless defined $section_tbl[1];

my $test_html = $section_tbl[1]->as_HTML();

my $doc = XML::LibXML
    ->new({ recover => 1 })
    ->parse_html_string($test_html);    # or parse_html_file
#print $doc->findvalue('//text()');

# Section headings: every <td> whose class attribute is "sectionheading".
my @nodes = $doc->findnodes('//td[@class="sectionheading"]');
print "cnt = " . scalar(@nodes) . "\n";
for my $heading (@nodes) {
    # normalize-space() trims the padding and line breaks around the title.
    print $heading->findvalue('normalize-space(.)'), "\n";
}

# Labelled values: the value is not inside <strong> but in the text node that
# directly follows it, so step along the following-sibling axis and take the
# first text node found there.
my @nodes2 = $doc->findnodes('//td/descendant::strong');
print "cnt2 = " . scalar(@nodes2) . "\n";
for my $strong (@nodes2) {
    my $label = $strong->textContent;
    my $value = $strong->findvalue(
        'normalize-space(following-sibling::text()[1])');
    print "$label $value\n";
}
thanks...