I'm using XML::Parser with start, end and char handlers.  When I call the
parse routine it seems that the "special" characters found in the
<headline_text> is confusing the parser.  Only one or two articles have
this, but one rotton apple is spoiling the bunch.  Since I'm going through
multiple articels, and pushing certain items into arrays this is throwing
off my associations.  I'm getting urls paired up with the wrong headlines
etc.

I'd appreciate any suggestions.

byron wise

THIS IS AN EXAMPLE OF THE XML I'M PARSING
- <article id="_23300891">
  <url>http://c.moreover.com/click/here.pl?x23300886</url> 
  <headline_text>&#039Dot-info&#039 domain manager to boot
cybersquatters</headline_text> 
  <source>CNET</source> 
  <media_type>text</media_type> 
  <cluster>moreover...</cluster> 
  <tagline /> 
 
<document_url>http://news.cnet.com/investor/news/investor-fd/0-9900-1000-0.html</document_url>
  <harvest_time>Aug 15 2001 4:48AM</harvest_time> 
  <access_registration /> 
  <access_status /> 
  </article>


HERE IS THE SCRIPT I'M RUNNING

#!/usr/local/bin/perl -w


use strict;
use LWP::UserAgent;
use HTTP::Request;
use XML::Parser;

my (%XMLHash, $file, $url, @url, @headline_text, @source, @document_url,
@harvest_time, $headline_text, $source, $document_url, $harvest_time,
$TagStatus);
my $globalFlag=0;

# Create a user agent object
 
  my  $ua = new LWP::UserAgent;
  $ua->agent("DeclareIt/1.1 " . $ua->agent);


my $p1 = new XML::Parser(Handlers => {  Start         =>  \&handle_start,
                                        End           =>  \&handle_end,
                                        Char          =>  \&handle_char});

  my @requests = (
               
'http://p.moreover.com/cgi-local/page?c=Cyberculture%20news&o=xml');

foreach ( @requests )  {
  # Create a request
  my $req = new HTTP::Request GET => $requests[0];
  $req->header('Accept' => 'text/html');
  
  # Pass request to the user agent and get a response back
  my $res = $ua->request($req);

  # Check the outcome of the response
  if ($res->is_success)  {   
        my $file = $res->content;  # Get the file   
        $p1->parse($file);  # Parse it
  
         %XMLHash = (   'url'           => \@url,
                        'headline_text' => \@headline_text,
                        'source'        => \@source,
                        'document_url'  => \@document_url,
                        'harvest_time'  => \@harvest_time );
        my $i=0;
        while($XMLHash{'url'}->[$i])  {
                print qq~URL: $XMLHash{'url'}->[$i]\n~;
                print qq~HeadLineText: $XMLHash{'headline_text'}->[$i]\n~;
                print qq~Source: $XMLHash{'source'}->[$i]\n~;
                print qq~DocumentURL: $XMLHash{'document_url'}->[$i]\n~;
                print qq~HarvestTime: $XMLHash{'harvest_time'}->[$i]\n~;
                print qq~\n~;
                $i++;
        }

        #print qq~URL: $XMLHash{'url'}\nHeadlineText:
$XMLHash{'headline_text'}\nSource: $XMLHash{'source'}\nDocumentURL:
$XMLHash{'document_url'}\nHarvestTime: $XMLHash{'harvest_time'}\n\n\n~;
  } # end if  
} # end foreach


exit;







sub handle_start() {
        my ($expat, $element) = @_;
        $TagStatus = $element;
        #print"$TagStatus\n";

}


sub handle_end()  {
        my ($expat, $element) = @_;
        $TagStatus = '';
}


sub handle_char()  {
        my ($expat, $element) = @_;

        if ($TagStatus eq 'url')  { #print "$element\n";
                $url = $element;
                push(@url, $element);
        }

        if ($TagStatus eq 'headline_text')  { print "HEADLINE: $element\n";
                $headline_text = $element;
                push(@headline_text, $element);
        }

        if ($TagStatus eq 'source')  { #print "$element\n";
                $source = $element;
                push(@source, $element);
        }

        if ($TagStatus eq 'document_url')  { #print "$element\n";
                $document_url = $element;
                push(@document_url, $element);
        }

        if ($TagStatus eq 'harvest_time')  { #print "$element\n";
                $harvest_time = $element;
                push(@harvest_time, $element);
        }



}





_______________________________________________________
http://inbox.excite.com


_______________________________________________
Perl-Unix-Users mailing list. To unsubscribe go to 
http://listserv.ActiveState.com/mailman/subscribe/perl-unix-users

Reply via email to