I'm using XML::Parser with start, end and char handlers. When I call the parse routine it seems that the "special" characters found in the <headline_text> is confusing the parser. Only one or two articles have this, but one rotton apple is spoiling the bunch. Since I'm going through multiple articels, and pushing certain items into arrays this is throwing off my associations. I'm getting urls paired up with the wrong headlines etc. I'd appreciate any suggestions. byron wise THIS IS AN EXAMPLE OF THE XML I'M PARSING - <article id="_23300891"> <url>http://c.moreover.com/click/here.pl?x23300886</url> <headline_text>'Dot-info' domain manager to boot cybersquatters</headline_text> <source>CNET</source> <media_type>text</media_type> <cluster>moreover...</cluster> <tagline /> <document_url>http://news.cnet.com/investor/news/investor-fd/0-9900-1000-0.html</document_url> <harvest_time>Aug 15 2001 4:48AM</harvest_time> <access_registration /> <access_status /> </article> HERE IS THE SCRIPT I'M RUNNING #!/usr/local/bin/perl -w use strict; use LWP::UserAgent; use HTTP::Request; use XML::Parser; my (%XMLHash, $file, $url, @url, @headline_text, @source, @document_url, @harvest_time, $headline_text, $source, $document_url, $harvest_time, $TagStatus); my $globalFlag=0; # Create a user agent object my $ua = new LWP::UserAgent; $ua->agent("DeclareIt/1.1 " . $ua->agent); my $p1 = new XML::Parser(Handlers => { Start => \&handle_start, End => \&handle_end, Char => \&handle_char}); my @requests = ( 'http://p.moreover.com/cgi-local/page?c=Cyberculture%20news&o=xml'); foreach ( @requests ) { # Create a request my $req = new HTTP::Request GET => $requests[0]; $req->header('Accept' => 'text/html'); # Pass request to the user agent and get a response back my $res = $ua->request($req); # Check the outcome of the response if ($res->is_success) { my $file = $res->content; # Get the file $p1->parse($file); # Parse it %XMLHash = ( 'url' => \@url, 'headline_text' => \@headline_text, 'source' => \@source, 'document_url' => \@document_url, 'harvest_time' => \@harvest_time ); my $i=0; while($XMLHash{'url'}->[$i]) { print qq~URL: $XMLHash{'url'}->[$i]\n~; print qq~HeadLineText: $XMLHash{'headline_text'}->[$i]\n~; print qq~Source: $XMLHash{'source'}->[$i]\n~; print qq~DocumentURL: $XMLHash{'document_url'}->[$i]\n~; print qq~HarvestTime: $XMLHash{'harvest_time'}->[$i]\n~; print qq~\n~; $i++; } #print qq~URL: $XMLHash{'url'}\nHeadlineText: $XMLHash{'headline_text'}\nSource: $XMLHash{'source'}\nDocumentURL: $XMLHash{'document_url'}\nHarvestTime: $XMLHash{'harvest_time'}\n\n\n~; } # end if } # end foreach exit; sub handle_start() { my ($expat, $element) = @_; $TagStatus = $element; #print"$TagStatus\n"; } sub handle_end() { my ($expat, $element) = @_; $TagStatus = ''; } sub handle_char() { my ($expat, $element) = @_; if ($TagStatus eq 'url') { #print "$element\n"; $url = $element; push(@url, $element); } if ($TagStatus eq 'headline_text') { print "HEADLINE: $element\n"; $headline_text = $element; push(@headline_text, $element); } if ($TagStatus eq 'source') { #print "$element\n"; $source = $element; push(@source, $element); } if ($TagStatus eq 'document_url') { #print "$element\n"; $document_url = $element; push(@document_url, $element); } if ($TagStatus eq 'harvest_time') { #print "$element\n"; $harvest_time = $element; push(@harvest_time, $element); } } _______________________________________________________ http://inbox.excite.com _______________________________________________ Perl-Unix-Users mailing list. To unsubscribe go to http://listserv.ActiveState.com/mailman/subscribe/perl-unix-users