I never found a solution to the same problem, so I "hacked" on the header string. Try this:

###############
use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;    # used directly below; do not rely on LWP::UserAgent loading it
use URI;

# Unbuffer both standard handles.  STDOUT is selected last so that it remains
# the default output handle for the bare print at the end of the script
# (selecting STDERR last would silently send the result to STDERR).
select STDERR; $| = 1;
select STDOUT; $| = 1;

my $ua = LWP::UserAgent->new();

my $url = 'http://www.nytimes.com/2004/05/11/business/11place.html?ex=1399608000&en=4a159a499cda7348&ei=5007&partner=USERLAND';

# Fetch the page.  A failed request is reported but not fatal: the script
# still prints whatever charset (possibly none) the response carries.
my $request  = HTTP::Request->new('GET', $url);
my $response = $ua->request($request);
warn 'Request failed: ', $response->status_line, "\n"
    unless $response->is_success;
my $content = $response->content();

# Examine each Content-Type value on its own instead of matching against the
# whole header dump: a pattern run over the full dump can run across line
# breaks and pick up a "charset=" that belongs to a different header.
# In list context header() returns every value of the field, so a response
# carrying more than one Content-Type value is handled too.
my $charset = "";
CONTENT_TYPE:
for my $content_type ($response->header('Content-Type')) {
    next CONTENT_TYPE unless defined $content_type;

    # Accept optional quotes around the value and stop at whitespace, a
    # semicolon or a closing quote, so the capture is the bare charset name.
    if ($content_type =~ m{ ; \s* charset \s* = \s* ["']? ([^\s;"']+) }xi) {
        $charset = $1;
        last CONTENT_TYPE;
    }
}

print "Charset: $charset \n";

#################

./allan


Mitchell, Louise M wrote:


All,

I need to grab the encoding of pages I'm retrieving with
LWP::UserAgent... my perusal of the documentation indicated I could use
the LWP::MediaTypes to get the encoding... it says to pass a URI object
reference and return in array context to get it...I'm not having any
luck...

Below is my code... and below that a snippet from the HTML returned... I
can see that the encoding is iso-8859-1, as indicated in the first meta
tag... but my code is not retrieving it...

Can someone set me straight on how to get this value...

Thanks,
LouiseM

#*****************************
#  Code
#*****************************
#!d:\apps\perl\bin\perl
# Fetches one page with LWP::UserAgent, then calls guess_media_type() on the
# request object and prints whatever that call returns.

use LWP::UserAgent;

# Enable autoflush ($|) on STDOUT and then on STDERR.  STDERR is the handle
# left selected afterwards, so the bare print below writes to STDERR.
select STDOUT; $|=1;
select STDERR; $|=1;
$ua = LWP::UserAgent->new();
# As shown here the quoted literal spans two lines, so the string contains a
# line break inside "&en=" -- presumably a mail line-wrap; TODO confirm
# against the script that was actually run.
$url =
'http://www.nytimes.com/2004/05/11/business/11place.html?ex=1399608000&e
n=4a159a499cda7348&ei=5007&partner=USERLAND';
# Issue the GET and copy the response body into $content ($content is not
# used again in the lines shown).
$request = HTTP::Request->new('GET', $url);
$response = $ua->request($request); $content = $response->content();


use LWP::MediaTypes; # qw(guess_media_type);
# NOTE(review): the argument passed here is the HTTP::Request object, not a
# URI object or file name, and the response headers are never consulted.
# Presumably the "encoding" that guess_media_type reports is a content
# encoding derived from the file-name suffix rather than the character set
# named in the page's Content-Type -- verify against the LWP::MediaTypes docs.
@type = guess_media_type($request); print "** type: @type \n";
#*****************************************
# meta tags from returned HTML
#******************************************


<meta http-equiv="Content-Type" content="text/html;
charset=iso-8859-1">
<meta name="ROBOTS" content="NOARCHIVE">
<meta name="DISPLAYDATE" content="May 11, 2004">
<meta name="hdl" content="Citigroup Assesses a Risk and Decides to
Settle">
<meta name="byl" content="By TIMOTHY L. O&#39;BRIEN">
<meta name="lp" content="Several banks played a role in pitching WorldCom but only Citigroup built an unusually symbiotic relationship.">
<meta name="description" content="Several banks played a role in
pitching WorldCom but only Citigroup built an unusually symbiotic relationship.">
<meta name="cre" content="The New York Times">
<meta name="pdate" content="20040511">
<meta name="ttl" content=""> <meta name="virtloc" content="">
<meta name="des" content="">
<meta name="per" content="Grubman, Jack">
<meta name="org" content="Citigroup Incorporated;WorldCom
Incorporated">
<meta name="geo" content="">
<meta name="ticker" content="Citigroup Incorporated|C|NYSE;WorldCom
Incorporated|WCPMQ,WCPNQ,WCPOQ|other-OTC;J.P. Morgan Chase &#0038;
Company|JPM|NYSE;Bank of America Corporation|BAC|NYSE;Enron
Corporation|ENRNQ|other-OTC">
<meta name="dat" content="May 11, 2004">
<meta name="tom" content="Sidebar">
<meta name="cat" content="">
<meta name="col" content="Market Place">
<meta name="dsk" content="Business">
<meta name="articleid" content="1084179629750">
<meta NAME="ARTICLE_TEMPLATE_VERSION" CONTENT="500">


Reply via email to