Re: [cgiapp] Problem displaying French, sometimes

Mike Tonks Tue, 09 Sep 2008 03:18:32 -0700

Hi Ron,

Don't know if this will come through correctly via email - I'll mail
you direct with file as an attachment - but I ended up writing a
simple perl script to run from the command line to test my utf8 data
display and retrieval from the database.  You will need to alter the
database calls to match your own setup.  You can ignore the last bit
that attempt to locate and fix corrupt data in the db.


In summary - I found I did not need encode or decode functions, but
did need to 'use utf8' and binmode utf8, and when run as cgi or CA
need to ensure all correct utf8 headers are sent to browser.

mike

#!/usr/bin/perl -w

use utf8; #Explicitly allow utf8 in our script - this is critical

binmode STDOUT, ":utf8"; # Explicitly output utf8 - this is critical

use DBI qw(:utils); # enable extra DBI debug functions

use Encode;
use Config::Auto;
use Getopt::Easy;

use Data::Dumper;

my $DELETE_MESSAGES = 0;

my $config = Config::Auto::parse("../../web/cgi-bin/BookBank/config.pm");

#warn "db_connect: " . join "|", @{$config->{db_connect}};

my $dbh = DBI->connect(@{$config->{db_connect}}) or die "Database
connection failed";

get_options "e-errors f-fix= p-poetry v-verbose D-debug", "usage =>
usage: prog [-e] [-f] [-p] [-D]";


my $ben = "Benützername";

my $sed = "Se déconnecter";

my $ver = "Verifié ☺";

print "Test phrases - display and concatenation \n";

print "$ben [". data_string_desc($ben) ."]\n";

print "$sed [". data_string_desc($sed) ."]\n";

print "$ver [". data_string_desc($ver) ."]\n";

print "$ben $sed $ver [". data_string_desc("$ben $sed $ver") ."]\n";

print "\n";

print "Additional Test data - various languages [use -p if you want poetry]\n";

foreach my $test (@{&test_data()}) { print $test . "
[".data_string_desc("$test")."]\n"; }

print "\n";

#Get utf8 data from file
open (DATA, "<:utf8", "test_utf8.txt") or die("Could not open file!");
foreach my $test (<DATA>) { print $test; }

if ($O{poetry}) { foreach my $test (@{&poetry()}) { print $test . "\n\n"; } }

print "\n";

print "Forced Incorrect Encoding \n";

my $double_coded = encode_utf8($sed);

my $triple_coded = encode_utf8($double_coded);

my $reverse = decode_utf8($sed);

print "Double Encoded: $sed - " . $double_coded . "
[".data_string_desc($double_coded)."]\n";

print "Triple Encoded: $sed - " . $triple_coded . "
[".data_string_desc($triple_coded)."]\n";

print "Reverse Encoded: $sed - " . $reverse . "
[".data_string_desc($reverse)."]\n";

print "\n";

print "From Database [table language_test_utf8]\n";

my $test = getLangTest($dbh);

foreach my $row (@$test) {

        print $row->{Text} . " [".data_string_desc($row->{Text})."]\n";
}

print "\n";


print "From Languages Table \n";

$test = getLangTerm($dbh);

foreach my $row (@$test) {

        print $row->{Text} . " [".data_string_desc($row->{Text})."]\n";
}

print "\n";

unless ($O{errors}) {
        print "Done initial tests, use -e [errors] to check for errors and -f
yes [fix] to try to fix errors in db \n";
        exit;
}

# à â ç é è ê ë î ï ô û ù ü ÿ

my $char = "é";

my $err = encode_utf8($char);

print "Looking for errors in database [ $char ] [ $err ] \n";

my $dbl_err = encode_utf8($err);

print "Test: [ $dbl_err ] \n";

$test = getLangMatch($dbh, $err);

foreach my $row (@$test) {

        # Perl / mysql seems to handle this, but it's not technically correct
        # Gives 'wide character in print' warning
        print "Raw: $row->{Text} \n";

        my $enc = encode_utf8($row->{Text});

        # This is the correct way to do it
        print "Enc: $enc \n";

        # Adding the correct and incorrect string together causes the encoded
string to be mangled
        # Gives 'wide character in print' warning
        print "Mangled: " . $enc . " [ $row->{Text} ] \n";

        my $err2 = $err;
        $err2 =~ s/Ã/ÃƒÂ/g;

        my $fix = $enc;
        $fix =~ s/$err/$char/g;
        $fix =~ s/$dbl_err/$char/g;
        $fix =~ s/$err2/$char/g;

        # Fixed - works for $err but not $dbl_err ?
        print "Fixed?: $fix \n";

}

# We can do a similar fix via the database, using mysql replace function
foreach $char ( qw/à â ç é è ê ë î ï ô û ù ü ÿ À Â Ä È É Ê Ë Î Ï Ô Œ Ù
Û Ü Ÿ/ ) {

        $err = encode_utf8($char);

        my $err2 = $err;
        $err2 =~ s/Ã/ÃƒÂ/g;

        print "char: $char [ $err ] [ $err2 ] \n";

        if ($O{fix} eq "yes") {

                print "Running fix on database \n";
                doLangMatchFix($dbh, $err, $char);
        }

        my $rows = getLangMatchFix($dbh, $err, $char);

        if (scalar (@$rows) > 0) {

                foreach my $row (@$rows) {

                        print "Fixed?: ".encode_utf8($row->{Text})." >>
".encode_utf8($row->{Fixed})." \n";
                }

        } else {

                print "No errors found - checking real data \n";

                my $rows = getLangMatch($dbh, $char);

                foreach my $row (@$rows) {

                        print "OK? [ $char ]: ".encode_utf8($row->{Text})." \n";
                }
        }
}

exit;

###########################################################

sub getLangTest
{
        my $dbh = shift;

        return $dbh->selectall_arrayref("select * from language_test_utf8", {
Slice => {} });
}

sub getLangTerm
{
        my ($dbh, $term) = @_;

        $term = 'test_utf8' unless $term;

        return $dbh->selectall_arrayref("select l.Language, lo.Text from
languages l, languagesoutput lo where l.LanguageID = lo.LanguageID and
LanguageMessageLookupID in (select LanguageMessageLookupID from
languagemessagelookup where LanguageMessageLookup = ?) order by
LanguageOutputID desc limit 0,3;", { Slice => {} }, $term);
}

sub getLangMatch
{
        my ($dbh, $match) = @_;

        $match = "%$match%";

        return $dbh->selectall_arrayref("select LanguageOutputID, Text from
languagesoutput where Text like ? limit 0, 10", { Slice => {} },
$match);
}

sub getLangMatchFix
{
        my ($dbh, $err, $replace) = @_;

        my $err2 = $err;
        $err2 =~ s/Ã/ÃƒÂ/g;

        my $match = "%$err%";
        my $match2 = "%$err2%";

        #return $dbh->selectall_arrayref("select LanguageOutputID, Text,
REPLACE(REPLACE(Text, ?, ?), ?, ?) as Fixed from languagesoutput where
Text like ? limit 0, 20", { Slice => {} }, $err2, $replace, $err,
$replace, $match);
        return $dbh->selectall_arrayref("select LanguageOutputID, Text,
REPLACE(Text, ?, ?) as Fixed from languagesoutput where Text like ?
limit 0, 10", { Slice => {} }, $err, $replace, $match);

}

sub doLangMatchFix
{
        my ($dbh, $err, $replace) = @_;

        my $err2 = $err;
        $err2 =~ s/Ã/ÃƒÂ/g;

        my $match = "%$err%";
        my $match2 = "%$err2%";

#       $dbh->do("update languagesoutput set Text = REPLACE(REPLACE(Text, ?,
?), ?, ?) where Text like ?", { Slice => {} }, $err2, $replace,
$search, $replace, $match);

        my $sql = "update languagesoutput set Text = REPLACE(Text, '$err',
'$replace') where Text like '$match'";
        my $sql2 = "update languagesoutput set Text = REPLACE(Text, '$err2',
'$replace') where Text like '$match2'";

        print "SQL: $sql \n";
        print "SQL: $sql2 \n";

        $dbh->do($sql);
        $dbh->do($sql2);
}


sub test_data
{
        return [
                "Czech and Slovak characters: š  ť  ž  ľ  č  ě ď ň ř ů ĺ Š Ť Ž 
Ľ Č Ě
Ď Ň Ř Ů Ĺ",
                "Polish characters: ł ą ż ę ć ń ś ź Ł Ą Ż Ę Ć Ń Ś Ź",
                "Romanian characters: Ă ă Ş ş Ţ ţ",
                "Croatian and Slovenian characters: š č ž ć đ Š Č Ž Ć Đ",
                "Hungarian characters: Ő ő Ű ű",
                "German characters: Ä, ä, Ö, ö, Ü, ü, ß",
                "Russian alphabet: абвгдеёжзийклмнопрстуфхцчшчьыъэюя
АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯ",
                "Special Byelorussian and Ukrainian characters: Ў ў Є є Ґ ґ",
                "Special Serbian and Macedonian characters: Ђ Љ Њ Ћ Џ ђ љ њ ћ 
џ",
                "Arabic: ب‎ج‎د‎ﻫ‎و‎ز‎ح‎ط‎ع‎ف‎ص‎ق‎ر‎ش‎ت‎ث‎خ‎ذ‎ض‎ظ‎غ‎"
        ];
}

sub poetry
{
        return [
                "From the Anglo-Saxon Rune Poem (Rune version): 
ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ
    ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ
    ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬",
        "From Laȝamon's Brut (The Chronicles of England, Middle English,
West Midlands):

    An preost wes on leoden, Laȝamon was ihoten
    He wes Leovenaðes sone -- liðe him be Drihten.
    He wonede at Ernleȝe at æðelen are chirechen,
    Uppen Sevarne staþe, sel þar him þuhte,
    Onfest Radestone, þer he bock radde. ",
        "From the Tagelied of Wolfram von Eschenbach (Middle High German):

    Sîne klâwen durh die wolken sint geslagen,
    er stîget ûf mit grôzer kraft,
    ich sih in grâwen tägelîch als er wil tagen,
    den tac, der im geselleschaft
    erwenden wil, dem werden man,
    den ich mit sorgen în verliez.
    ich bringe in hinnen, ob ich kan.
    sîn vil manegiu tugent michz leisten hiez.",
    "Some lines of Odysseus Elytis (Greek):

    Monotonic:

    Τη γλώσσα μου έδωσαν ελληνική
    το σπίτι φτωχικό στις αμμουδιές του Ομήρου.
    Μονάχη έγνοια η γλώσσα μου στις αμμουδιές του Ομήρου.

    από το Άξιον Εστί
    του Οδυσσέα Ελύτη

    Polytonic:

    Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ
    τὸ σπίτι φτωχικὸ στὶς ἀμμουδιὲς τοῦ Ὁμήρου.
    Μονάχη ἔγνοια ἡ γλῶσσα μου στὶς ἀμμουδιὲς τοῦ Ὁμήρου.

    ἀπὸ τὸ Ἄξιον ἐστί
    τοῦ Ὀδυσσέα Ἐλύτη",
    "The first stanza of Pushkin's Bronze Horseman (Russian):

    На берегу пустынных волн
    Стоял он, дум великих полн,
    И вдаль глядел. Пред ним широко
    Река неслася; бедный чёлн
    По ней стремился одиноко.
    По мшистым, топким берегам
    Чернели избы здесь и там,
    Приют убогого чухонца;
    И лес, неведомый лучам
    В тумане спрятанного солнца,
    Кругом шумел.",

        "Šota Rustaveli's Veṗxis Ṭq̇aosani, ̣︡Th, The Knight in the Tiger's
Skin (Georgian):

    ვეპხის ტყაოსანი შოთა რუსთაველი

    ღმერთსი შემვედრე, ნუთუ კვლა დამხსნას სოფლისა შრომასა, ცეცხლს,
წყალსა და მიწასა, ჰაერთა თანა მრომასა; მომცნეს ფრთენი და აღვფრინდე,
მივჰხვდე მას ჩემსა ნდომასა, დღისით და ღამით ვჰხედვიდე მზისა ელვათა
კრთომაასა. ",
        "Tamil poetry of Cupiramaniya Paarathiyar, சுப்ரமணிய பாரதியார் 
(1882-1921):

    யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்,
    பாமரராய் விலங்குகளாய், உலகனைத்தும் இகழ்ச்சிசொலப் பான்மை கெட்டு,
    நாமமது தமிழரெனக் கொண்டு இங்கு வாழ்ந்திடுதல் நன்றோ? சொல்லீர்!
    தேமதுரத் தமிழோசை உலகமெலாம் பரவும்வகை செய்தல் வேண்டும்."
        ];
}


# insert into test values ('Se déconnecter');
# insert into test values ('Se dÃ(c)connecter');
# insert into test values ('ModifiÃƒÂ(c)');

#select Text, replace(Text, 'Ã(c)', 'é') as Fixed from languagesoutput
where Text like '%Ã(c)%';
#update languagesoutput set Text =  replace(Text, 'Ã(c)', 'é') where
Text like '%Ã(c)%';
#update languagesoutput set Text = REPLACE(Text, 'Ã ', 'à') where Text
like '%Ã %'
#select * from languagesoutput where Text like '%Ã´%';



2008/9/9 Ron Savage <[EMAIL PROTECTED]>:
> Hi Mike
>
> On Mon, 2008-09-08 at 09:23 +0100, Mike Tonks wrote:
>> You got me there.  I'm using mysql with utf8 and this works fine for
>> me.  I tend to agree with Peter that utf8 is the way to go.
>
> I've tried to go the 'utf8' way....
>
> (1) httpd.conf:
> PerlSetEnv   PGCLIENTENCODING UTF8
>
> (2) startup.pl:
> No change
>
> (3) sites.fcgi
> $ENV{'PGCLIENTENCODING'} = 'UTF8';
>
> (4) populate.countries.pl:
> This program does:
> use Locale::SubCountry;
> to load data into Postgres.
>
> $ENV{'PGCLIENTENCODING'} = 'UTF8';
>
> and
>
> # encode destroys its 2nd parameter, so we protect it.
>
> sub my_encode
> {
>        my($name) = @_;
>        $name     =~ s/(.+) \(SEE ALSO.+/$1/;
>
>        return encode('UTF-8', $name, Encode::FB_CROAK);
>
> } # End of my_encode;
>
> Note: See the pod for Encode, and in particular this note:
> UTF-8 vs. utf8 vs. UTF8
>
> (5) Sites.pm:
> This module displays the data:
>
> sub my_decode
> {
>        my($name) = @_;
>
>        return decode('UTF-8', $name, Encode::FB_CROAK);
>
> } # End of my_decode;
>
> (6) Result:
> The symptoms have reversed compared to my earlier msg.
>
> AAAAAAAAAgggggggggghhhhhhhhhh
>
> Now, the mod_perl execution path displays the correct data:
> CÔTE D'IVOIRE
>
> while the fastcgid execution path displays:
> CÃ"TE D'IVOIRE
>
> (7) Buy h-bomb on ebay. kill $self.
> After all, what's the point :-(.
>
> --
> Ron Savage
> [EMAIL PROTECTED]
> http://savage.net.au/index.html
>
>
>
> #####  CGI::Application community mailing list  ################
> ##                                                            ##
> ##  To unsubscribe, or change your message delivery options,  ##
> ##  visit:  http://www.erlbaum.net/mailman/listinfo/cgiapp    ##
> ##                                                            ##
> ##  Web archive:   http://www.erlbaum.net/pipermail/cgiapp/   ##
> ##  Wiki:          http://cgiapp.erlbaum.net/                 ##
> ##                                                            ##
> ################################################################
>
>

#####  CGI::Application community mailing list  ################
##                                                            ##
##  To unsubscribe, or change your message delivery options,  ##
##  visit:  http://www.erlbaum.net/mailman/listinfo/cgiapp    ##
##                                                            ##
##  Web archive:   http://www.erlbaum.net/pipermail/cgiapp/   ##
##  Wiki:          http://cgiapp.erlbaum.net/                 ##
##                                                            ##
################################################################

Re: [cgiapp] Problem displaying French, sometimes

Reply via email to