http://www.mediawiki.org/wiki/Special:Code/MediaWiki/56768

Revision: 56768
Author:   avar
Date:     2009-09-22 16:51:03 +0000 (Tue, 22 Sep 2009)

Log Message:
-----------
After we de-serialize the PHP data on disk we end up with a Perl hash
with UTF-8 data but none of the scalars have UTF-8 markers indicating
that.

That results in YAML::Syck dumping ugly YAML because it thinks we have
raw bytes instead of proper UTF-8.

Recurse through the hash and fix that. To reproduce this issue try doing:
    
    # php -r '$a = array("a key" => "a value", "a kíe" => "a valjúe"); echo serialize($a), "\n";' > serialized
    # ^^ this yields qq[a:2:{s:5:"a key";s:7:"a value";s:6:"a kíe";s:9:"a valjúe";}] in 'serialized'
    # cat serialized | perl load-and-dump.pl
    # SAME: # cat serialized | perl -Cio load-and-dump.pl
    
    use feature ':5.10';
    use strict;
    use warnings;
    
    use YAML::Syck 'Dump';
    use PHP::Serialization 'unserialize';
    use File::Slurp 'slurp';
    use Data::Dump 'dump';
    use Encode 'decode';
    
    # Read the serialized PHP data from STDIN and turn it into a Perl hash.
    my $serialized = join '', <STDIN>;
    my $unserialized = unserialize($serialized);
    my $unserialized_utf8;
    
    # Build a second, flat copy of the hash in which both keys and values
    # have been decoded from UTF-8 bytes into Perl character strings.
    while (my ($k, $v) = each %$unserialized) {
        my $d_k = decode "utf-8", $k;
        my $d_v = decode "utf-8", $v;
    
        $unserialized_utf8->{$d_k} = $d_v;
    }
    
    # The raw serialized string.
    say Dump $serialized;
    # "a:2:{s:5:\"a key\";s:7:\"a value\";s:6:\"a kíe\";s:9:\"a valjúe\";}\n"
    
    # Undecoded hash: the UTF-8 bytes are dumped as \x escapes.
    say Dump $unserialized;
    # a key: a value
    # "a k\xC3\xADe": "a valj\xC3\xBAe"
    
    # Decoded hash: the characters are dumped as-is.
    say Dump $unserialized_utf8;
    # a key: a value
    # a kíe: a valjúe

Or this, which iterates recursively:
    
    # perl load-and-dump.pl serialized
    
    use feature ':5.10';
    use strict;
    use warnings;
    
    use YAML::Syck 'Dump';
    use PHP::Serialization 'unserialize';
    use File::Slurp 'slurp';
    
    # The file holding the serialized PHP data is named by the first
    # command-line argument.
    my $path = shift @ARGV;
    my $php_bytes = slurp($path);
    my $structure = unserialize($php_bytes);
    
    # Rebuild the structure with every string decoded from UTF-8 ...
    my $decoded = deutf8($structure);
    
    # ... and print it as YAML.
    say Dump $decoded;
    
    # Decodes the values of $hash from UTF-8, in place, recursing into
    # nested hashes. Doesn't touch keys (how does that work anyway with
    # keys being char*?!)
    #
    # Only defined, non-reference values are decoded: undef and references
    # other than hashes (e.g. arrays) are left exactly as they are, so
    # utf8::decode never sees anything but a plain string.
    #
    # Arguments:
    #   $hash - hash reference to modify in place
    #   @path - keys leading from the top-level hash down to $hash; only
    #           threaded through the recursion, not otherwise used
    sub iterate_and_mark_utf8
    {
        my ($hash, @path) = @_;
            
        for my $k (keys %$hash)
        {
            my $type = ref $hash->{$k};

            if ($type eq 'HASH')
            {
                iterate_and_mark_utf8($hash->{$k}, @path, $k);
            }
            elsif (!$type && defined $hash->{$k})
            {
                # Operates on the hash slot itself so the change is in place.
                utf8::decode($hash->{$k});
            }
        }

        return;
    }
    
    # Returns a copy of the given data with every string decoded from UTF-8.
    #
    #   hash ref   -> new hash ref; keys and values are both processed
    #   array ref  -> new array ref; every element is processed
    #   other ref  -> returned unchanged (nothing to decode)
    #   undef      -> returned as undef
    #   plain str  -> a decoded copy; the caller's scalar is not modified
    sub deutf8 {
        my ($data) = @_;
        my $type = ref $data;

        if ($type eq "HASH") {
            # Flattening the hash hands keys and values alike to deutf8.
            my %decoded = map { deutf8($_) } %{$data};
            return \%decoded;
        }

        if ($type eq "ARRAY") {
            return [ map { deutf8($_) } @{$data} ];
        }

        # Keep other references and undef as they are instead of letting
        # utf8::decode treat them as strings.
        return $data if $type or not defined $data;

        my $s = $data;
        utf8::decode($s);
        return $s;
    }

Modified Paths:
--------------
    trunk/extensions/Translate/utils/TranslateYaml.php

Modified: trunk/extensions/Translate/utils/TranslateYaml.php
===================================================================
--- trunk/extensions/Translate/utils/TranslateYaml.php  2009-09-22 16:50:09 UTC 
(rev 56767)
+++ trunk/extensions/Translate/utils/TranslateYaml.php  2009-09-22 16:51:03 UTC 
(rev 56768)
@@ -69,9 +69,23 @@
                file_put_contents( $tf, $sdata );
 
                $cmd = "perl -MYAML::Syck=DumpFile 
-MPHP::Serialization=unserialize -MFile::Slurp=slurp -wle '" .
-                      "my \$serialized = slurp(\"$tf\");" .
-                      "my \$unserialized = unserialize(\$serialized);" .
-                          "DumpFile(q[$tf.yaml], \$unserialized);' 2>&1";
+                          '$YAML::Syck::Headless = 1;' .
+                          '$YAML::Syck::SortKeys = 1;' .
+                          'my $tf = q[' . $tf . '];' .
+                      'my $serialized = slurp($tf);' .
+                      'my $unserialized = unserialize($serialized);' .
+                          'my $unserialized_utf8 deutf8($unserialized);' .
+                          'DumpFile(qq[$tf.yaml], $unserialized_utf8);' .
+                          'sub deutf8 {' .
+                              'if(ref($_[0]) eq "HASH") {' .
+                                  'return { map { deutf8($_) } %{$_[0]} };' .
+                               '} else {' .
+                                   'my $s = $_[0];' .
+                                   'utf8::decode($s);' .
+                                   'return $s;' .
+                               '}' .
+                           '}' .
+                           ' 2>&1';
                $out = wfShellExec( $cmd, &$ret );
                if ( $ret != 0 ) {
                        wfDebugDieBacktrace("The command '$cmd' died in 
execution with exit code '$ret': $out");



_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to