http://www.mediawiki.org/wiki/Special:Code/MediaWiki/56768
Revision: 56768 Author: avar Date: 2009-09-22 16:51:03 +0000 (Tue, 22 Sep 2009) Log Message: ----------- After we de-serialize the PHP data on disk we end up with a Perl hash with UTF-8 data but none of the scalars have UTF-8 markers indicating that. That results in YAML::Syck dumping ugly YAML because it thinks we have raw bytes instead of proper UTF-8. Recurse through the hash and fix that. To reproduce this issue try doing: # php -r '$a = array("a key" => "a value", "a kíe" => "a valjúe"); echo serialize($a), "\n";' > serialized # ^^ this yields qq[a:2:{s:5:"a key";s:7:"a value";s:6:"a kíe";s:9:"a valjúe";}] in 'serialized' # cat serialized | perl load-and-dump.pl # SAME: # cat serialized | perl -Cio load-and-dump.pl use feature ':5.10'; use strict; use warnings; use YAML::Syck 'Dump'; use PHP::Serialization 'unserialize'; use File::Slurp 'slurp'; use Data::Dump 'dump'; use Encode 'decode'; my $serialized = join '', <STDIN>; my $unserialized = unserialize($serialized); my $unserialized_utf8; while (my ($k, $v) = each %$unserialized) { my $d_k = decode "utf-8", $k; my $d_v = decode "utf-8", $v; $unserialized_utf8->{$d_k} = $d_v; } say Dump $serialized; # "a:2:{s:5:\"a key\";s:7:\"a value\";s:6:\"a kíe\";s:9:\"a valjúe\";}\n" say Dump $unserialized; # a key: a value # "a k\xC3\xADe": "a valj\xC3\xBAe" say Dump $unserialized_utf8; # a key: a value # a kíe: a valjúe Or this. Which recursively iterates: # perl load-and-dump.pl serialized use feature ':5.10'; use strict; use warnings; use YAML::Syck 'Dump'; use PHP::Serialization 'unserialize'; use File::Slurp 'slurp'; my $serialized = slurp(shift); my $unserialized = unserialize($serialized); my $unserialized_utf8 = deutf8($unserialized); say Dump $unserialized_utf8; # Just marks hash values as utf8, recursively. Doesn't touch keys (how # does that work anyway with keys being char*?!) 
# Walk a hash reference and run utf8::decode() in place on every value that
# is not itself a hash reference, descending into nested hash references.
# Hash keys are never modified.
#   $hash - hash reference to walk
#   @path - keys followed so far; only extended and handed to the recursive
#           call, never otherwise read here
# No explicit return statement.
sub iterate_and_mark_utf8 {
    my ($hash, @path) = @_;
    while (my ($k, $v) = each %$hash) {
        if (ref $v eq 'HASH') {
            iterate_and_mark_utf8($v, @path, $k);
        } else {
            # Decode through the hash slot rather than $v: $v is a copy made
            # by the list assignment, so decoding it would not change $hash.
            utf8::decode($hash->{$k});
        }
    }
}

# Return a decoded copy of the single argument.
#   hash reference -> a new hash reference built by passing every key and
#                     every value of the original through deutf8()
#   anything else  -> the argument is copied, utf8::decode() is applied to
#                     the copy, and the copy is returned
# NOTE(review): only references reported as HASH are recursed into; any other
# reference kind (e.g. an array reference) takes the else branch -- confirm
# whether the unserialized input can contain those.
sub deutf8 {
    if(ref($_[0]) eq "HASH") {
        # A hash in list context flattens to key/value pairs, so keys and
        # values alike go through deutf8().
        return { map { deutf8($_) } %{$_[0]} };
    } else {
        # Work on a copy so the caller's scalar is left untouched.
        my $s = $_[0];
        utf8::decode($s);
        return $s;
    }
}
Modified Paths: -------------- trunk/extensions/Translate/utils/TranslateYaml.php Modified: trunk/extensions/Translate/utils/TranslateYaml.php =================================================================== --- trunk/extensions/Translate/utils/TranslateYaml.php 2009-09-22 16:50:09 UTC (rev 56767) +++ trunk/extensions/Translate/utils/TranslateYaml.php 2009-09-22 16:51:03 UTC (rev 56768) @@ -69,9 +69,23 @@ file_put_contents( $tf, $sdata ); $cmd = "perl -MYAML::Syck=DumpFile -MPHP::Serialization=unserialize -MFile::Slurp=slurp -wle '" . - "my \$serialized = slurp(\"$tf\");" . - "my \$unserialized = unserialize(\$serialized);" . - "DumpFile(q[$tf.yaml], \$unserialized);' 2>&1"; + '$YAML::Syck::Headless = 1;' . + '$YAML::Syck::SortKeys = 1;' . + 'my $tf = q[' . $tf . '];' . + 'my $serialized = slurp($tf);' . + 'my $unserialized = unserialize($serialized);' . + 'my $unserialized_utf8 deutf8($unserialized);' . + 'DumpFile(qq[$tf.yaml], $unserialized_utf8);' . + 'sub deutf8 {' . + 'if(ref($_[0]) eq "HASH") {' . + 'return { map { deutf8($_) } %{$_[0]} };' . + '} else {' . + 'my $s = $_[0];' . + 'utf8::decode($s);' . + 'return $s;' . + '}' . + '}' . + ' 2>&1'; $out = wfShellExec( $cmd, &$ret ); if ( $ret != 0 ) { wfDebugDieBacktrace("The command '$cmd' died in execution with exit code '$ret': $out"); _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs