commit: f3d88990de718c19996976fbe2bb30bdf58b475e
Author: Kerin Millar <kfm <AT> plushkava <DOT> net>
AuthorDate: Mon Aug 18 01:10:47 2025 +0000
Commit: Kerin Millar <kfm <AT> plushkava <DOT> net>
CommitDate: Mon Aug 18 01:17:05 2025 +0000
URL: https://gitweb.gentoo.org/proj/locale-gen.git/commit/?id=f3d88990
mkconfig: capitalise first letter of territory *after* decoding
Presently, the map_locale_attributes() subroutine uses ucfirst() to
capitalise the first character of the $territory variable. This happens
at the point where its initial value is captured, which is before any
UTF-8 decoding that may occur as a consequence of calling the to_ascii()
subroutine. Instead, ensure that capitalisation can only happen after
decoding, thus avoiding the issue demonstrated by the following program.
#!/usr/bin/perl
# Demonstrates that ucfirst() has no visible effect when applied to a string
# that still holds undecoded UTF-8 bytes rather than characters.
# The US-ASCII output layer makes any non-ASCII value visible as a \x{....}
# escape, so the two printed lines can be compared directly.
binmode *STDOUT, ':encoding(US-ASCII)';
my $territory = 'ïmaginary-territory'; # UTF-8 bytes, not characters!
# Print the value unchanged, then again after ucfirst(), for comparison.
print $territory, "\n";
print ucfirst $territory, "\n";
Running this program produces:
\x{00c3}\x{00af}maginary-territory
\x{00c3}\x{00af}maginary-territory
Had the UTF-8 bytes been decoded to characters first, the program would
have instead produced:
\x{00ef}maginary-territory
\x{00cf}maginary-territory
Signed-off-by: Kerin Millar <kfm <AT> plushkava.net>
mkconfig | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mkconfig b/mkconfig
index 5e54dbb..25a8ad6 100755
--- a/mkconfig
+++ b/mkconfig
@@ -77,7 +77,7 @@ sub map_locale_attributes ($prefix) {
next if $locale =~ m/^\./;
my $data = read_binary("$top/$locale");
if ($data =~ $regex) {
- my ($language, $territory) = ($1, ucfirst $2);
+ my ($language, $territory) = ($1, $2);
for ($language, $territory) {
if (m/[^\p{ASCII}]/) {
$_ = to_ascii($_);
@@ -94,7 +94,7 @@ sub map_locale_attributes ($prefix) {
}
$attr_by{$locale} = {
'language' => $language,
- 'territory' => $territory
+ 'territory' => ucfirst $territory
};
}
}