commit: 811994394c9866f667d4822975038fb03807a325 Author: Kerin Millar <kfm <AT> plushkava <DOT> net> AuthorDate: Fri Aug 15 22:05:49 2025 +0000 Commit: Kerin Millar <kfm <AT> plushkava <DOT> net> CommitDate: Fri Aug 15 22:15:22 2025 +0000 URL: https://gitweb.gentoo.org/proj/locale-gen.git/commit/?id=81199439
mkconfig: don't transliterate paths in map_locale_attributes()

Presently, the map_locale_attributes() subroutine employs a two-stage
pipeline, where grep(1) is used to find lines defining the "language"
and "territory" attributes, with iconv(1) transliterating to US-ASCII.
This is done so as to eliminate diacritics and produce a config that is,
at once, valid US-ASCII and valid UTF-8.

However, there is a potential issue with this approach. Imagine a
scenario in which Gentoo is installed beneath the following prefix.

    /home/gentoo/gentøø-linux

In that case, the pathname will be transliterated to:

    /home/gentoo/gentoo-linux

Thus, the regular expression that separates the pathname from the line
will fail to match.

At first, I contemplated switching to the /usr/share/i18n/locales
directory before grep(1) is executed. To do so would almost certainly
have sufficed, because the names of the files residing in that directory
are unaffected by transliteration, and this will remain the case for the
foreseeable future.

Then I realised that Perl is able to perform transliteration using only
core modules. As such, address this issue by having Perl be directly
responsible for opening and parsing the locale files. For any field
whose value does not consist entirely of bytes in the 0x00 - 0x7F range,
transliterate the diacritics in the following way.

1) Decode the bytes to characters (presuming UTF-8 as the encoding)
2) Convert the characters to the NFKD normal form
3) Strip non-spacing combining marks with a zero advance width

This approach appears adequate, and remains about as fast.

It should be noted that dev-perl/File-Slurper is now a requirement. I
favour this module because its read_binary() subroutine performs an
extremely fast unbuffered 'slurp'. Should this requirement be considered
problematic, it may be removed before the next release is issued.
Signed-off-by: Kerin Millar <kfm <AT> plushkava.net> mkconfig | 73 +++++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/mkconfig b/mkconfig index 5a7251f..d79b3af 100755 --- a/mkconfig +++ b/mkconfig @@ -4,16 +4,17 @@ # consisting only of UTF-8 locales supported by the installed version of glibc, # with comments indicating the languages and territories in plain English. # -# Requires: column(1), grep(1), iconv(1), sh(1) +# Requires: column(1) # # Copyright 2025 Kerin Millar <kfm@plushkava.net> # License GPL-2.0-only <https://spdx.org/licenses/GPL-2.0-only.html> use v5.36; -use File::Spec::Functions qw(catfile); +use Encode qw(decode); +use File::Spec::Functions qw(catdir catfile); +use Unicode::Normalize qw(NFKD); -# Unset BASH_ENV for security reasons. Even as sh(1), bash acts upon it. -delete $ENV{'BASH_ENV'}; +use File::Slurper qw(read_binary); { # The first argument shall be treated as a prefix, if any. @@ -26,8 +27,10 @@ delete $ENV{'BASH_ENV'}; # Gather the language and territory attributes of the locale templates. my $attr_by = map_locale_attributes($prefix); - # Use column(1) to write out a nicely columnated list. - open my $pipe, "| column -t -s \037" or exit 127; + # Use column(1) to write out a nicely columnated list. The encoding is + # applied as a precaution; should any wide characters unexpectedly slip + # through, they shall be backslash-escaped e.g. U+00E5 => "\x{00e5}". 
+ open my $pipe, "|-:encoding(US-ASCII)", "column -t -s \037" or exit 127; while (my $line = readline $fh) { my ($read_locale, $charmap) = split ' ', $line; @@ -56,31 +59,49 @@ delete $ENV{'BASH_ENV'}; } sub map_locale_attributes ($prefix) { - my $top = local $ENV{'TOP'} = catfile($prefix, '/usr/share/i18n', 'locales'); - my @lines = qx{ - grep -E '^(language|territory)[[:blank:]]' /dev/null "\$TOP"/* | - iconv -f UTF-8 -t US-ASCII//TRANSLIT - }; + my $top = catdir($prefix, '/usr/share/i18n/locales'); + opendir(my $dh, $top) or die "Can't open '$top' as a directory: $!"; my $regex = qr/ - \Q$top\E\/([^\/:]+) # basename - : # separates pathname from matching line - (language|territory) # attribute key - \h+ # one or more <blank> characters - "([^"]*)" # attribute value - /x; + ^ + language # attribute key + \h+ # one or more <blank> characters + "([^"]+)" # non-empty attribute value + \n # line break + territory + \h+ + "([^"]*)" # attribute value + $ + /mx; my %attr_by; - for my $line (@lines) { - if ($line =~ m/^${regex}$/) { - my ($locale, $key, $val) = ($1, $2, ucfirst $3); - if ($key eq 'territory') { - if ($val =~ m/^Myanmar/) { - $val = 'Myanmar/Burma'; - } elsif ($val eq 'Turkiye') { - $val = 'Turkey'; + while (my $locale = readdir $dh) { + next if $locale =~ m/^\./; + my $data = read_binary("$top/$locale"); + if ($data =~ $regex) { + my ($language, $territory) = ($1, ucfirst $2); + for my $ref (\$language, \$territory) { + if ($ref->$* =~ m/[^\p{ASCII}]/) { + $ref->$* = to_ascii($ref->$*); } } - $attr_by{$locale}{$key} = $val; + if ($territory =~ m/^Myanmar/) { + $territory = 'Myanmar/Burma'; + } elsif ($territory eq 'Turkiye') { + $territory = 'Turkey'; + } + $attr_by{$locale} = { + 'language' => $language, + 'territory' => $territory + }; } } return \%attr_by; } + +sub to_ascii ($bytes) { + # This behaves similarly to "iconv -f UTF-8 -t US-ASCII//TRANSLIT". At + # least, to a degree that is sufficient for the inputs being processed. 
+ my $chars = decode('UTF-8', $bytes, Encode::FB_CROAK); + $chars = NFKD($chars); + $chars =~ s/\p{NonspacingMark}//g; + return $chars; +}
