commit:     945525e71206428f87f94b839fe4ce36b9d85e1e
Author:     Kerin Millar <kfm <AT> plushkava <DOT> net>
AuthorDate: Tue Aug 19 05:21:52 2025 +0000
Commit:     Kerin Millar <kfm <AT> plushkava <DOT> net>
CommitDate: Tue Aug 19 05:21:52 2025 +0000
URL:        https://gitweb.gentoo.org/proj/locale-gen.git/commit/?id=945525e7

mkconfig: always decode the locale files as UTF-8

Inspecting the glibc release tarballs shows that the locale files are
always encoded as UTF-8. With that in mind, use the read_text() function
of File::Slurper so that the bytes are always decoded as UTF-8. In turn,
this simplifies the to_ascii() subroutine, which no longer has to decode
its argument. The ucfirst() call on the territory is also moved to the
point at which it is captured.

Signed-off-by: Kerin Millar <kfm <AT> plushkava.net>

 mkconfig | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/mkconfig b/mkconfig
index 25a8ad6..fa42452 100755
--- a/mkconfig
+++ b/mkconfig
@@ -10,11 +10,10 @@
 # License GPL-2.0-only <https://spdx.org/licenses/GPL-2.0-only.html>
 
 use v5.36;
-use Encode qw(decode);
 use File::Spec::Functions qw(catdir catfile);
 use Unicode::Normalize qw(NFKD);
 
-use File::Slurper qw(read_binary);
+use File::Slurper qw(read_text);
 
 {
        # The first argument shall be treated as a prefix, if any.
@@ -75,9 +74,9 @@ sub map_locale_attributes ($prefix) {
        my %attr_by;
        while (my $locale = readdir $dh) {
                next if $locale =~ m/^\./;
-               my $data = read_binary("$top/$locale");
+               my $data = read_text("$top/$locale");
                if ($data =~ $regex) {
-                       my ($language, $territory) = ($1, $2);
+                       my ($language, $territory) = ($1, ucfirst $2);
                        for ($language, $territory) {
                                if (m/[^\p{ASCII}]/) {
                                        $_ = to_ascii($_);
@@ -94,18 +93,17 @@ sub map_locale_attributes ($prefix) {
                        }
                        $attr_by{$locale} = {
                                'language'  => $language,
-                               'territory' => ucfirst $territory
+                               'territory' => $territory
                        };
                }
        }
        return \%attr_by;
 }
 
-sub to_ascii ($bytes) {
+sub to_ascii ($str) {
        # This behaves similarly to "iconv -f UTF-8 -t US-ASCII//TRANSLIT". At
        # least, to a degree that is sufficient for the inputs being processed.
-       my $chars = decode('UTF-8', $bytes, Encode::FB_CROAK);
-       $chars = NFKD($chars);
-       $chars =~ s/\p{NonspacingMark}//g;
-       return $chars;
+       $str = NFKD($str);
+       $str =~ s/\p{NonspacingMark}//g;
+       return $str;
 }

Reply via email to