The following commit has been merged in the master branch:
commit c8b97181e1b78508c9264a559368b9dca6540f77
Author: Niels Thykier ni...@thykier.net
Date: Sat Apr 13 11:00:23 2013 +0200
c/files: Check for files in non-UTF-8 encoding
Signed-off-by: Niels Thykier ni...@thykier.net
diff --git a/checks/files b/checks/files
index 5ba18c8..c77e07b 100644
--- a/checks/files
+++ b/checks/files
@@ -26,7 +26,7 @@ use File::Basename;
use Lintian::Data;
use Lintian::Tags qw(tag);
-use Lintian::Util qw(fail open_gz);
+use Lintian::Util qw(fail is_string_utf8_encoded open_gz);
my $FONT_PACKAGES = Lintian::Data-new ('files/fonts', qr/\s++/);
my $TRIPLETS = Lintian::Data-new ('files/triplets', qr/\s++/);
@@ -242,7 +242,6 @@ if (!$is_dummy) {
# Read package contents...
foreach my $file ($info-sorted_index) {
-next if $file eq '';
my $index_info = $info-index ($file);
my $owner = $index_info-owner . '/' . $index_info-group;
my $operm = $index_info-operm;
@@ -250,6 +249,10 @@ foreach my $file ($info-sorted_index) {
$arch_dep_files = 1 if $file !~ m,^usr/share/,o $file ne 'usr/';
+if (!is_string_utf8_encoded($file)) {
+tag 'file-name-is-not-valid-UTF-8', $file;
+}
+
if ($index_info-is_hardlink) {
my $link_target_dir = $link;
$link_target_dir =~ s,[^/]*$,,;
diff --git a/checks/files.desc b/checks/files.desc
index 0e47660..12d571a 100644
--- a/checks/files.desc
+++ b/checks/files.desc
@@ -1304,3 +1304,13 @@ Tag: dir-or-file-in-build-tree
Severity: serious
Certainty: possible
Info: Your package install file in our build tree.
+
+Tag: file-name-is-not-valid-UTF-8
+Severity: normal
+Certainty: certain
+Ref: #701081
+Info: The file name does not appear to be valid UTF-8. This may become
+ a requirement in future Policy versions.
+ .
+ Note that Lintian may be unable to display the filename accurately.
+ Unprintable characters may have been replaced.
diff --git a/debian/changelog b/debian/changelog
index 8205368..c72abfa 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -11,6 +11,7 @@ lintian (2.5.12) UNRELEASED; urgency=low
- dir-or-file-in-build-tree
- dir-or-file-in-etc-opt
- dir-or-file-in-home
+ - file-name-is-not-valid-UTF-8
- font-adobe-copyrighted-fragment-no-credit
- font-package-not-multi-arch-foreign
- illegal-runtime-test-name
@@ -94,6 +95,9 @@ lintian (2.5.12) UNRELEASED; urgency=low
spotting it. (Closes: #699452)
+ [NT] Add patch from Bastien Roucariès to check for another
adobe font license issues. (Closes: #705175)
++ [NT] Test for use of file names that contain invalid
+ UTF-8 byte sequences. Thanks to Helmut Grohne for the
+ suggestion. (Closes: #704446)
* checks/init.d:
+ [NT] Fix regression where Lintian would not properly match
init.d passed to update-rc.d. Thanks to Michael Meskes for
diff --git a/lib/Lintian/Util.pm b/lib/Lintian/Util.pm
index df7c093..9c9fd4f 100644
--- a/lib/Lintian/Util.pm
+++ b/lib/Lintian/Util.pm
@@ -57,6 +57,7 @@ BEGIN {
get_file_checksum
slurp_entire_file
file_is_encoded_in_non_utf8
+ is_string_utf8_encoded
fail
strip
lstrip
@@ -712,6 +713,29 @@ sub get_file_checksum {
return $digest-hexdigest;
}
+=item is_string_utf8_encoded(STRING)
+
+Returns 1 if STRING has no ISO-2022 escape sequence and decodes as valid UTF-8, else 0.
+
+=cut
+
+sub is_string_utf8_encoded {
+my ($str) = @_;
+if ($str =~ m,\e[-!\$%()*+./],) {
+# ISO-2022 escape sequence found; report the string as not UTF-8
+return 0;
+}
+eval {
+Encode::decode('UTF-8', $str, Encode::FB_CROAK);
+};
+if ($@) {
+# decode died inside the eval, so $str is not valid UTF-8
+return 0;
+}
+# no escape sequence and the decode succeeded
+return 1;
+}
+
=item file_is_encoded_in_non_utf8 (...)
Undocumented
@@ -726,15 +750,7 @@ sub file_is_encoded_in_non_utf8 {
or fail(failure while checking encoding of $file for $type package
$pkg);
my $line = 0;
while ($fd) {
-if (m,\e[-!\$%()*+./],) {
-# ISO-2022
-$line = $.;
-last;
-}
-eval {
-$_ = Encode::decode('UTF-8', $_, Encode::FB_CROAK);
-};
-if ($@) {
+if (!is_string_utf8_encoded($_)) {
$line = $.;
last;
}
diff --git a/t/tests/files-general/debian/debian/rules
b/t/tests/files-general/debian/debian/rules
index 1806e1d..e5e7c6e 100755
--- a/t/tests/files-general/debian/debian/rules
+++ b/t/tests/files-general/debian/debian/rules
@@ -13,6 +13,10 @@ override_dh_install:
touch $(tmp)/usr/share/foo/'*'
touch $(tmp)/usr/share/foo/'ws '
touch $(tmp)/usr/share/foo/.nfs-fake-tmpfile
+ # If the following line gets messed up, it can be
+ # restored with something like:
+ # sed -i 's/@FILE@/bokm\xe5l/'
+ touch $(tmp)/usr/share/foo/bokm�l
touch