This is an automated email from the ASF dual-hosted git repository. tilman pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new 70eb9d06d TIKA-4298: Failed to detect charset for zip entry with short non-Unicode file name (#1903) 70eb9d06d is described below commit 70eb9d06d78e3d8cb399fc461f6ba15d0fc2549e Author: Mingchun Zhao <spring5...@gmail.com> AuthorDate: Thu Aug 15 23:49:35 2024 +0900 TIKA-4298: Failed to detect charset for zip entry with short non-Unicode file name (#1903) TIKA-4298: Failed to detect charset for zip entry with short non-Unicode file name Co-authored-by: Mingchun Zhao <mingchun.z...@rondhuit.com> --- .../java/org/apache/tika/parser/pkg/PackageParser.java | 15 ++++++++++++++- .../org/apache/tika/parser/pkg/PackageParserTest.java | 7 +++++++ .../test-documents/testZipEntryNameCharsetShiftSJIS.zip | Bin 0 -> 330 bytes 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java index 3bb2e1e0c..bfd2a1168 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java @@ -107,6 +107,8 @@ public class PackageParser extends AbstractEncodingDetectorParser { MediaType.set(ZIP, JAR, AR, ARJ, CPIO, DUMP, TAR, SEVENZ); // the mark limit used for stream private static final int MARK_LIMIT = 100 * 1024 * 1024; // 100M + // The number of bytes of entry name to detect charset properly + private static final int MIN_BYTES_FOR_DETECTING_CHARSET = 100; static final Set<MediaType> loadPackageSpecializations() { @@ -443,9 +445,20 @@ public class PackageParser extends AbstractEncodingDetectorParser { //Try to detect charset of archive entry in case of non-unicode filename is used if (detectCharsetsInEntryNames && entry instanceof ZipArchiveEntry) { + // Extend short entry name to improve accuracy of charset detection + byte[] entryName = ((ZipArchiveEntry) entry).getRawName(); + byte[] extendedEntryName = entryName; + if (0 < entryName.length && entryName.length < MIN_BYTES_FOR_DETECTING_CHARSET) { + int len = entryName.length * (MIN_BYTES_FOR_DETECTING_CHARSET / entryName.length); + extendedEntryName = new byte[len]; + for (int i = 0; i < len; i++) { + extendedEntryName[i] = entryName[i % entryName.length]; + } + } + Charset candidate = getEncodingDetector().detect( - UnsynchronizedByteArrayInputStream.builder().setByteArray(((ZipArchiveEntry) entry).getRawName()).get(), + UnsynchronizedByteArrayInputStream.builder().setByteArray(extendedEntryName).get(), parentMetadata); if (candidate != null) { name = new String(((ZipArchiveEntry) entry).getRawName(), candidate); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java index b16f3f5c8..bec188b8d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java @@ -31,4 +31,11 @@ public class PackageParserTest extends TikaTest { List<Metadata> metadataList = getRecursiveMetadata("gbk.zip"); assertContains("审计压缩", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); } + + @Test + public void handleEntryNameWithCharsetShiftJIS() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testZipEntryNameCharsetShiftSJIS.zip"); + assertContains("文章", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertContains("文章", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testZipEntryNameCharsetShiftSJIS.zip b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testZipEntryNameCharsetShiftSJIS.zip new file mode 100644 index 000000000..45595685a Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testZipEntryNameCharsetShiftSJIS.zip differ