This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 70eb9d06d TIKA-4298: Failed to detect charset for zip entry with short non-Unicode file name (#1903)
70eb9d06d is described below

commit 70eb9d06d78e3d8cb399fc461f6ba15d0fc2549e
Author: Mingchun Zhao <spring5...@gmail.com>
AuthorDate: Thu Aug 15 23:49:35 2024 +0900

    TIKA-4298: Failed to detect charset for zip entry with short non-Unicode file name (#1903)
    
    TIKA-4298: Failed to detect charset for zip entry with short non-Unicode file name
    
    Co-authored-by: Mingchun Zhao <mingchun.z...@rondhuit.com>
---
 .../java/org/apache/tika/parser/pkg/PackageParser.java   |  15 ++++++++++++++-
 .../org/apache/tika/parser/pkg/PackageParserTest.java    |   7 +++++++
 .../test-documents/testZipEntryNameCharsetShiftSJIS.zip  | Bin 0 -> 330 bytes
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 3bb2e1e0c..bfd2a1168 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -107,6 +107,8 @@ public class PackageParser extends 
AbstractEncodingDetectorParser {
             MediaType.set(ZIP, JAR, AR, ARJ, CPIO, DUMP, TAR, SEVENZ);
     // the mark limit used for stream
     private static final int MARK_LIMIT = 100 * 1024 * 1024; // 100M
+    // The number of bytes of entry name to detect charset properly
+    private static final int MIN_BYTES_FOR_DETECTING_CHARSET = 100;
 
 
     static final Set<MediaType> loadPackageSpecializations() {
@@ -443,9 +445,20 @@ public class PackageParser extends 
AbstractEncodingDetectorParser {
         
         //Try to detect charset of archive entry in case of non-unicode 
filename is used
         if (detectCharsetsInEntryNames && entry instanceof ZipArchiveEntry) {
+            // Extend short entry name to improve accuracy of charset detection
+            byte[] entryName = ((ZipArchiveEntry) entry).getRawName();
+            byte[] extendedEntryName = entryName;
+            if (0 < entryName.length && entryName.length < 
MIN_BYTES_FOR_DETECTING_CHARSET) {
+                int len = entryName.length * (MIN_BYTES_FOR_DETECTING_CHARSET 
/ entryName.length);
+                extendedEntryName = new byte[len];
+                for (int i = 0; i < len; i++) {
+                    extendedEntryName[i] = entryName[i % entryName.length];
+                }
+            }
+
             Charset candidate =
                     getEncodingDetector().detect(
-                            
UnsynchronizedByteArrayInputStream.builder().setByteArray(((ZipArchiveEntry) 
entry).getRawName()).get(),
+                            
UnsynchronizedByteArrayInputStream.builder().setByteArray(extendedEntryName).get(),
                             parentMetadata);
             if (candidate != null) {
                 name = new String(((ZipArchiveEntry) entry).getRawName(), 
candidate);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
index b16f3f5c8..bec188b8d 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
@@ -31,4 +31,11 @@ public class PackageParserTest extends TikaTest {
         List<Metadata> metadataList = getRecursiveMetadata("gbk.zip");
         assertContains("审计压缩", 
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
     }
+
+    @Test
+    public void handleEntryNameWithCharsetShiftJIS() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testZipEntryNameCharsetShiftSJIS.zip");
+        assertContains("文章", 
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertContains("文章", 
metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testZipEntryNameCharsetShiftSJIS.zip
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testZipEntryNameCharsetShiftSJIS.zip
new file mode 100644
index 000000000..45595685a
Binary files /dev/null and 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testZipEntryNameCharsetShiftSJIS.zip
 differ

Reply via email to