[tika] 01/02: TIKA-3332 -- recursively process the embedded file tree in PDFs.

tallison Mon, 22 Mar 2021 11:17:59 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 8bf65c01e174e1ba872e813089b076f21ddb4410
Author: tballison <[email protected]>
AuthorDate: Mon Mar 22 14:17:35 2021 -0400

    TIKA-3332 -- recursively process the embedded file tree in PDFs.
---
 CHANGES.txt                                        |   3 +
 .../src/test/java/org/apache/tika/TikaTest.java    |  15 +++--
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  70 +++++++++++++--------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  11 ++++
 .../testPDF_deeplyEmbeddedAttachments.pdf          | Bin 0 -> 122221 bytes
 5 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 2b9089c..bcc7c5d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.26 - 03/09/2021
 
+   * Extract more embedded files in PDFs by recursively processing the
+     embedded file tree (TIKA-3332).
+
    * Allow for case insensitive headers for configuration of the PDFParser
      and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320).
 
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java 
b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index e21f752..2d0083c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -247,19 +247,22 @@ public abstract class TikaTest {
     }
 
     protected List<Metadata> getRecursiveMetadata(Path path, ParseContext 
context, boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, new 
Metadata(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, 
metadata, suppressException);
         }
     }
     protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, 
boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, parser, new ParseContext(), new 
Metadata(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, parser, new ParseContext(), 
metadata, suppressException);
         }
     }
 
     protected List<Metadata> getRecursiveMetadata(Path p, boolean 
suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(p)) {
-            return getRecursiveMetadata(tis, new ParseContext(), new 
Metadata(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
+            return getRecursiveMetadata(tis, new ParseContext(), metadata, 
suppressException);
         }
     }
     protected List<Metadata> getRecursiveMetadata(Path filePath) throws 
Exception {
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 43526ef..f930c61 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -34,6 +34,7 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.ListIterator;
@@ -136,10 +137,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     };
 
     /**
-     * Maximum recursive depth during AcroForm processing.
-     * Prevents theoretical AcroForm recursion bomb.
+     * Maximum recursive depth to prevent cycles/recursion bombs.
+     * This applies to AcroForm processing and processing
+     * the embedded document tree.
      */
-    private final static int MAX_ACROFORM_RECURSIONS = 10;
+    private final static int MAX_RECURSION_DEPTH = 100;
 
     private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new 
TesseractOCRConfig();
 
@@ -287,32 +289,48 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     private void extractEmbeddedDocuments(PDDocument document)
             throws IOException, SAXException, TikaException {
-            PDDocumentNameDictionary namesDictionary =
-                    new 
PDDocumentNameDictionary(document.getDocumentCatalog());
-            PDEmbeddedFilesNameTreeNode efTree = 
namesDictionary.getEmbeddedFiles();
-            if (efTree == null) {
-                return;
+        PDDocumentNameDictionary namesDictionary =
+                new PDDocumentNameDictionary(document.getDocumentCatalog());
+        PDEmbeddedFilesNameTreeNode efTree = 
namesDictionary.getEmbeddedFiles();
+
+        if (efTree == null) {
+            return;
+        }
+
+        //Set<COSObjectKey> seen = new HashSet<>();
+
+        Map<String, PDComplexFileSpecification> embeddedFileNames = new 
HashMap<>();
+        int depth = 0;
+        //recursively find embedded files
+        extractFilesfromEFTree(efTree, embeddedFileNames, depth);
+        processEmbeddedDocNames(embeddedFileNames);
+    }
+
+    private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String,
+            PDComplexFileSpecification> embeddedFileNames, int depth) throws 
IOException {
+        if (depth > MAX_RECURSION_DEPTH) {
+            throw new IOException("Hit max recursion depth");
+        }
+        Map<String, PDComplexFileSpecification> names = null;
+        try {
+            names = efTree.getNames();
+        } catch (IOException e) {
+            //best effort: skip names that can't be read from this node, still visit kids; TODO: log
+        }
+        if (names != null) {
+            for (Map.Entry<String, PDComplexFileSpecification> e : 
names.entrySet()) {
+                embeddedFileNames.put(e.getKey(), e.getValue());
             }
+        }
 
-        Map<String, PDComplexFileSpecification> embeddedFileNames = 
efTree.getNames();
-        //For now, try to get the embeddedFileNames out of embeddedFiles or 
its kids.
-        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
-        //If there is a need we could add a fully recursive search to find a 
non-null
-        //Map<String, COSObjectable> that contains the doc info.
-        if (embeddedFileNames != null) {
-            processEmbeddedDocNames(embeddedFileNames);
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids = 
efTree.getKids();
+        if (kids == null) {
+            return;
         } else {
-            List<PDNameTreeNode<PDComplexFileSpecification>> kids = 
efTree.getKids();
-            if (kids == null) {
-                return;
-            }
             for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
-                embeddedFileNames = node.getNames();
-                if (embeddedFileNames != null) {
-                    processEmbeddedDocNames(embeddedFileNames);
-                }
+                extractFilesfromEFTree(node, embeddedFileNames, depth+1);
             }
-        }
+       }
     }
 
     private void processDoc(String name, PDFileSpecification spec, 
AttributesImpl attributes) throws TikaException, SAXException, IOException {
@@ -803,8 +821,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private void processAcroField(PDField field, final int 
currentRecursiveDepth)
             throws SAXException, IOException, TikaException {
 
-        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
-            return;
+        if (currentRecursiveDepth >= MAX_RECURSION_DEPTH) {
+            throw new IOException("Hit max recursion depth.");
         }
 
         PDFormFieldAdditionalActions pdFormFieldAdditionalActions = 
field.getActions();
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java 
b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 4ad2b12..c009f06 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -25,8 +25,10 @@ import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 import static org.junit.Assume.assumeTrue;
 
+import java.io.File;
 import java.io.InputStream;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -1649,4 +1651,13 @@ public class PDFParserTest extends TikaTest {
             return true;
         }
     }
+
+    @Test
+    public void testDeeplyEmbeddedAttachments() throws Exception {
+        //test file comes from pdfcpu issue #201: 
https://github.com/pdfcpu/pdfcpu/issues/201
+        //in our regression corpus: pdfcpu-201-0.zip-0.pdf
+        List<Metadata> metadataList = getRecursiveMetadata(
+                "testPDF_deeplyEmbeddedAttachments.pdf");
+        assertEquals(21, metadataList.size());
+    }
 }
diff --git 
a/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
 
b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
new file mode 100644
index 0000000..7df6d14
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
 differ

[tika] 01/02: TIKA-3332 -- recursively process the embedded file tree in PDFs.

Reply via email to