This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8bf65c01e174e1ba872e813089b076f21ddb4410 Author: tballison <[email protected]> AuthorDate: Mon Mar 22 14:17:35 2021 -0400 TIKA-3332 -- recursively process the embedded file tree in PDFs. --- CHANGES.txt | 3 + .../src/test/java/org/apache/tika/TikaTest.java | 15 +++-- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 70 +++++++++++++-------- .../org/apache/tika/parser/pdf/PDFParserTest.java | 11 ++++ .../testPDF_deeplyEmbeddedAttachments.pdf | Bin 0 -> 122221 bytes 5 files changed, 67 insertions(+), 32 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 2b9089c..bcc7c5d 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,8 @@ Release 1.26 - 03/09/2021 + * Extract more embedded files in PDFs by recursively processing the + embedded file tree (TIKA-3332). + * Allow for case insensitive headers for configuration of the PDFParser and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320). diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index e21f752..2d0083c 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -247,19 +247,22 @@ public abstract class TikaTest { } protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context, boolean suppressException) throws Exception { - try (TikaInputStream tis = TikaInputStream.get(path)) { - return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, new Metadata(), suppressException); + Metadata metadata = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { + return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, metadata, suppressException); } } protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception { - try (TikaInputStream tis = TikaInputStream.get(path)) { - return getRecursiveMetadata(tis, parser, new ParseContext(), new Metadata(), 
suppressException); + Metadata metadata = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { + return getRecursiveMetadata(tis, parser, new ParseContext(), metadata, suppressException); } } protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException) throws Exception { - try (TikaInputStream tis = TikaInputStream.get(p)) { - return getRecursiveMetadata(tis, new ParseContext(), new Metadata(), suppressException); + Metadata metadata = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(p, metadata)) { + return getRecursiveMetadata(tis, new ParseContext(), metadata, suppressException); } } protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 43526ef..f930c61 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -34,6 +34,7 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.ListIterator; @@ -136,10 +137,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { }; /** - * Maximum recursive depth during AcroForm processing. - * Prevents theoretical AcroForm recursion bomb. + * Maximum recursive depth to prevent cycles/recursion bombs. + * This applies to AcroForm processing and processing + * the embedded document tree. 
*/ - private final static int MAX_ACROFORM_RECURSIONS = 10; + private final static int MAX_RECURSION_DEPTH = 100; private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig(); @@ -287,32 +289,48 @@ class AbstractPDF2XHTML extends PDFTextStripper { private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException { - PDDocumentNameDictionary namesDictionary = - new PDDocumentNameDictionary(document.getDocumentCatalog()); - PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); - if (efTree == null) { - return; + PDDocumentNameDictionary namesDictionary = + new PDDocumentNameDictionary(document.getDocumentCatalog()); + PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); + + if (efTree == null) { + return; + } + + //Set<COSObjectKey> seen = new HashSet<>(); + + Map<String, PDComplexFileSpecification> embeddedFileNames = new HashMap<>(); + int depth = 0; + //recursively find embedded files + extractFilesfromEFTree(efTree, embeddedFileNames, depth); + processEmbeddedDocNames(embeddedFileNames); + } + + private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String, + PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException { + if (depth > MAX_RECURSION_DEPTH) { + throw new IOException("Hit max recursion depth"); + } + Map<String, PDComplexFileSpecification> names = null; + try { + names = efTree.getNames(); + } catch (IOException e) { + //LOG? + } + if (names != null) { + for (Map.Entry<String, PDComplexFileSpecification> e : names.entrySet()) { + embeddedFileNames.put(e.getKey(), e.getValue()); } + } - Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames(); - //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. 
- //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java - //If there is a need we could add a fully recursive search to find a non-null - //Map<String, COSObjectable> that contains the doc info. - if (embeddedFileNames != null) { - processEmbeddedDocNames(embeddedFileNames); + List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); + if (kids == null) { + return; } else { - List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); - if (kids == null) { - return; - } for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { - embeddedFileNames = node.getNames(); - if (embeddedFileNames != null) { - processEmbeddedDocNames(embeddedFileNames); - } + extractFilesfromEFTree(node, embeddedFileNames, depth+1); } - } + } } private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException { @@ -803,8 +821,8 @@ class AbstractPDF2XHTML extends PDFTextStripper { private void processAcroField(PDField field, final int currentRecursiveDepth) throws SAXException, IOException, TikaException { - if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) { - return; + if (currentRecursiveDepth >= MAX_RECURSION_DEPTH) { + throw new IOException("Hit max recursion depth."); } PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions(); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 4ad2b12..c009f06 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -25,8 +25,10 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static org.junit.Assume.assumeTrue; +import java.io.File; import java.io.InputStream; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Arrays; 
import java.util.HashMap; import java.util.HashSet; @@ -1649,4 +1651,13 @@ public class PDFParserTest extends TikaTest { return true; } } + + @Test + public void testDeeplyEmbeddedAttachments() throws Exception { + //test file comes from pdfcpu issue #201: https://github.com/pdfcpu/pdfcpu/issues/201 + //in our regression corpus: pdfcpu-201-0.zip-0.pdf + List<Metadata> metadataList = getRecursiveMetadata( + "testPDF_deeplyEmbeddedAttachments.pdf"); + assertEquals(21, metadataList.size()); + } } diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf new file mode 100644 index 0000000..7df6d14 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf differ
