This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 29ef4b5919375d0915c260b6d632ca706ed7e46d Author: tallison <[email protected]> AuthorDate: Mon Mar 22 14:59:17 2021 -0400 TIKA-3332 -- recursively search embedded file tree for attachments --- CHANGES.txt | 3 ++ .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 54 +++++++++++++-------- .../org/apache/tika/parser/pdf/PDFParserTest.java | 9 ++++ .../testPDF_deeplyEmbeddedAttachments.pdf | Bin 0 -> 122221 bytes 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 8b83391..e1038e2 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -50,6 +50,9 @@ Release 1.26 - ??/??/???? endpoint in tika-server (TIKA-3325); it no longer functions only per container or embedded document. + * Extract more embedded files in PDFs by recursively processing the + embedded file tree (TIKA-3332). + * Allow for case insensitive headers for configuration of the PDFParser and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320). diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index c97aed1..bb874b9 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -36,6 +36,7 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.ListIterator; @@ -115,10 +116,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { public static final String XMP_PAGE_LOCATION_PREFIX = 
"page "; /** - * Maximum recursive depth during AcroForm processing. - * Prevents theoretical AcroForm recursion bomb. + * Maximum recursive depth to prevent cycles/recursion bombs. + * This applies to AcroForm processing and processing + * the embedded document tree. */ - private final static int MAX_ACROFORM_RECURSIONS = 10; + private final static int MAX_RECURSION_DEPTH = 100; private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml"); private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml"); final List<IOException> exceptions = new ArrayList<>(); @@ -297,23 +299,37 @@ class AbstractPDF2XHTML extends PDFTextStripper { return; } - Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames(); - //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. - //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java - //If there is a need we could add a fully recursive search to find a non-null - //Map<String, COSObjectable> that contains the doc info. - if (embeddedFileNames != null) { - processEmbeddedDocNames(embeddedFileNames); - } else { - List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); - if (kids == null) { - return; + Map<String, PDComplexFileSpecification> embeddedFileNames = new HashMap<>(); + int depth = 0; + //recursively find embedded files + extractFilesfromEFTree(efTree, embeddedFileNames, depth); + processEmbeddedDocNames(embeddedFileNames); + + } + + private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String, + PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException { + if (depth > MAX_RECURSION_DEPTH) { + throw new IOException("Hit max recursion depth"); + } + Map<String, PDComplexFileSpecification> names = null; + try { + names = efTree.getNames(); + } catch (IOException e) { + //LOG? 
+ } + if (names != null) { + for (Map.Entry<String, PDComplexFileSpecification> e : names.entrySet()) { + embeddedFileNames.put(e.getKey(), e.getValue()); } + } + + List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); + if (kids == null) { + return; + } else { for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { - embeddedFileNames = node.getNames(); - if (embeddedFileNames != null) { - processEmbeddedDocNames(embeddedFileNames); - } + extractFilesfromEFTree(node, embeddedFileNames, depth+1); } } } @@ -843,7 +859,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { private void processAcroField(PDField field, final int currentRecursiveDepth) throws SAXException, IOException, TikaException { - if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) { + if (currentRecursiveDepth >= MAX_RECURSION_DEPTH) { return; } diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 251a4b7..b0990a3 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1334,4 +1334,13 @@ public class PDFParserTest extends TikaTest { return true; } } + + @Test + public void testDeeplyEmbeddedAttachments() throws Exception { + //test file comes from pdfcpu issue #201: https://github.com/pdfcpu/pdfcpu/issues/201 + //in our regression corpus: pdfcpu-201-0.zip-0.pdf + List<Metadata> metadataList = getRecursiveMetadata( + "testPDF_deeplyEmbeddedAttachments.pdf"); + assertEquals(21, metadataList.size()); + } } diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf new file mode 100644 index 0000000..7df6d14 Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf differ
