Repository: tika Updated Branches: refs/heads/master 16290d86c -> 6ad18f44c
TIKA-1990 -- make sure to include JPEG filters when exporting jpegs embedded in PDFs Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6ad18f44 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6ad18f44 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6ad18f44 Branch: refs/heads/master Commit: 6ad18f44cdfef5e80b343d9c06787b850fcfc9fd Parents: 16290d8 Author: tballison <[email protected]> Authored: Tue May 31 09:57:50 2016 -0400 Committer: tballison <[email protected]> Committed: Tue May 31 09:57:50 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 10 +++++++--- .../apache/tika/parser/pdf/PDFParserTest.java | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/6ad18f44/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index fed5137..fec6a79 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -27,6 +27,7 @@ import java.io.OutputStream; import java.io.Writer; import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Arrays; import java.util.Calendar; import java.util.HashMap; import java.util.HashSet; @@ -98,6 +99,10 @@ class PDF2XHTML extends PDFTextStripper { * Prevents theoretical AcroForm recursion bomb. */ private final static int MAX_ACROFORM_RECURSIONS = 10; + + private static final List<String> JPEG = Arrays.asList( + COSName.DCT_DECODE.getName(), + COSName.DCT_DECODE_ABBREVIATION.getName()); /** * Format used for signature dates * TODO Make this thread-safe @@ -109,6 +114,7 @@ class PDF2XHTML extends PDFTextStripper { private final PDFParserConfig config; private final Metadata metadata; private final List<IOException> exceptions = new ArrayList<>(); + /** * This keeps track of the pdf object ids for inline * images that have been processed. @@ -447,9 +453,7 @@ class PDF2XHTML extends PDFTextStripper { if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) || PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) { // RGB or Gray colorspace: get and write the unmodifiedJPEG stream - //TODO: shouldn't need to do this: should be able to call createInputStream directly?! - //version clash somewhere?! - InputStream data = pdImage.getStream().createInputStream(); + InputStream data = pdImage.getStream().createInputStream(JPEG); org.apache.pdfbox.io.IOUtils.copy(data, out); org.apache.pdfbox.io.IOUtils.closeQuietly(data); } else { http://git-wip-us.apache.org/repos/asf/tika/blob/6ad18f44/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index a260b1d..7a66a7f 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1157,6 +1157,25 @@ public class PDFParserTest extends TikaTest { assertEquals(0, m.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length); assertNotContained("1309.61", content); } + @Test + public void testEmbeddedJPEG() throws Exception { + //TIKA-1990, test that an embedded jpeg is correctly decoded + PDFParserConfig config = new PDFParserConfig(); + config.setExtractInlineImages(true); + ParseContext context = new ParseContext(); + context.set(PDFParserConfig.class, config); + + List<Metadata> metadataList = getRecursiveJson("testPDF_childAttachments.pdf", context); + //sanity check + assertEquals(4, metadataList.size()); + + //inlined jpeg metadata + Metadata jpegMetadata = metadataList.get(1); + assertEquals("image/jpeg", jpegMetadata.get(Metadata.CONTENT_TYPE)); + //the metadata parse will fail if the stream is not correctly decoded + assertEquals("1425", jpegMetadata.get(Metadata.IMAGE_LENGTH)); + } + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path);
