Repository: tika
Updated Branches:
  refs/heads/2.x b1c00c050 -> e05dd5bf4

TIKA-1990 -- need to add JPEG filters to embedded stream when handling embedded jpegs in PDFParser

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e05dd5bf
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e05dd5bf
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e05dd5bf

Branch: refs/heads/2.x
Commit: e05dd5bf4145c0e8bbfd585d05a8a4c26d83e2ce
Parents: b1c00c0
Author: tballison <[email protected]>
Authored: Tue May 31 10:09:19 2016 -0400
Committer: tballison <[email protected]>
Committed: Tue May 31 10:09:19 2016 -0400

----------------------------------------------------------------------
 .../tika/parser/AutoDetectParserTest.java     | 24 +++++++++++++++++++-
 .../org/apache/tika/parser/pdf/PDF2XHTML.java | 10 +++++---
 2 files changed, 30 insertions(+), 4 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/e05dd5bf/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index 91b054e..4f312a9 100644
--- a/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -27,10 +27,12 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipOutputStream;

+import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
@@ -38,6 +40,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.XMPDM;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 import org.gagravarr.tika.FlacParser;
 import org.gagravarr.tika.OpusParser;
@@ -45,7 +48,7 @@ import org.gagravarr.tika.VorbisParser;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;

-public class AutoDetectParserTest {
+public class AutoDetectParserTest extends TikaTest {
     private TikaConfig tika = TikaConfig.getDefaultConfig();

     // Easy to read constants for the MIME types:
@@ -380,6 +383,25 @@ public class AutoDetectParserTest {
         assertEquals("value", metadata.get("MyParser"));
     }

+    @Test
+    public void testEmbeddedJPEGInPDF() throws Exception {
+        //TIKA-1990, test that an embedded jpeg is correctly decoded
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+
+        List<Metadata> metadataList = getRecursiveJson("testPDF_childAttachments.pdf", context);
+        //sanity check
+        assertEquals(4, metadataList.size());
+
+        //inlined jpeg metadata
+        Metadata jpegMetadata = metadataList.get(1);
+        assertEquals("image/jpeg", jpegMetadata.get(Metadata.CONTENT_TYPE));
+        //the metadata parse will fail if the stream is not correctly decoded
+        assertEquals("1425", jpegMetadata.get(Metadata.IMAGE_LENGTH));
+    }
+
     private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser");

     /**

http://git-wip-us.apache.org/repos/asf/tika/blob/e05dd5bf/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 7f6eb6c..bc0bf96 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -27,6 +27,7 @@ import java.io.OutputStream;
 import java.io.Writer;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Calendar;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -98,6 +99,11 @@ class PDF2XHTML extends PDFTextStripper {
      * Prevents theoretical AcroForm recursion bomb.
      */
     private final static int MAX_ACROFORM_RECURSIONS = 10;
+
+    private static final List<String> JPEG = Arrays.asList(
+            COSName.DCT_DECODE.getName(),
+            COSName.DCT_DECODE_ABBREVIATION.getName());
+
     /**
      * Format used for signature dates
      * TODO Make this thread-safe
@@ -447,9 +453,7 @@ class PDF2XHTML extends PDFTextStripper {
                 if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
                         PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) {
                     // RGB or Gray colorspace: get and write the unmodifiedJPEG stream
-                    //TODO: shouldn't need to do this: should be able to call createInputStream directly?!
-                    //version clash somewhere?!
-                    InputStream data = pdImage.getStream().createInputStream();
+                    InputStream data = pdImage.getStream().createInputStream(JPEG);
                     org.apache.pdfbox.io.IOUtils.copy(data, out);
                     org.apache.pdfbox.io.IOUtils.closeQuietly(data);
                 } else {
