Author: jukka
Date: Wed Mar  9 15:29:39 2011
New Revision: 1079837

URL: http://svn.apache.org/viewvc?rev=1079837&view=rev
Log:
TIKA-609: IOException from jempbox

Catch (and ignore) parse errors with embedded XMP.

Use TikaInputStream instead of an unbounded memory buffer for reading a JPEG 
stream twice.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=1079837&r1=1079836&r2=1079837&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java Wed Mar  9 15:29:39 2011
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.image;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.DecimalFormat;
@@ -81,11 +82,11 @@ public class ImageMetadataExtractor {
         this.handlers = handlers;
     }
 
-    public void parseJpeg(InputStream stream)
+    public void parseJpeg(File file)
             throws IOException, SAXException, TikaException {
         try {
             com.drew.metadata.Metadata jpegMetadata =
-                JpegMetadataReader.readMetadata(stream);
+                JpegMetadataReader.readMetadata(file);
 
             handle(jpegMetadata);
         } catch (JpegProcessingException e) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java?rev=1079837&r1=1079836&r2=1079837&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java Wed Mar  9 15:29:39 2011
@@ -54,31 +54,37 @@ public class JempboxExtractor {
         if (!found) {
             return;
         }
-        
-        Reader decoded = new InputStreamReader(new ByteArrayInputStream(xmpraw.toByteArray()), DEFAULT_XMP_CHARSET);
-        XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
-        
-        XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
-        if (dc != null) {
-            if (dc.getTitle() != null) {
-                metadata.set(DublinCore.TITLE, dc.getTitle());
-            }
-            if (dc.getDescription() != null) {
-                metadata.set(DublinCore.DESCRIPTION, dc.getDescription());
-            }
-            if (dc.getCreators() != null && dc.getCreators().size() > 0) {
-                metadata.set(DublinCore.CREATOR, joinCreators(dc.getCreators()));
-            }
-            if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
-                Iterator<String> keywords = dc.getSubjects().iterator();
-                while (keywords.hasNext()) {
-                    metadata.add(DublinCore.SUBJECT, keywords.next());
+
+        Reader decoded = new InputStreamReader(
+                new ByteArrayInputStream(xmpraw.toByteArray()),
+                DEFAULT_XMP_CHARSET);
+        try {
+            XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
+            XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
+            if (dc != null) {
+                if (dc.getTitle() != null) {
+                    metadata.set(DublinCore.TITLE, dc.getTitle());
+                }
+                if (dc.getDescription() != null) {
+                    metadata.set(DublinCore.DESCRIPTION, dc.getDescription());
+                }
+                if (dc.getCreators() != null && dc.getCreators().size() > 0) {
+                    metadata.set(DublinCore.CREATOR, joinCreators(dc.getCreators()));
+                }
+                if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
+                    Iterator<String> keywords = dc.getSubjects().iterator();
+                    while (keywords.hasNext()) {
+                        metadata.add(DublinCore.SUBJECT, keywords.next());
+                    }
+                    // TODO should we set KEYWORDS too?
+                    // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
                 }
-                // TODO should we set KEYWORDS too?
-                // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
             }
+        } catch (IOException e) {
+            // Could not parse embedded XMP metadata. That's not a serious
+            // problem, so we'll just ignore the issue for now.
+            // TODO: Make error handling like this configurable.
         }
-        
     }
 
     protected String joinCreators(List<String> creators) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=1079837&r1=1079836&r2=1079837&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java Wed Mar  9 15:29:39 2011
@@ -23,6 +23,8 @@ import java.util.Collections;
 import java.util.Set;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryFiles;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
@@ -55,18 +57,14 @@ public class JpegParser implements Parse
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        
-        // read stream twice - exif and xmp extractors
-        stream.mark(Integer.MAX_VALUE);
-        FilterInputStream first = new FilterInputStream(stream) {
-            @Override
-            public void close() throws IOException {
-            }
-        };
-        new ImageMetadataExtractor(metadata).parseJpeg(first);
-        stream.reset();
-        
-        new JempboxExtractor(metadata).parse(stream);
+        TemporaryFiles tmp = new TemporaryFiles();
+        try {
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+            new ImageMetadataExtractor(metadata).parseJpeg(tis.getFile());
+            new JempboxExtractor(metadata).parse(tis);
+        } finally {
+            tmp.dispose();
+        }
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();


Reply via email to