Author: jukka
Date: Wed Mar 9 15:29:39 2011
New Revision: 1079837
URL: http://svn.apache.org/viewvc?rev=1079837&view=rev
Log:
TIKA-609: IOException from jempbox
Catch (and ignore) parse errors with embedded XMP.
Use TikaInputStream instead of an unbounded memory buffer for reading a JPEG
stream twice.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=1079837&r1=1079836&r2=1079837&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java Wed Mar 9 15:29:39 2011
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.image;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.DecimalFormat;
@@ -81,11 +82,11 @@ public class ImageMetadataExtractor {
this.handlers = handlers;
}
- public void parseJpeg(InputStream stream)
+ public void parseJpeg(File file)
throws IOException, SAXException, TikaException {
try {
com.drew.metadata.Metadata jpegMetadata =
- JpegMetadataReader.readMetadata(stream);
+ JpegMetadataReader.readMetadata(file);
handle(jpegMetadata);
} catch (JpegProcessingException e) {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java?rev=1079837&r1=1079836&r2=1079837&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java Wed Mar 9 15:29:39 2011
@@ -54,31 +54,37 @@ public class JempboxExtractor {
if (!found) {
return;
}
-
- Reader decoded = new InputStreamReader(new ByteArrayInputStream(xmpraw.toByteArray()), DEFAULT_XMP_CHARSET);
- XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
-
- XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
- if (dc != null) {
- if (dc.getTitle() != null) {
- metadata.set(DublinCore.TITLE, dc.getTitle());
- }
- if (dc.getDescription() != null) {
- metadata.set(DublinCore.DESCRIPTION, dc.getDescription());
- }
- if (dc.getCreators() != null && dc.getCreators().size() > 0) {
- metadata.set(DublinCore.CREATOR, joinCreators(dc.getCreators()));
- }
- if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
- Iterator<String> keywords = dc.getSubjects().iterator();
- while (keywords.hasNext()) {
- metadata.add(DublinCore.SUBJECT, keywords.next());
+
+ Reader decoded = new InputStreamReader(
+ new ByteArrayInputStream(xmpraw.toByteArray()),
+ DEFAULT_XMP_CHARSET);
+ try {
+ XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
+ XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
+ if (dc != null) {
+ if (dc.getTitle() != null) {
+ metadata.set(DublinCore.TITLE, dc.getTitle());
+ }
+ if (dc.getDescription() != null) {
+ metadata.set(DublinCore.DESCRIPTION, dc.getDescription());
+ }
+ if (dc.getCreators() != null && dc.getCreators().size() > 0) {
+ metadata.set(DublinCore.CREATOR, joinCreators(dc.getCreators()));
+ }
+ if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
+ Iterator<String> keywords = dc.getSubjects().iterator();
+ while (keywords.hasNext()) {
+ metadata.add(DublinCore.SUBJECT, keywords.next());
+ }
+ // TODO should we set KEYWORDS too?
+ // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
}
- // TODO should we set KEYWORDS too?
- // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
}
+ } catch (IOException e) {
+ // Could not parse embedded XMP metadata. That's not a serious
+ // problem, so we'll just ignore the issue for now.
+ // TODO: Make error handling like this configurable.
}
-
}
protected String joinCreators(List<String> creators) {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=1079837&r1=1079836&r2=1079837&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java Wed Mar 9 15:29:39 2011
@@ -23,6 +23,8 @@ import java.util.Collections;
import java.util.Set;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryFiles;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -55,18 +57,14 @@ public class JpegParser implements Parse
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
-
- // read stream twice - exif and xmp extractors
- stream.mark(Integer.MAX_VALUE);
- FilterInputStream first = new FilterInputStream(stream) {
- @Override
- public void close() throws IOException {
- }
- };
- new ImageMetadataExtractor(metadata).parseJpeg(first);
- stream.reset();
-
- new JempboxExtractor(metadata).parse(stream);
+ TemporaryFiles tmp = new TemporaryFiles();
+ try {
+ TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ new ImageMetadataExtractor(metadata).parseJpeg(tis.getFile());
+ new JempboxExtractor(metadata).parse(tis);
+ } finally {
+ tmp.dispose();
+ }
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();