Author: sergeyb
Date: Fri Jul 11 10:27:15 2014
New Revision: 1609677
URL: http://svn.apache.org/r1609677
Log:
[TIKA-1351] Updating AutoDetect, Composite and PDF parsers to guard against
null content handlers
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=1609677&r1=1609676&r2=1609677&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java Fri Jul 11 10:27:15 2014
@@ -114,7 +114,8 @@ public class AutoDetectParser extends Co
metadata.set(Metadata.CONTENT_TYPE, type.toString());
// TIKA-216: Zip bomb prevention
- SecureContentHandler sch = new SecureContentHandler(handler, tis);
+ SecureContentHandler sch =
+ handler != null ? new SecureContentHandler(handler, tis) : null;
try {
// Parse the document
super.parse(tis, sch, metadata, context);
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1609677&r1=1609676&r2=1609677&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Fri Jul 11 10:27:15 2014
@@ -237,7 +237,8 @@ public class CompositeParser extends Abs
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
- TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
+ TaggedContentHandler taggedHandler =
+ handler != null ? new TaggedContentHandler(handler) : null;
try {
parser.parse(taggedStream, taggedHandler, metadata, context);
} catch (RuntimeException e) {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1609677&r1=1609676&r2=1609677&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Fri Jul 11 10:27:15 2014
@@ -154,7 +154,9 @@ public class PDFParser extends AbstractP
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
- PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ if (handler != null) {
+ PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ }
} finally {
if (pdfDocument != null) {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1609677&r1=1609676&r2=1609677&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Jul 11 10:27:15 2014
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser.pdf;
-import org.junit.Ignore;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
@@ -25,7 +24,6 @@ import static org.junit.Assert.assertTru
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -90,6 +88,26 @@ public class PDFParserTest extends TikaT
assertTrue("should have word boundary between paragraphs",
!content.contains("libraries.Apache"));
}
+
+ @Test
+ public void testPdfParsingMetadataOnly() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ Metadata metadata = new Metadata();
+
+ InputStream stream = PDFParserTest.class.getResourceAsStream(
+ "/test-documents/testPDF.pdf");
+
+ try {
+ parser.parse(stream, null, metadata, new ParseContext());
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL));
+ assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+ }
@Test
public void testCustomMetadata() throws Exception {