Author: jukka
Date: Thu Sep 22 05:50:47 2011
New Revision: 1173951

URL: http://svn.apache.org/viewvc?rev=1173951&view=rev
Log:
TIKA-709: Tika network server does not print anything in response to, for example, Word documents

Ensure that the parsers won't close the document stream, as specified in the Parser interface contract Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java?rev=1173951&r1=1173950&r2=1173951&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java Thu Sep 22 05:50:47 2011 @@ -26,6 +26,7 @@ import java.util.List; import java.util.Set; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; @@ -65,7 +66,7 @@ public class FeedParser extends Abstract // set the encoding? 
try { SyndFeed feed = new SyndFeedInput().build( - new InputSource(stream)); + new InputSource(new CloseShieldInputStream(stream))); String title = stripTags(feed.getTitleEx()); String description = stripTags(feed.getDescriptionEx()); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java?rev=1173951&r1=1173950&r2=1173951&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java Thu Sep 22 05:50:47 2011 @@ -44,14 +44,10 @@ public class JempboxExtractor { public JempboxExtractor(Metadata metadata) { this.metadata = metadata; } - - public void parse(InputStream file) - throws IOException, TikaException { - + + public void parse(InputStream file) throws IOException, TikaException { ByteArrayOutputStream xmpraw = new ByteArrayOutputStream(); - boolean found = scanner.parse(file, xmpraw); - file.close(); - if (!found) { + if (!scanner.parse(file, xmpraw)) { return; } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1173951&r1=1173950&r2=1173951&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Thu Sep 22 05:50:47 2011 @@ -35,6 +35,7 @@ import org.apache.poi.poifs.filesystem.N import org.apache.poi.poifs.filesystem.POIFSFileSystem; import 
org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -166,13 +167,15 @@ public class OfficeParser extends Abstra NPOIFSFileSystem filesystem; TikaInputStream tstream = TikaInputStream.cast(stream); if (tstream == null) { - filesystem = new NPOIFSFileSystem(stream); + filesystem = + new NPOIFSFileSystem(new CloseShieldInputStream(stream)); } else if (tstream.getOpenContainer() instanceof NPOIFSFileSystem) { filesystem = (NPOIFSFileSystem) tstream.getOpenContainer(); } else if (tstream.hasFile()) { filesystem = new NPOIFSFileSystem(tstream.getFileChannel()); } else { - filesystem = new NPOIFSFileSystem(tstream); + filesystem = + new NPOIFSFileSystem(new CloseShieldInputStream(tstream)); } // Parse summary entries first, to make metadata available early Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1173951&r1=1173950&r2=1173951&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Thu Sep 22 05:50:47 2011 @@ -32,6 +32,7 @@ import org.apache.poi.xssf.extractor.XSS import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import 
org.apache.tika.parser.ParseContext; @@ -57,18 +58,19 @@ public class OOXMLExtractorFactory { OOXMLExtractor extractor; POIXMLTextExtractor poiExtractor; - if(stream instanceof TikaInputStream && - ((TikaInputStream)stream).getOpenContainer() != null) { - poiExtractor = ExtractorFactory.createExtractor( - (OPCPackage)((TikaInputStream)stream).getOpenContainer() - ); - } else if (stream instanceof TikaInputStream && - ((TikaInputStream) stream).hasFile()) { - poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(((TikaInputStream) stream).getFile()); + TikaInputStream tis = TikaInputStream.cast(stream); + if (tis != null && tis.getOpenContainer() instanceof OPCPackage) { + poiExtractor = ExtractorFactory.createExtractor( + (OPCPackage) tis.getOpenContainer()); + } else if (tis != null && tis.hasFile()) { + poiExtractor = (POIXMLTextExtractor) + ExtractorFactory.createExtractor(tis.getFile()); } else { - poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream); + InputStream shield = new CloseShieldInputStream(stream); + poiExtractor = (POIXMLTextExtractor) + ExtractorFactory.createExtractor(shield); } - + POIXMLDocument document = poiExtractor.getDocument(); if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { extractor = new XSSFExcelExtractorDecorator( Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1173951&r1=1173950&r2=1173951&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Thu Sep 22 05:50:47 2011 @@ -31,6 +31,7 @@ import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.pdmodel.PDDocument; import 
org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.Property; @@ -71,7 +72,8 @@ public class PDFParser extends AbstractP InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - PDDocument pdfDocument = PDDocument.load(stream, true); + PDDocument pdfDocument = + PDDocument.load(new CloseShieldInputStream(stream), true); try { if (pdfDocument.isEncrypted()) { try {