Author: jukka Date: Sun Mar 9 08:28:24 2008 New Revision: 635259 URL: http://svn.apache.org/viewvc?rev=635259&view=rev Log: TIKA-126: Add Parser.parse(InputStream, Metadata) for metadata extraction
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java Modified: incubator/tika/trunk/CHANGES.txt incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Modified: incubator/tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/CHANGES.txt (original) +++ incubator/tika/trunk/CHANGES.txt Sun Mar 9 08:28:24 2008 @@ -19,6 +19,9 @@ 8. TIKA-112 - Use Commons IO 1.4 (Jukka Zitting) +9. 
TIKA-126 - Add Parser.parse(InputStream, Metadata) for metadata extraction + (Jukka Zitting) + Release 0.1-incubating - 12/27/2007 Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java?rev=635259&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AbstractParser.java Sun Mar 9 08:28:24 2008 @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Abstract parser base class. Contains a default implementation of the + * {@link #parse(InputStream, Metadata)} method. 
+ */ +public abstract class AbstractParser implements Parser { + + /** + * Calls the full + * {@link Parser#parse(InputStream, org.xml.sax.ContentHandler, Metadata)} + * method and keeps only the extracted metatdata. + */ + public void parse(InputStream stream, Metadata metadata) + throws IOException, TikaException { + try { + parse(stream, new DefaultHandler(), metadata); + } catch (SAXException e) { + throw new TikaException("Unexpected SAX error", e); + } + } + +} Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Sun Mar 9 08:28:24 2008 @@ -30,7 +30,7 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -public class AutoDetectParser implements Parser { +public class AutoDetectParser extends AbstractParser { private TikaConfig config; Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/EmptyParser.java Sun Mar 9 08:28:24 2008 @@ -16,10 +16,8 @@ */ package org.apache.tika.parser; -import java.io.IOException; import java.io.InputStream; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.XHTMLContentHandler; import 
org.xml.sax.ContentHandler; @@ -32,9 +30,12 @@ */ public class EmptyParser implements Parser { + public void parse(InputStream stream, Metadata metadata) { + } + public void parse( InputStream stream, ContentHandler handler, Metadata metadata) - throws IOException, SAXException, TikaException { + throws SAXException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.endDocument(); Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ErrorParser.java Sun Mar 9 08:28:24 2008 @@ -30,6 +30,12 @@ public class ErrorParser implements Parser { public void parse( + InputStream stream, Metadata metadata) + throws TikaException { + throw new TikaException("Parse error"); + } + + public void parse( InputStream stream, ContentHandler handler, Metadata metadata) throws TikaException { throw new TikaException("Parse error"); Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Sun Mar 9 08:28:24 2008 @@ -30,6 +30,20 @@ public interface Parser { /** + * Parses document metadata from the given document stream. + * <p> + * The given document stream is consumed but not closed by this method. 
+ * The responsibility to close the stream remains on the caller. + * + * @param stream the document stream (input) + * @param metadata document metadata (input and output) + * @throws IOException if the document stream could not be read + * @throws TikaException if the document could not be parsed + */ + void parse(InputStream stream, Metadata metadata) + throws IOException, TikaException; + + /** * Parses a document stream into a sequence of XHTML SAX events. * Fills in related document metadata in the given metadata object. * <p> Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java Sun Mar 9 08:28:24 2008 @@ -30,7 +30,7 @@ * instance. Subclasses can provide extra decoration by overriding the * parse method. */ -public class ParserDecorator implements Parser { +public class ParserDecorator extends AbstractParser { /** * The decorated parser instance. 
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Mar 9 08:28:24 2008 @@ -18,12 +18,13 @@ import java.io.IOException; import java.io.InputStream; -import java.io.Reader; +import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.Parser; +import org.apache.tika.parser.AbstractParser; import org.apache.tika.sax.ContentHandlerDecorator; +import org.apache.tika.utils.Utils; import org.cyberneko.html.parsers.SAXParser; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; @@ -33,34 +34,16 @@ /** * Simple HTML parser that extracts title. 
*/ -public class HtmlParser implements Parser { +public class HtmlParser extends AbstractParser { - public void parse(InputStream stream, ContentHandler handler, - Metadata metadata) throws IOException, SAXException, TikaException { - - final SAXParser parser = new SAXParser(); - - final InputSource source; - - Reader utf8Reader; - - try { - utf8Reader = org.apache.tika.utils.Utils.getUTF8Reader( - stream, metadata); - } catch (TikaException ex) { - utf8Reader = null; - } - - if (utf8Reader == null) { - source = new InputSource(stream); - } else { - source = new InputSource(utf8Reader); - } - - - parser.setContentHandler(new TitleExtractingContentHandler(handler, - metadata)); - parser.parse(source); + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata) + throws IOException, SAXException, TikaException { + SAXParser parser = new SAXParser(); + parser.setContentHandler( + new TitleExtractingContentHandler(handler, metadata)); + parser.parse(new InputSource(Utils.getUTF8Reader( + new CloseShieldInputStream(stream), metadata))); } private static class TitleExtractingContentHandler extends Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar 9 08:28:24 2008 @@ -52,6 +52,31 @@ DocumentSummaryInformation.DEFAULT_STREAM_NAME; /** + * Extracts properties from an MS Document input stream + */ + public void parse(InputStream stream, Metadata metadata) + throws IOException, TikaException { + POIFSFileSystem filesystem = new POIFSFileSystem(stream); + Iterator<?> entries = 
filesystem.getRoot().getEntries(); + while (entries.hasNext()) { + Entry entry = (Entry) entries.next(); + String name = entry.getName(); + if (!(entry instanceof DocumentEntry)) { + // Skip directory entries + } else if (SUMMARY_INFORMATION.equals(name) + || DOCUMENT_SUMMARY_INFORMATION.equals(name)) { + parse((DocumentEntry) entry, metadata); + } else if ("WordDocument".equals(name)) { + setType(metadata, "application/msword"); + } else if ("PowerPoint Document".equals(name)) { + setType(metadata, "application/vnd.ms-powerpoint"); + } else if ("Workbook".equals(name)) { + setType(metadata, "application/vnd.ms-excel"); + } + } + } + + /** * Extracts properties and text from an MS Document input stream */ public void parse( Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Sun Mar 9 08:28:24 2008 @@ -28,7 +28,6 @@ import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; /** * OpenOffice parser @@ -55,6 +54,21 @@ this.content = content; } + public void parse(InputStream stream, Metadata metadata) + throws IOException, TikaException { + ZipInputStream zip = new ZipInputStream(stream); + ZipEntry entry = zip.getNextEntry(); + while (entry != null) { + if (entry.getName().equals("mimetype")) { + String type = IOUtils.toString(zip, "UTF-8"); + metadata.set(Metadata.CONTENT_TYPE, type); + } else if (entry.getName().equals("meta.xml")) { + meta.parse(zip, metadata); + } + entry = 
zip.getNextEntry(); + } + } + public void parse( InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { @@ -65,7 +79,7 @@ String type = IOUtils.toString(zip, "UTF-8"); metadata.set(Metadata.CONTENT_TYPE, type); } else if (entry.getName().equals("meta.xml")) { - meta.parse(zip, new DefaultHandler(), metadata); + meta.parse(zip, metadata); } else if (entry.getName().equals("content.xml")) { content.parse(zip, handler, metadata); } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Sun Mar 9 08:28:24 2008 @@ -34,6 +34,24 @@ */ public class PDFParser implements Parser { + public void parse(InputStream stream, Metadata metadata) + throws IOException, TikaException { + PDDocument pdfDocument = PDDocument.load(stream); + try { + if (pdfDocument.isEncrypted()) { + try { + pdfDocument.decrypt(""); + } catch (Exception e) { + // Ignore + } + } + metadata.add(Metadata.CONTENT_TYPE, "application/pdf"); + extractMetadata(pdfDocument, metadata); + } finally { + pdfDocument.close(); + } + } + public void parse( InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Sun Mar 9 08:28:24 2008 @@ -25,7 +25,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.Parser; +import org.apache.tika.parser.AbstractParser; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -33,7 +33,7 @@ /** * RTF parser */ -public class RTFParser implements Parser { +public class RTFParser extends AbstractParser { public void parse( InputStream stream, ContentHandler handler, Metadata metadata) Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sun Mar 9 08:28:24 2008 @@ -22,7 +22,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.Parser; +import org.apache.tika.parser.AbstractParser; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.Utils; import org.xml.sax.ContentHandler; @@ -31,12 +31,11 @@ /** * Text parser */ -public class TXTParser implements Parser { +public class TXTParser extends AbstractParser { public void parse( InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { - Reader reader = Utils.getUTF8Reader(stream, metadata); metadata.set(Metadata.CONTENT_TYPE, "text/plain"); Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=635259&r1=635258&r2=635259&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Sun Mar 9 08:28:24 2008 @@ -26,7 +26,7 @@ import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.Parser; +import org.apache.tika.parser.AbstractParser; import org.apache.tika.sax.TextContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; @@ -36,7 +36,7 @@ /** * XML parser */ -public class XMLParser implements Parser { +public class XMLParser extends AbstractParser { public void parse( InputStream stream, ContentHandler handler, Metadata metadata)