Author: jukka Date: Fri Apr 11 07:29:33 2008 New Revision: 647181 URL: http://svn.apache.org/viewvc?rev=647181&view=rev Log: TIKA-139: Add a composite parser
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java Modified: incubator/tika/trunk/CHANGES.txt incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Modified: incubator/tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=647181&r1=647180&r2=647181&view=diff ============================================================================== --- incubator/tika/trunk/CHANGES.txt (original) +++ incubator/tika/trunk/CHANGES.txt Fri Apr 11 07:29:33 2008 @@ -46,6 +46,9 @@ 19. TIKA-113 - Metadata (such as title) should not be part of content (Jukka Zitting) +20. TIKA-139 - Add a composite parser (Jukka Zitting) + + Release 0.1-incubating - 12/27/2007 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann) Modified: incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java?rev=647181&r1=647180&r2=647181&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java Fri Apr 11 07:29:33 2008 @@ -107,7 +107,11 @@ public Parser getParser(String mimeType) { return parsers.get(mimeType); } - + + public Map<String, Parser> getParsers() { + return parsers; + } + public MimeTypes getMimeRepository(){ return mimeTypes; } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=647181&r1=647180&r2=647181&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Fri Apr 11 07:29:33 2008 @@ -30,9 +30,9 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -public class AutoDetectParser extends AbstractParser { +public class AutoDetectParser extends CompositeParser { - private TikaConfig config; + private MimeTypes types; /** * Creates an auto-detecting parser instance using the default Tika @@ -40,7 +40,7 @@ */ public AutoDetectParser() { try { - config = TikaConfig.getDefaultConfig(); + setConfig(TikaConfig.getDefaultConfig()); } catch (TikaException e) { // FIXME: This should never happen throw new RuntimeException(e); @@ -48,15 +48,20 @@ } public AutoDetectParser(TikaConfig config) { - this.config = config; + setConfig(config); } - public TikaConfig getConfig() { - return config; + public void setConfig(TikaConfig config) { + setParsers(config.getParsers()); + setMimeTypes(config.getMimeRepository()); } - public void setConfig(TikaConfig config) { - this.config = config; + public MimeTypes getMimeTypes() { + return types; + } + + public void setMimeTypes(MimeTypes types) { + this.types = types; } public void parse( @@ -71,17 +76,8 @@ MimeType type = getMimeType(stream, metadata); metadata.set(Metadata.CONTENT_TYPE, type.getName()); - // Get the parser configured for the detected MIME type - Parser parser = config.getParser(type.getName()); - if (parser == null) { - parser = config.getParser(MimeTypes.DEFAULT); - } - if (parser == null) { - throw new TikaException("No parsers available: " + type.getName()); - } - // Parse the document - parser.parse(stream, handler, metadata); + super.parse(stream, handler, metadata); } /** @@ -99,8 +95,6 @@ */ private MimeType getMimeType(InputStream stream, Metadata metadata) throws IOException { - MimeTypes types = config.getMimeRepository(); - // Get type based on magic prefix stream.mark(types.getMinLength()); try { Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=647181&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java Fri Apr 11 07:29:33 2008 @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Composite parser that delegates parsing tasks to a component parser + * based on the declared content type of the incoming document. A fallback + * parser is defined for cases where a parser for the given content type is + * not available. + */ +public class CompositeParser implements Parser { + + /** + * Set of component parsers, keyed by the supported media types. + */ + private Map<String, Parser> parsers = new HashMap<String, Parser>(); + + /** + * The fallback parser, used when no better parser is available. + */ + private Parser fallback = new EmptyParser(); + + /** + * Returns the component parsers. + * + * @return component parsers, keyed by media type + */ + public Map<String, Parser> getParsers() { + return parsers; + } + + /** + * Sets the component parsers. + * + * @param parsers component parsers, keyed by media type + */ + public void setParsers(Map<String, Parser> parsers) { + this.parsers = parsers; + } + + /** + * Returns the fallback parser. + * + * @return fallback parser + */ + public Parser getFallback() { + return fallback; + } + + /** + * Sets the fallback parser. + * + * @param fallback fallback parser + */ + public void setFallback(Parser fallback) { + this.fallback = fallback; + } + + /** + * Returns the parser that best matches the given metadata. By default + * looks for a parser that matches the content type metadata property, + * and uses the fallback parser if a better match is not found. + * <p> + * Subclasses can override this method to provide more accurate + * parser resolution. + * + * @param metadata document metadata + * @return matching parser + */ + protected Parser getParser(Metadata metadata) { + Parser parser = parsers.get(metadata.get(Metadata.CONTENT_TYPE)); + if (parser == null) { + parser = fallback; + } + return parser; + } + + /** + * Delegates the call to the matching component parser. + */ + public void parse(InputStream stream, Metadata metadata) + throws IOException, TikaException { + getParser(metadata).parse(stream, metadata); + } + + /** + * Delegates the call to the matching component parser. + */ + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata) + throws IOException, SAXException, TikaException { + getParser(metadata).parse(stream, handler, metadata); + } + +}