Author: jukka
Date: Fri Apr 11 07:29:33 2008
New Revision: 647181

URL: http://svn.apache.org/viewvc?rev=647181&view=rev
Log:
TIKA-139: Add a composite parser

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=647181&r1=647180&r2=647181&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Apr 11 07:29:33 2008
@@ -46,6 +46,9 @@
 19. TIKA-113 - Metadata (such as title) should not be part of content
                (Jukka Zitting)
 
+20. TIKA-139 - Add a composite parser (Jukka Zitting)
+
+
 Release 0.1-incubating - 12/27/2007
 
 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java?rev=647181&r1=647180&r2=647181&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java Fri Apr 11 07:29:33 2008
@@ -107,7 +107,11 @@
     public Parser getParser(String mimeType) {
         return parsers.get(mimeType);
     }
-    
+
+    public Map<String, Parser> getParsers() {
+        return parsers;
+    }
+
     public MimeTypes getMimeRepository(){
         return mimeTypes;
     }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=647181&r1=647180&r2=647181&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Fri Apr 11 07:29:33 2008
@@ -30,9 +30,9 @@
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-public class AutoDetectParser extends AbstractParser {
+public class AutoDetectParser extends CompositeParser {
 
-    private TikaConfig config;
+    private MimeTypes types;
 
     /**
      * Creates an auto-detecting parser instance using the default Tika
@@ -40,7 +40,7 @@
      */
     public AutoDetectParser() {
         try {
-            config = TikaConfig.getDefaultConfig();
+            setConfig(TikaConfig.getDefaultConfig());
         } catch (TikaException e) {
             // FIXME: This should never happen
             throw new RuntimeException(e);
@@ -48,15 +48,20 @@
     }
 
     public AutoDetectParser(TikaConfig config) {
-        this.config = config;
+        setConfig(config);
     }
 
-    public TikaConfig getConfig() {
-        return config;
+    public void setConfig(TikaConfig config) {
+        setParsers(config.getParsers());
+        setMimeTypes(config.getMimeRepository());
     }
 
-    public void setConfig(TikaConfig config) {
-        this.config = config;
+    public MimeTypes getMimeTypes() {
+        return types;
+    }
+
+    public void setMimeTypes(MimeTypes types) {
+        this.types = types;
     }
 
     public void parse(
@@ -71,17 +76,8 @@
         MimeType type = getMimeType(stream, metadata);
         metadata.set(Metadata.CONTENT_TYPE, type.getName());
 
-        // Get the parser configured for the detected MIME type
-        Parser parser = config.getParser(type.getName());
-        if (parser == null) {
-            parser = config.getParser(MimeTypes.DEFAULT);
-        }
-        if (parser == null) {
-            throw new TikaException("No parsers available: " + type.getName());
-        }
-
         // Parse the document
-        parser.parse(stream, handler, metadata);
+        super.parse(stream, handler, metadata);
     }
 
     /**
@@ -99,8 +95,6 @@
      */
     private MimeType getMimeType(InputStream stream, Metadata metadata)
             throws IOException {
-        MimeTypes types = config.getMimeRepository();
-
         // Get type based on magic prefix
         stream.mark(types.getMinLength());
         try {

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=647181&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java Fri Apr 11 07:29:33 2008
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Composite parser that delegates parsing tasks to a component parser
+ * based on the declared content type of the incoming document. A fallback
+ * parser is defined for cases where a parser for the given content type is
+ * not available.
+ */
+public class CompositeParser implements Parser {
+
+    /**
+     * Set of component parsers, keyed by the supported media types.
+     */
+    private Map<String, Parser> parsers = new HashMap<String, Parser>();
+
+    /**
+     * The fallback parser, used when no better parser is available.
+     */
+    private Parser fallback = new EmptyParser();
+
+    /**
+     * Returns the component parsers.
+     *
+     * @return component parsers, keyed by media type
+     */
+    public Map<String, Parser> getParsers() {
+        return parsers;
+    }
+
+    /**
+     * Sets the component parsers.
+     *
+     * @param parsers component parsers, keyed by media type
+     */
+    public void setParsers(Map<String, Parser> parsers) {
+        this.parsers = parsers;
+    }
+
+    /**
+     * Returns the fallback parser.
+     *
+     * @return fallback parser
+     */
+    public Parser getFallback() {
+        return fallback;
+    }
+
+    /**
+     * Sets the fallback parser.
+     *
+     * @param fallback fallback parser
+     */
+    public void setFallback(Parser fallback) {
+        this.fallback = fallback;
+    }
+
+    /**
+     * Returns the parser that best matches the given metadata. By default
+     * looks for a parser that matches the content type metadata property,
+     * and uses the fallback parser if a better match is not found.
+     * <p>
+     * Subclasses can override this method to provide more accurate
+     * parser resolution.
+     *
+     * @param metadata document metadata
+     * @return matching parser
+     */
+    protected Parser getParser(Metadata metadata) {
+        Parser parser = parsers.get(metadata.get(Metadata.CONTENT_TYPE));
+        if (parser == null) {
+            parser = fallback;
+        }
+        return parser;
+    }
+
+    /**
+     * Delegates the call to the matching component parser.
+     */
+    public void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        getParser(metadata).parse(stream, metadata);
+    }
+
+    /**
+     * Delegates the call to the matching component parser.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        getParser(metadata).parse(stream, handler, metadata);
+    }
+
+}


Reply via email to