Author: jerome Date: Fri Aug 26 15:47:04 2005 New Revision: 240359 URL: http://svn.apache.org/viewcvs?rev=240359&view=rev Log: Add an analysis extension point
Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java (with props) lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=240359&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java Fri Aug 26 15:47:04 2005 @@ -0,0 +1,107 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.analysis; + +// JDK imports +import java.util.HashMap; +import java.util.Map; +import java.util.logging.Logger; + +// Nutch imports +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.ExtensionPoint; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.plugin.PluginRuntimeException; +import org.apache.nutch.util.LogFormatter; + + +/** + * Creates and caches {@link NutchAnalyzer} plugins. + * + * @author Jérôme Charron + */ +public class AnalyzerFactory { + + public final static Logger LOG = + LogFormatter.getLogger(AnalyzerFactory.class.getName()); + + private final static ExtensionPoint X_POINT = + PluginRepository.getInstance() + .getExtensionPoint(NutchAnalyzer.X_POINT_ID); + + private final static Map CACHE = new HashMap(); + + private final static NutchAnalyzer DEFAULT_ANALYSER = + new NutchDocumentAnalyzer(); + + + static { + if (X_POINT == null) { + throw new RuntimeException("x point " + NutchAnalyzer.X_POINT_ID + + " not found."); + } + } + + + private AnalyzerFactory() {} + + + /** + * Returns the appropriate {@link Analyser} implementation given a language + * code. + * + * <p>NutchAnalyser extensions should define the attribute "lang". The first + * plugin found whose "lang" attribute equals the specified lang parameter is + * used. If none match, then the {@link NutchDocumentAnalyzer} is used. 
+ */ + public static NutchAnalyzer get(String lang) { + + NutchAnalyzer analyzer = DEFAULT_ANALYSER; + Extension extension = getExtension(lang); + if (extension != null) { + try { + analyzer = (NutchAnalyzer) extension.getExtensionInstance(); + } catch (PluginRuntimeException pre) { + analyzer = DEFAULT_ANALYSER; + } + } + return analyzer; + } + + private static Extension getExtension(String lang) { + + Extension extension = (Extension) CACHE.get(lang); + if (extension == null) { + extension = findExtension(lang); + CACHE.put(lang, extension); + } + return extension; + } + + private static Extension findExtension(String lang) { + + if (lang != null) { + Extension[] extensions = X_POINT.getExtentens(); + for (int i=0; i<extensions.length; i++) { + if (lang.equals(extensions[i].getAttribute("lang"))) { + return extensions[i]; + } + } + } + return null; + } + +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=240359&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java Fri Aug 26 15:47:04 2005 @@ -0,0 +1,45 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.analysis; + +// JDK imports +import java.io.Reader; + +// Lucene imports +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; + + +/** + * Extension point for analysis. + * All plugins found which implement this extension point are run + * sequentially on the parse. + * + * @author Jérôme Charron + */ +public abstract class NutchAnalyzer extends Analyzer { + + /** The name of the extension point. */ + final static String X_POINT_ID = NutchAnalyzer.class.getName(); + + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. + */ + public abstract TokenStream tokenStream(String fieldName, Reader reader); + + +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=240359&r1=240358&r2=240359&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Fri Aug 26 15:47:04 2005 @@ -13,20 +13,25 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.nutch.analysis; +// JDK imports +import java.io.Reader; +import java.io.IOException; + +// Lucene imports import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; -import java.io.Reader; -import java.io.IOException; -/** The analyzer used for Nutch documents. Uses the JavaCC-defined lexical - * analyzer {@link NutchDocumentTokenizer}, with no stop list. This keeps it - * consistent with query parsing. */ -public class NutchDocumentAnalyzer extends Analyzer { + +/** + * The analyzer used for Nutch documents. + * Uses the JavaCC-defined lexical analyzer {@link NutchDocumentTokenizer}, + * with no stop list. This keeps it consistent with query parsing. + */ +public class NutchDocumentAnalyzer extends NutchAnalyzer { /** Analyzer used to index textual content. */ private static class ContentAnalyzer extends Analyzer { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=240359&r1=240358&r2=240359&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Fri Aug 26 15:47:04 2005 @@ -16,25 +16,20 @@ package org.apache.nutch.indexer; -import org.apache.nutch.pagedb.*; -import org.apache.nutch.linkdb.*; import org.apache.nutch.fetcher.*; import org.apache.nutch.parse.*; import org.apache.nutch.analysis.NutchDocumentAnalyzer; -import org.apache.nutch.db.*; -import org.apache.nutch.io.*; import org.apache.nutch.fs.*; import org.apache.nutch.segment.SegmentReader; import org.apache.nutch.util.*; - -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import java.util.logging.*; -import java.util.*; import java.io.*; +import org.apache.nutch.analysis.AnalyzerFactory; +import org.apache.nutch.analysis.NutchAnalyzer; /** Creates an index for the output corresponding to a single fetcher run. */ public class IndexSegment { @@ -149,7 +144,11 @@ doc = IndexingFilters.filter(doc, parse, fetcherOutput); // add the document to the index - writer.addDocument(doc); + NutchAnalyzer analyzer = AnalyzerFactory.get(doc.get("lang")); + LOG.info(" Indexing [" + doc.getField("url").stringValue() + + "] with analyzer " + analyzer + " (" + doc.getField("lang").stringValue() + ")"); + //LOG.info(" Doc is " + doc); + writer.addDocument(doc, analyzer); if (count > 0 && count % LOG_STEP == 0) { curTime = System.currentTimeMillis(); LOG.info(" Processed " + count + " records (" + Modified: lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml?rev=240359&r1=240358&r2=240359&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Fri Aug 26 15:47:04 2005 @@ -40,5 +40,8 @@ id="org.apache.nutch.net.URLFilter" name="Nutch URL Filter"/> +<extension-point + id="org.apache.nutch.analysis.NutchAnalyzer" + name="Nutch Analysis"/> </plugin>