Author: jerome
Date: Fri Aug 26 15:47:04 2005
New Revision: 240359

URL: http://svn.apache.org/viewcvs?rev=240359&view=rev
Log:
Add an analysis extension point

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java  
 (with props)
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java   
(with props)
Modified:
    
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
    lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml

Added: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=240359&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java 
Fri Aug 26 15:47:04 2005
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.analysis;
+
+// JDK imports
+import java.util.HashMap;
+import java.util.Map;
+import java.util.logging.Logger;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.LogFormatter;
+
+
+/**
+ * Looks up the {@link NutchAnalyzer} plugin registered for a language and
+ * caches the result of each lookup.
+ *
+ * @author Jérôme Charron
+ */
+public class AnalyzerFactory {
+
+  public final static Logger LOG =
+          LogFormatter.getLogger(AnalyzerFactory.class.getName());
+
+  /** The analysis extension point; checked for null in the static block. */
+  private final static ExtensionPoint X_POINT =
+          PluginRepository.getInstance()
+                          .getExtensionPoint(NutchAnalyzer.X_POINT_ID);
+
+  /**
+   * Maps a language code to the Extension found for it. A null value records
+   * that no extension matched, so misses are cached as well as hits.
+   * NOTE(review): plain HashMap without synchronization -- confirm that
+   * callers only use this class from a single thread.
+   */
+  private final static Map CACHE = new HashMap();
+
+  /** Fallback used when no extension matches or it cannot be instantiated. */
+  private final static NutchAnalyzer DEFAULT_ANALYSER =
+                                            new NutchDocumentAnalyzer();
+
+
+  static {
+    if (X_POINT == null) {
+      throw new RuntimeException("x point " + NutchAnalyzer.X_POINT_ID +
+                                 " not found.");
+    }
+  }
+
+
+  /** Static utility class; not instantiable. */
+  private AnalyzerFactory() {}
+
+
+  /**
+   * Returns the appropriate {@link NutchAnalyzer} implementation given a
+   * language code.
+   *
+   * <p>NutchAnalyzer extensions should define the attribute "lang". The first
+   * plugin found whose "lang" attribute equals the specified lang parameter is
+   * used. If none match, if lang is null, or if the matching plugin cannot be
+   * instantiated, then the {@link NutchDocumentAnalyzer} is used.
+   *
+   * @param lang the language code to look up; may be null
+   * @return the instance supplied by the matching extension, or the default
+   *         {@link NutchDocumentAnalyzer}
+   */
+  public static NutchAnalyzer get(String lang) {
+
+    Extension extension = getExtension(lang);
+    if (extension == null) {
+      return DEFAULT_ANALYSER;
+    }
+    try {
+      return (NutchAnalyzer) extension.getExtensionInstance();
+    } catch (PluginRuntimeException pre) {
+      // Best effort: report the failure instead of hiding it, then fall back.
+      LOG.warning("Unable to instantiate analyzer for lang " + lang +
+                  ", using default analyzer: " + pre);
+      return DEFAULT_ANALYSER;
+    }
+  }
+
+  /**
+   * Returns the extension registered for lang, or null if there is none,
+   * running the search only the first time a given lang is requested.
+   */
+  private static Extension getExtension(String lang) {
+
+    // containsKey rather than get() != null: a cached null is a valid
+    // "no extension for this language" answer and must not trigger a rescan.
+    if (!CACHE.containsKey(lang)) {
+      CACHE.put(lang, findExtension(lang));
+    }
+    return (Extension) CACHE.get(lang);
+  }
+
+  /**
+   * Scans the extension point for the first extension whose "lang" attribute
+   * equals lang. Returns null when lang is null or nothing matches.
+   */
+  private static Extension findExtension(String lang) {
+
+    if (lang != null) {
+      Extension[] extensions = X_POINT.getExtentens();
+      for (int i=0; i<extensions.length; i++) {
+        if (lang.equals(extensions[i].getAttribute("lang"))) {
+          return extensions[i];
+        }
+      }
+    }
+    return null;
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=240359&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java 
Fri Aug 26 15:47:04 2005
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.analysis;
+
+// JDK imports
+import java.io.Reader;
+
+// Lucene imports
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+
+
+/** 
+ * Extension point for analysis: the base class for analyzer plugins.
+ * AnalyzerFactory uses the first plugin whose "lang" attribute equals the
+ * requested language, so implementations are selected rather than chained.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class NutchAnalyzer extends Analyzer {
+
+  /** The name of the extension point: this class's fully qualified name. */
+  final static String X_POINT_ID = NutchAnalyzer.class.getName();
+
+  
+  /**
+   * Creates a TokenStream which tokenizes all the text in the provided Reader.
+   */
+  public abstract TokenStream tokenStream(String fieldName, Reader reader);
+
+  
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=240359&r1=240358&r2=240359&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
 (original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
 Fri Aug 26 15:47:04 2005
@@ -13,20 +13,25 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.analysis;
 
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+
+// Lucene imports
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
-import java.io.Reader;
-import java.io.IOException;
 
-/** The analyzer used for Nutch documents.  Uses the JavaCC-defined lexical
- * analyzer {@link NutchDocumentTokenizer}, with no stop list.  This keeps it
- * consistent with query parsing. */
-public class NutchDocumentAnalyzer extends Analyzer {
+
+/**
+ * The analyzer used for Nutch documents.
+ * Uses the JavaCC-defined lexical analyzer {@link NutchDocumentTokenizer},
+ * with no stop list.  This keeps it consistent with query parsing.
+ */
+public class NutchDocumentAnalyzer extends NutchAnalyzer {
 
   /** Analyzer used to index textual content. */
   private static class ContentAnalyzer extends Analyzer {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=240359&r1=240358&r2=240359&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Fri 
Aug 26 15:47:04 2005
@@ -16,25 +16,20 @@
 
 package org.apache.nutch.indexer;
 
-import org.apache.nutch.pagedb.*;
-import org.apache.nutch.linkdb.*;
 import org.apache.nutch.fetcher.*;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.db.*;
-import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.segment.SegmentReader;
 import org.apache.nutch.util.*;
-
-import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 
 import java.util.logging.*;
-import java.util.*;
 import java.io.*;
+import org.apache.nutch.analysis.AnalyzerFactory;
+import org.apache.nutch.analysis.NutchAnalyzer;
 
 /** Creates an index for the output corresponding to a single fetcher run. */
 public class IndexSegment {
@@ -149,7 +144,11 @@
               doc = IndexingFilters.filter(doc, parse, fetcherOutput);
     
               // add the document to the index
-              writer.addDocument(doc);
+              String lang = doc.get("lang");  // null if the field is absent
+              NutchAnalyzer analyzer = AnalyzerFactory.get(lang);
+              LOG.info(" Indexing [" + doc.get("url") +
+                       "] with analyzer " + analyzer + " (" + lang + ")");
+              writer.addDocument(doc, analyzer);
               if (count > 0 && count % LOG_STEP == 0) {
                 curTime = System.currentTimeMillis();
                 LOG.info(" Processed " + count + " records (" +

Modified: lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml?rev=240359&r1=240358&r2=240359&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Fri Aug 26 
15:47:04 2005
@@ -40,5 +40,8 @@
       id="org.apache.nutch.net.URLFilter"
       name="Nutch URL Filter"/>
 
+<extension-point
+      id="org.apache.nutch.analysis.NutchAnalyzer"
+      name="Nutch Analysis"/>
 
 </plugin>


Reply via email to