Author: jnioche
Date: Mon Oct  7 10:08:43 2013
New Revision: 1529813

URL: http://svn.apache.org/r1529813
Log:
NUTCH-1562

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
    nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
    nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
    nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
    nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Oct  7 10:08:43 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1562 Order of execution for scoring filters (jnioche, snagel)
+
 * NUTCH-1640 Reuse ParseUtil instance in ParseSegment (Mitesh Singh Jat via jnioche)
 
 * NUTCH-1639 bin/crawl fails on mac os (various contributors via snagel)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Oct  7 10:08:43 2013
@@ -1172,11 +1172,10 @@
 <property>
   <name>scoring.filter.order</name>
   <value></value>
-  <description>The order in which scoring filters are applied.
-  This may be left empty (in which case all available scoring
-  filters will be applied in the order defined in plugin-includes
-  and plugin-excludes), or a space separated list of implementation
-  classes.
+  <description>The order in which scoring filters are applied.  This
+  may be left empty (in which case all available scoring filters will
+  be applied in system defined order), or a space separated list of
+  implementation classes.
   </description>
 </property>
 

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Mon Oct  7 10:08:43 2013
@@ -17,16 +17,12 @@
 
 package org.apache.nutch.indexer;
 
-import java.util.ArrayList;
-import java.util.HashMap;
-
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.nutch.plugin.*;
+import org.apache.nutch.plugin.PluginRepository;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.util.ObjectCache;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
@@ -42,65 +38,10 @@ public class IndexingFilters {
   private IndexingFilter[] indexingFilters;
 
   public IndexingFilters(Configuration conf) {
-    /* Get indexingfilter.order property */
-    String order = conf.get(INDEXINGFILTER_ORDER);
-    ObjectCache objectCache = ObjectCache.get(conf);
-    this.indexingFilters = (IndexingFilter[]) objectCache
-        .getObject(IndexingFilter.class.getName());
-    if (this.indexingFilters == null) {
-      /*
-       * If ordered filters are required, prepare array of filters based on
-       * property
-       */
-      String[] orderedFilters = null;
-      if (order != null && !order.trim().equals("")) {
-        orderedFilters = order.trim().split("\\s+");
-      }
-      try {
-        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
-            IndexingFilter.X_POINT_ID);
-        if (point == null)
-          throw new RuntimeException(IndexingFilter.X_POINT_ID + " not found.");
-        Extension[] extensions = point.getExtensions();
-        HashMap<String, IndexingFilter> filterMap =
-          new HashMap<String, IndexingFilter>();
-        for (int i = 0; i < extensions.length; i++) {
-          Extension extension = extensions[i];
-          IndexingFilter filter = (IndexingFilter) extension
-              .getExtensionInstance();
-          LOG.info("Adding " + filter.getClass().getName());
-          if (!filterMap.containsKey(filter.getClass().getName())) {
-            filterMap.put(filter.getClass().getName(), filter);
-          }
-        }
-        /*
-         * If no ordered filters required, just get the filters in an
-         * indeterminate order
-         */
-        if (orderedFilters == null) {
-          objectCache.setObject(IndexingFilter.class.getName(),
-              filterMap.values().toArray(
-                  new IndexingFilter[0]));
-          /* Otherwise run the filters in the required order */
-        } else {
-          ArrayList<IndexingFilter> filters = new ArrayList<IndexingFilter>();
-          for (int i = 0; i < orderedFilters.length; i++) {
-            IndexingFilter filter = filterMap
-                .get(orderedFilters[i]);
-            if (filter != null) {
-              filters.add(filter);
-            }
-          }
-          objectCache.setObject(IndexingFilter.class.getName(), filters
-              .toArray(new IndexingFilter[filters.size()]));
-        }
-      } catch (PluginRuntimeException e) {
-        throw new RuntimeException(e);
-      }
-      this.indexingFilters = (IndexingFilter[]) objectCache
-          .getObject(IndexingFilter.class.getName());
-    }
-  }                  
+    indexingFilters = (IndexingFilter[]) PluginRepository.get(conf)
+        .getOrderedPlugins(IndexingFilter.class, IndexingFilter.X_POINT_ID,
+            INDEXINGFILTER_ORDER);
+  }
 
   /** Run all defined filters. */
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,

Modified: nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Mon Oct  7 10:08:43 2013
@@ -17,17 +17,9 @@
 
 package org.apache.nutch.net;
 
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.ExtensionPoint;
-import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.plugin.PluginRepository;
-import org.apache.nutch.util.ObjectCache;
 
-import org.apache.hadoop.conf.Configuration;
 /** Creates and caches {@link URLFilter} implementing plugins.*/
 public class URLFilters {
 
@@ -35,49 +27,9 @@ public class URLFilters {
   private URLFilter[] filters;
 
   public URLFilters(Configuration conf) {
-    String order = conf.get(URLFILTER_ORDER);
-    ObjectCache objectCache = ObjectCache.get(conf);
-    this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class.getName());
-
-    if (this.filters == null) {
-      String[] orderedFilters = null;
-      if (order != null && !order.trim().equals("")) {
-        orderedFilters = order.trim().split("\\s+");
-      }
-
-      try {
-        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
-            URLFilter.X_POINT_ID);
-        if (point == null)
-          throw new RuntimeException(URLFilter.X_POINT_ID + " not found.");
-        Extension[] extensions = point.getExtensions();
-        Map<String, URLFilter> filterMap = new HashMap<String, URLFilter>();
-        for (int i = 0; i < extensions.length; i++) {
-          Extension extension = extensions[i];
-          URLFilter filter = (URLFilter) extension.getExtensionInstance();
-          if (!filterMap.containsKey(filter.getClass().getName())) {
-            filterMap.put(filter.getClass().getName(), filter);
-          }
-        }
-        if (orderedFilters == null) {
-          objectCache.setObject(URLFilter.class.getName(), filterMap.values().toArray(
-              new URLFilter[0]));
-        } else {
-          ArrayList<URLFilter> filters = new ArrayList<URLFilter>();
-          for (int i = 0; i < orderedFilters.length; i++) {
-            URLFilter filter = filterMap.get(orderedFilters[i]);
-            if (filter != null) {
-              filters.add(filter);
-            }
-          }
-          objectCache.setObject(URLFilter.class.getName(), filters
-              .toArray(new URLFilter[filters.size()]));
-        }
-      } catch (PluginRuntimeException e) {
-        throw new RuntimeException(e);
-      }
-      this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class.getName());
-    }
+    this.filters = (URLFilter[]) PluginRepository.get(conf)
+        .getOrderedPlugins(URLFilter.class, URLFilter.X_POINT_ID,
+            URLFILTER_ORDER);
   }
 
   /** Run all defined filters. Assume logical AND. */

Modified: nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Mon Oct  7 10:08:43 2013
@@ -17,12 +17,8 @@
 
 package org.apache.nutch.parse;
 
-import java.util.ArrayList;
-import java.util.HashMap;
-
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.plugin.*;
-import org.apache.nutch.util.ObjectCache;
+import org.apache.nutch.plugin.PluginRepository;
 import org.apache.hadoop.conf.Configuration;
 
 import org.w3c.dom.DocumentFragment;
@@ -35,59 +31,10 @@ public class HtmlParseFilters {
   public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";
 
   public HtmlParseFilters(Configuration conf) {
-        String order = conf.get(HTMLPARSEFILTER_ORDER);
-        ObjectCache objectCache = ObjectCache.get(conf);
-        this.htmlParseFilters = (HtmlParseFilter[]) objectCache.getObject(HtmlParseFilter.class.getName());
-        if (htmlParseFilters == null) {
-          /*
-           * If ordered filters are required, prepare array of filters based on
-           * property
-           */
-          String[] orderedFilters = null;
-          if (order != null && !order.trim().equals("")) {
-            orderedFilters = order.trim().split("\\s+");
-          }
-            HashMap<String, HtmlParseFilter> filterMap =
-              new HashMap<String, HtmlParseFilter>();
-            try {
-                ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.X_POINT_ID);
-                if (point == null)
-                    throw new RuntimeException(HtmlParseFilter.X_POINT_ID + " not found.");
-                Extension[] extensions = point.getExtensions();
-                for (int i = 0; i < extensions.length; i++) {
-                    Extension extension = extensions[i];
-                    HtmlParseFilter parseFilter = (HtmlParseFilter) extension.getExtensionInstance();
-                    if (!filterMap.containsKey(parseFilter.getClass().getName())) {
-                        filterMap.put(parseFilter.getClass().getName(), parseFilter);
-                    }
-                }
-                HtmlParseFilter[] htmlParseFilters = filterMap.values().toArray(new HtmlParseFilter[filterMap.size()]);
-                /*
-                 * If no ordered filters required, just get the filters in an
-                 * indeterminate order
-                 */
-                if (orderedFilters == null) {
-                  objectCache.setObject(HtmlParseFilter.class.getName(), htmlParseFilters);
-                }
-                /* Otherwise run the filters in the required order */
-                else {
-                  ArrayList<HtmlParseFilter> filters = new ArrayList<HtmlParseFilter>();
-                  for (int i = 0; i < orderedFilters.length; i++) {
-                    HtmlParseFilter filter = filterMap
-                        .get(orderedFilters[i]);
-                    if (filter != null) {
-                      filters.add(filter);
-                    }
-                  }
-                  objectCache.setObject(HtmlParseFilter.class.getName(), filters
-                      .toArray(new HtmlParseFilter[filters.size()]));
-                }
-            } catch (PluginRuntimeException e) {
-                throw new RuntimeException(e);
-            }
-            this.htmlParseFilters = (HtmlParseFilter[]) objectCache.getObject(HtmlParseFilter.class.getName());
-        }
-    }                  
+    htmlParseFilters = (HtmlParseFilter[]) PluginRepository.get(conf)
+        .getOrderedPlugins(HtmlParseFilter.class, HtmlParseFilter.X_POINT_ID,
+            HTMLPARSEFILTER_ORDER);
+  }
 
   /** Run all defined filters. */
   public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {

Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Mon Oct  7 10:08:43 2013
@@ -16,10 +16,12 @@
  */
 package org.apache.nutch.plugin;
 
+import java.lang.reflect.Array;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.WeakHashMap;
 import java.util.List;
@@ -29,6 +31,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.ObjectCache;
 
 /**
  * The plugin repositority is a registry of all plugins.
@@ -371,6 +374,81 @@ public class PluginRepository {
     }
     return map;
   }
+  
+  /**
+   * Get ordered list of plugins. Filter and normalization plugins are applied
+   * in a configurable "pipeline" order, e.g., if one plugin depends on the
+   * output of another plugin. This method loads the plugins in the order
+   * defined by orderProperty. If orderProperty is empty or unset, all active
+   * plugins of the given interface and extension point are loaded.
+   * 
+   * @param clazz
+   *          interface class implemented by required plugins
+   * @param xPointId
+   *          extension point id of required plugins
+   * @param orderProperty
+   *          property name defining plugin order
+   * @return array of plugin instances
+   */
+  public synchronized Object[] getOrderedPlugins(Class<?> clazz, String xPointId,
+      String orderProperty) {
+    Object[] filters;
+    ObjectCache objectCache = ObjectCache.get(conf);
+    filters = (Object[]) objectCache.getObject(clazz.getName());
+
+    if (filters == null) {
+      String order = conf.get(orderProperty);
+      List<String> orderOfFilters = new ArrayList<String>();
+      boolean userDefinedOrder = false;
+      if (order != null && !order.trim().isEmpty()) {
+        orderOfFilters = Arrays.asList(order.trim().split("\\s+"));
+        userDefinedOrder = true;
+      }
+
+      try {
+        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+            xPointId);
+        if (point == null)
+          throw new RuntimeException(xPointId + " not found.");
+        Extension[] extensions = point.getExtensions();
+        HashMap<String, Object> filterMap = new HashMap<String, Object>();
+        for (int i = 0; i < extensions.length; i++) {
+          Extension extension = extensions[i];
+          Object filter = extension.getExtensionInstance();
+          if (!filterMap.containsKey(filter.getClass().getName())) {
+            filterMap.put(filter.getClass().getName(), filter);
+            if (!userDefinedOrder)
+              orderOfFilters.add(filter.getClass().getName());
+          }
+        }
+        List<Object> sorted = new ArrayList<Object>();
+        for (String orderedFilter : orderOfFilters) {
+          Object f = filterMap.get(orderedFilter);
+          if (f == null) {
+            LOG.error(clazz.getSimpleName() + " : " + orderedFilter
+                + " declared in configuration property " + orderProperty
+                + " but not found in an active plugin - ignoring.");
+            continue;
+          }
+          sorted.add(f);
+        }
+        Object[] filter = (Object[]) Array.newInstance(clazz, sorted.size());
+        for (int i = 0; i < sorted.size(); i++) {
+          filter[i] = sorted.get(i);
+          if (LOG.isTraceEnabled()) {
+            LOG.trace(clazz.getSimpleName() + " : filters[" + i + "] = "
+                + filter[i].getClass());
+          }
+        }
+        objectCache.setObject(clazz.getName(), filter);
+      } catch (PluginRuntimeException e) {
+        throw new RuntimeException(e);
+      }
+
+      filters = (Object[]) objectCache.getObject(clazz.getName());
+    }
+    return filters;
+  }
 
   /**
    * Loads all necessary dependencies for a selected plugin, and then runs one

Modified: nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java Mon Oct  7 10:08:43 2013
@@ -18,25 +18,19 @@
 package org.apache.nutch.scoring;
 
 import java.util.Collection;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map.Entry;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.ExtensionPoint;
-import org.apache.nutch.plugin.PluginRuntimeException;
 import org.apache.nutch.plugin.PluginRepository;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.ObjectCache;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.Text;
 
 /**
  * Creates and caches {@link ScoringFilter} implementing plugins.
@@ -49,43 +43,9 @@ public class ScoringFilters extends Conf
 
   public ScoringFilters(Configuration conf) {
     super(conf);
-    ObjectCache objectCache = ObjectCache.get(conf);
-    String order = conf.get("scoring.filter.order");
-    this.filters = (ScoringFilter[]) objectCache.getObject(ScoringFilter.class.getName());
-
-    if (this.filters == null) {
-      String[] orderedFilters = null;
-      if (order != null && !order.trim().equals("")) {
-        orderedFilters = order.trim().split("\\s+");
-      }
-
-      try {
-        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(ScoringFilter.X_POINT_ID);
-        if (point == null) throw new RuntimeException(ScoringFilter.X_POINT_ID + " not found.");
-        Extension[] extensions = point.getExtensions();
-        HashMap<String, ScoringFilter> filterMap =
-          new HashMap<String, ScoringFilter>();
-        for (int i = 0; i < extensions.length; i++) {
-          Extension extension = extensions[i];
-          ScoringFilter filter = (ScoringFilter) extension.getExtensionInstance();
-          if (!filterMap.containsKey(filter.getClass().getName())) {
-            filterMap.put(filter.getClass().getName(), filter);
-          }
-        }
-        if (orderedFilters == null) {
-          objectCache.setObject(ScoringFilter.class.getName(), filterMap.values().toArray(new ScoringFilter[0]));
-        } else {
-          ScoringFilter[] filter = new ScoringFilter[orderedFilters.length];
-          for (int i = 0; i < orderedFilters.length; i++) {
-            filter[i] = filterMap.get(orderedFilters[i]);
-          }
-          objectCache.setObject(ScoringFilter.class.getName(), filter);
-        }
-      } catch (PluginRuntimeException e) {
-        throw new RuntimeException(e);
-      }
-      this.filters = (ScoringFilter[]) objectCache.getObject(ScoringFilter.class.getName());
-    }
+    this.filters = (ScoringFilter[]) PluginRepository.get(conf)
+        .getOrderedPlugins(ScoringFilter.class, ScoringFilter.X_POINT_ID,
+            "scoring.filter.order");
   }
 
   /** Calculate a sort value for Generate. */


Reply via email to