Repository: nutch
Updated Branches:
  refs/heads/2.x b7f3fce42 -> 5c3a38128


Fix for NUTCH-2305 generate.min.score doesn't work, contributed by Kiyonari 
Harigae


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/5c3a3812
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/5c3a3812
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/5c3a3812

Branch: refs/heads/2.x
Commit: 5c3a381289f158f69b4f7ebe7b059cd7d9ba7638
Parents: b7f3fce
Author: Sebastian Nagel <sna...@apache.org>
Authored: Mon Aug 22 22:40:19 2016 +0200
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Mon Aug 22 22:40:19 2016 +0200

----------------------------------------------------------------------
 conf/nutch-default.xml                               | 7 +++++++
 src/java/org/apache/nutch/crawl/GeneratorMapper.java | 7 +++++++
 2 files changed, 14 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/5c3a3812/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index f1a16fc..d2181c5 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -624,6 +624,13 @@
   updatedb will generate identical fetchlists.</description>
 </property>
 
+<property>
+  <name>generate.min.score</name>
+  <value>0.0</value>
+  <description>Select only entries with a score larger than
+  generate.min.score.</description>
+</property>
+
 <!-- urlpartitioner properties -->
 <property>
   <name>partition.url.mode</name>

http://git-wip-us.apache.org/repos/asf/nutch/blob/5c3a3812/src/java/org/apache/nutch/crawl/GeneratorMapper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/GeneratorMapper.java b/src/java/org/apache/nutch/crawl/GeneratorMapper.java
index b76133f..d07b0b5 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorMapper.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorMapper.java
@@ -44,6 +44,7 @@ GoraMapper<String, WebPage, SelectorEntry, WebPage> {
   private long curTime;
   private SelectorEntry entry = new SelectorEntry();
   private int maxDistance;
+  private float scoreThreshold;
 
   @Override
   public void map(String reversedUrl, WebPage page, Context context)
@@ -101,6 +102,11 @@ GoraMapper<String, WebPage, SelectorEntry, WebPage> {
     } catch (ScoringFilterException e) {
       // ignore
     }
+
+    // consider only entries with a score superior to the threshold
+    if (scoreThreshold != Float.NaN && score < scoreThreshold)
+      return;
+
     entry.set(url, score);
     context.write(entry, page);
   }
@@ -123,5 +129,6 @@ GoraMapper<String, WebPage, SelectorEntry, WebPage> {
         System.currentTimeMillis());
     schedule = FetchScheduleFactory.getFetchSchedule(conf);
     scoringFilters = new ScoringFilters(conf);
+    scoreThreshold = conf.getFloat(GeneratorJob.GENERATOR_MIN_SCORE, Float.NaN);
   }
 }

Reply via email to