Repository: nutch Updated Branches: refs/heads/2.x b7f3fce42 -> 5c3a38128
Fix for NUTCH-2305 generate.min.score doesn't work, contributed by Kiyonari Harigae Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/5c3a3812 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/5c3a3812 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/5c3a3812 Branch: refs/heads/2.x Commit: 5c3a381289f158f69b4f7ebe7b059cd7d9ba7638 Parents: b7f3fce Author: Sebastian Nagel <sna...@apache.org> Authored: Mon Aug 22 22:40:19 2016 +0200 Committer: Sebastian Nagel <sna...@apache.org> Committed: Mon Aug 22 22:40:19 2016 +0200 ---------------------------------------------------------------------- conf/nutch-default.xml | 7 +++++++ src/java/org/apache/nutch/crawl/GeneratorMapper.java | 7 +++++++ 2 files changed, 14 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/5c3a3812/conf/nutch-default.xml ---------------------------------------------------------------------- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index f1a16fc..d2181c5 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -624,6 +624,13 @@ updatedb will generate identical fetchlists.</description> </property> +<property> + <name>generate.min.score</name> + <value>0.0</value> + <description>Select only entries with a score larger than + generate.min.score.</description> +</property> + <!-- urlpartitioner properties --> <property> <name>partition.url.mode</name> http://git-wip-us.apache.org/repos/asf/nutch/blob/5c3a3812/src/java/org/apache/nutch/crawl/GeneratorMapper.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/GeneratorMapper.java b/src/java/org/apache/nutch/crawl/GeneratorMapper.java index b76133f..d07b0b5 100644 --- a/src/java/org/apache/nutch/crawl/GeneratorMapper.java +++ b/src/java/org/apache/nutch/crawl/GeneratorMapper.java @@ -44,6 +44,7 @@ GoraMapper<String, WebPage, SelectorEntry, WebPage> { private long curTime; private SelectorEntry entry = new SelectorEntry(); private int maxDistance; + private float scoreThreshold; @Override public void map(String reversedUrl, WebPage page, Context context) @@ -101,6 +102,11 @@ GoraMapper<String, WebPage, SelectorEntry, WebPage> { } catch (ScoringFilterException e) { // ignore } + + // consider only entries with a score superior to the threshold + if (scoreThreshold != Float.NaN && score < scoreThreshold) + return; + entry.set(url, score); context.write(entry, page); } @@ -123,5 +129,6 @@ GoraMapper<String, WebPage, SelectorEntry, WebPage> { System.currentTimeMillis()); schedule = FetchScheduleFactory.getFetchSchedule(conf); scoringFilters = new ScoringFilters(conf); + scoreThreshold = conf.getFloat(GeneratorJob.GENERATOR_MIN_SCORE, Float.NaN); } }