nutch git commit: Fix for NUTCH-2305 generate.min.score doesn't work, contributed by Kiyonari Harigae
Repository: nutch Updated Branches: refs/heads/2.x b7f3fce42 -> 5c3a38128 Fix for NUTCH-2305 generate.min.score doesn't work, contributed by Kiyonari Harigae Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/5c3a3812 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/5c3a3812 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/5c3a3812 Branch: refs/heads/2.x Commit: 5c3a381289f158f69b4f7ebe7b059cd7d9ba7638 Parents: b7f3fce Author: Sebastian Nagel Authored: Mon Aug 22 22:40:19 2016 +0200 Committer: Sebastian Nagel Committed: Mon Aug 22 22:40:19 2016 +0200 -- conf/nutch-default.xml | 7 +++ src/java/org/apache/nutch/crawl/GeneratorMapper.java | 7 +++ 2 files changed, 14 insertions(+) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/5c3a3812/conf/nutch-default.xml -- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index f1a16fc..d2181c5 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -624,6 +624,13 @@ updatedb will generate identical fetchlists. + + generate.min.score + 0.0 + Select only entries with a score larger than + generate.min.score. 
+ + partition.url.mode http://git-wip-us.apache.org/repos/asf/nutch/blob/5c3a3812/src/java/org/apache/nutch/crawl/GeneratorMapper.java -- diff --git a/src/java/org/apache/nutch/crawl/GeneratorMapper.java b/src/java/org/apache/nutch/crawl/GeneratorMapper.java index b76133f..d07b0b5 100644 --- a/src/java/org/apache/nutch/crawl/GeneratorMapper.java +++ b/src/java/org/apache/nutch/crawl/GeneratorMapper.java @@ -44,6 +44,7 @@ GoraMapper { private long curTime; private SelectorEntry entry = new SelectorEntry(); private int maxDistance; + private float scoreThreshold; @Override public void map(String reversedUrl, WebPage page, Context context) @@ -101,6 +102,11 @@ GoraMapper { } catch (ScoringFilterException e) { // ignore } + +// consider only entries with a score superior to the threshold +if (scoreThreshold != Float.NaN && score < scoreThreshold) + return; + entry.set(url, score); context.write(entry, page); } @@ -123,5 +129,6 @@ GoraMapper { System.currentTimeMillis()); schedule = FetchScheduleFactory.getFetchSchedule(conf); scoringFilters = new ScoringFilters(conf); +scoreThreshold = conf.getFloat(GeneratorJob.GENERATOR_MIN_SCORE, Float.NaN); } }
[1/5] nutch git commit: Allow Fetcher to optionally store robots.txt content (if property fetcher.store.robotstxt == true). Improved RobotRulesParser command-line tool.
Repository: nutch Updated Branches: refs/heads/master d37b7ce13 -> 3fca1a590 Allow Fetcher to optionally store robots.txt content (if property fetcher.store.robotstxt == true). Improved RobotRulesParser command-line tool. Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6c9cca5e Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6c9cca5e Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6c9cca5e Branch: refs/heads/master Commit: 6c9cca5e55e43458cbc5e59b8591e4d27ac425a2 Parents: d37b7ce Author: Sebastian Nagel Authored: Wed May 25 14:24:11 2016 +0200 Committer: Sebastian Nagel Committed: Fri Aug 19 12:07:06 2016 +0200 -- conf/nutch-default.xml | 8 + .../org/apache/nutch/fetcher/FetcherThread.java | 17 +- .../org/apache/nutch/protocol/Protocol.java | 20 ++- .../apache/nutch/protocol/RobotRulesParser.java | 174 +++ .../nutch/protocol/http/api/HttpBase.java | 29 ++-- .../protocol/http/api/HttpRobotRulesParser.java | 52 +- .../org/apache/nutch/protocol/file/File.java| 13 +- .../java/org/apache/nutch/protocol/ftp/Ftp.java | 9 +- .../nutch/protocol/ftp/FtpRobotRulesParser.java | 17 +- 9 files changed, 265 insertions(+), 74 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/conf/nutch-default.xml -- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 67326ee..8c329bc 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -982,6 +982,14 @@ fetcher.bandwidth.target. Defaults to 30 and must be at least 1. + + fetcher.store.robotstxt + false + If true, fetcher will store the robots.txt response + content and status for debugging or archival purposes. 
+ + + http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/java/org/apache/nutch/fetcher/FetcherThread.java -- diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index e57e735..cac16ff 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -22,6 +22,7 @@ import java.net.URL; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Map.Entry; import java.util.concurrent.atomic.AtomicInteger; @@ -129,6 +130,8 @@ public class FetcherThread extends Thread { private AtomicLong bytes; + private List robotsTxtContent = null; + //Used by the REST service private FetchNode fetchNode; private boolean reportToNutchServer; @@ -188,6 +191,9 @@ public class FetcherThread extends Thread { "fetcher.follow.outlinks.num.links", 4); outlinksDepthDivisor = conf.getInt( "fetcher.follow.outlinks.depth.divisor", 2); +if (conf.getBoolean("fetcher.store.robotstxt", false)) { + robotsTxtContent = new LinkedList(); +} } @SuppressWarnings("fallthrough") @@ -256,7 +262,16 @@ public class FetcherThread extends Thread { redirecting = false; Protocol protocol = this.protocolFactory.getProtocol(fit.url .toString()); -BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum); +BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, robotsTxtContent); +if (robotsTxtContent != null) { + for (Content robotsTxt : robotsTxtContent) { +LOG.debug("fetched and stored robots.txt {}", +robotsTxt.getUrl()); +output.collect(new Text(robotsTxt.getUrl()), +new NutchWritable(robotsTxt)); + } + robotsTxtContent.clear(); +} if (!rules.isAllowed(fit.u.toString())) { // unblock ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true); http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/java/org/apache/nutch/protocol/Protocol.java -- diff --git 
a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java index efd0100..ddebffb 100755 --- a/src/java/org/apache/nutch/protocol/Protocol.java +++ b/src/java/org/apache/nutch/protocol/Protocol.java @@ -17,6 +17,8 @@ package org.apache.nutch.protocol; +import java.util.List; + // Hadoop imports import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.io.Text; @@ -38,13 +40,21 @@ public interface Protocol extends Pluggable, Configurable { ProtocolOutput getProt
[5/5] nutch git commit: NUTCH-2300 Fetcher to optionally save robots.txt Merge branch 'SaveRobotsTxt' of https://github.com/sebastian-nagel/nutch, this closes #141
NUTCH-2300 Fetcher to optionally save robots.txt Merge branch 'SaveRobotsTxt' of https://github.com/sebastian-nagel/nutch, this closes #141 Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/3fca1a59 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/3fca1a59 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/3fca1a59 Branch: refs/heads/master Commit: 3fca1a5902a151867733806fc0511f18ab0b4e6f Parents: d37b7ce f3af9a5 Author: Sebastian Nagel Authored: Mon Aug 22 23:50:16 2016 +0200 Committer: Sebastian Nagel Committed: Mon Aug 22 23:50:16 2016 +0200 -- conf/nutch-default.xml | 10 ++ .../org/apache/nutch/fetcher/FetcherThread.java | 29 +++- .../org/apache/nutch/parse/ParseSegment.java| 11 +- .../org/apache/nutch/protocol/Protocol.java | 20 ++- .../apache/nutch/protocol/RobotRulesParser.java | 174 +++ .../nutch/protocol/http/api/HttpBase.java | 29 ++-- .../protocol/http/api/HttpRobotRulesParser.java | 52 +- .../org/apache/nutch/protocol/file/File.java| 13 +- .../java/org/apache/nutch/protocol/ftp/Ftp.java | 9 +- .../nutch/protocol/ftp/FtpRobotRulesParser.java | 17 +- 10 files changed, 286 insertions(+), 78 deletions(-) --
[4/5] nutch git commit: simplified code: use diamond operator
simplified code: use diamond operator Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f3af9a54 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f3af9a54 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f3af9a54 Branch: refs/heads/master Commit: f3af9a5467eb74a9f85adf47a0f4814fa0b3392d Parents: 33cdca7 Author: Sebastian Nagel Authored: Mon Aug 22 17:39:40 2016 +0200 Committer: Sebastian Nagel Committed: Mon Aug 22 17:39:40 2016 +0200 -- .../src/java/org/apache/nutch/protocol/http/api/HttpBase.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/f3af9a54/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java -- diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 4d1a0cc..90b256a 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -573,7 +573,7 @@ public abstract class HttpBase implements Protocol { if (input == null || input.length == 0) { return new HashMap(); } -HashMap hm = new HashMap(); +HashMap hm = new HashMap<>(); for (int i = 0; i < input.length; i++) { if (!"".equals(input[i].trim())) { hm.put(input[i], input[i]);
[2/5] nutch git commit: Ignore robots.txt when parsing segment, refactored storing of robots.txt in FetcherThread
Ignore robots.txt when parsing segment, refactored storing of robots.txt in FetcherThread Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/264eea01 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/264eea01 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/264eea01 Branch: refs/heads/master Commit: 264eea01a4d868578dcf641d6ce405444d276929 Parents: 6c9cca5 Author: Sebastian Nagel Authored: Fri Aug 19 15:06:14 2016 +0200 Committer: Sebastian Nagel Committed: Fri Aug 19 15:06:14 2016 +0200 -- .../org/apache/nutch/fetcher/FetcherThread.java | 20 ++-- .../org/apache/nutch/parse/ParseSegment.java| 11 +++ 2 files changed, 21 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/264eea01/src/java/org/apache/nutch/fetcher/FetcherThread.java -- diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index cac16ff..6024b8d 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -264,12 +264,7 @@ public class FetcherThread extends Thread { .toString()); BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, robotsTxtContent); if (robotsTxtContent != null) { - for (Content robotsTxt : robotsTxtContent) { -LOG.debug("fetched and stored robots.txt {}", -robotsTxt.getUrl()); -output.collect(new Text(robotsTxt.getUrl()), -new NutchWritable(robotsTxt)); - } + outputRobotsTxt(robotsTxtContent); robotsTxtContent.clear(); } if (!rules.isAllowed(fit.u.toString())) { @@ -758,6 +753,19 @@ public class FetcherThread extends Thread { return null; } + private void outputRobotsTxt(List robotsTxtContent) { +for (Content robotsTxt : robotsTxtContent) { + LOG.debug("fetched and stored robots.txt {}", + robotsTxt.getUrl()); + try { +output.collect(new Text(robotsTxt.getUrl()), +new NutchWritable(robotsTxt)); + } catch (IOException e) { 
+LOG.error("fetcher caught: {}", e.toString()); + } +} + } + private void updateStatus(int bytesInPage) throws IOException { pages.incrementAndGet(); bytes.addAndGet(bytesInPage); http://git-wip-us.apache.org/repos/asf/nutch/blob/264eea01/src/java/org/apache/nutch/parse/ParseSegment.java -- diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index b008bed..1a0da90 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -84,11 +84,14 @@ public class ParseSegment extends NutchTool implements Tool, key = newKey; } -int status = Integer.parseInt(content.getMetadata().get( -Nutch.FETCH_STATUS_KEY)); -if (status != CrawlDatum.STATUS_FETCH_SUCCESS) { +String fetchStatus = content.getMetadata().get(Nutch.FETCH_STATUS_KEY); +if (fetchStatus == null) { + // no fetch status, skip document + LOG.debug("Skipping {} as content has no fetch status", key); + return; +} else if (Integer.parseInt(fetchStatus) != CrawlDatum.STATUS_FETCH_SUCCESS) { // content not fetched successfully, skip document - LOG.debug("Skipping " + key + " as content is not fetched successfully"); + LOG.debug("Skipping {} as content is not fetched successfully", key); return; }
[3/5] nutch git commit: add hint and log warning that fetcher.store.robotstxt works only in combination with fetcher.store.content
add hint and log warning that fetcher.store.robotstxt works only in combination with fetcher.store.content Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/33cdca76 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/33cdca76 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/33cdca76 Branch: refs/heads/master Commit: 33cdca76ac91a63445d4e761081e8124a23413af Parents: 264eea0 Author: Sebastian Nagel Authored: Fri Aug 19 15:32:34 2016 +0200 Committer: Sebastian Nagel Committed: Fri Aug 19 15:32:34 2016 +0200 -- conf/nutch-default.xml | 6 -- src/java/org/apache/nutch/fetcher/FetcherThread.java | 6 +- 2 files changed, 9 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/33cdca76/conf/nutch-default.xml -- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 8c329bc..ec9d2d4 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -985,8 +985,10 @@ fetcher.store.robotstxt false - If true, fetcher will store the robots.txt response - content and status for debugging or archival purposes. + If true (and fetcher.store.content is also true), + fetcher will store the robots.txt response content and status for + debugging or archival purposes. The robots.txt is added to the + content/ folder of the fetched segment. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/33cdca76/src/java/org/apache/nutch/fetcher/FetcherThread.java -- diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 6024b8d..449e220 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -192,7 +192,11 @@ public class FetcherThread extends Thread { outlinksDepthDivisor = conf.getInt( "fetcher.follow.outlinks.depth.divisor", 2); if (conf.getBoolean("fetcher.store.robotstxt", false)) { - robotsTxtContent = new LinkedList(); + if (storingContent) { +robotsTxtContent = new LinkedList(); + } else { +LOG.warn("Ignoring fetcher.store.robotstxt because not storing content (fetcher.store.content)!"); + } } }
[1/2] nutch git commit: NUTCH-2302 RAMConfManager Could Be Constructed With Custom Configuration
Repository: nutch Updated Branches: refs/heads/2.x 5c3a38128 -> 22683a1df NUTCH-2302 RAMConfManager Could Be Constructed With Custom Configuration Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/fd722c89 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/fd722c89 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/fd722c89 Branch: refs/heads/2.x Commit: fd722c896468fe047758891d75a58259c88289d8 Parents: b7f3fce Author: Furkan KAMACI Authored: Sat Aug 20 16:08:47 2016 +0300 Committer: Furkan KAMACI Committed: Sun Aug 21 00:40:45 2016 +0300 -- .../apache/nutch/api/impl/RAMConfManager.java | 65 1 file changed, 65 insertions(+) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/fd722c89/src/java/org/apache/nutch/api/impl/RAMConfManager.java -- diff --git a/src/java/org/apache/nutch/api/impl/RAMConfManager.java b/src/java/org/apache/nutch/api/impl/RAMConfManager.java index 2afd658..13c05fd 100644 --- a/src/java/org/apache/nutch/api/impl/RAMConfManager.java +++ b/src/java/org/apache/nutch/api/impl/RAMConfManager.java @@ -33,19 +33,44 @@ import org.apache.nutch.util.NutchConfiguration; import com.google.common.collect.Maps; +/** + * Configuration manager which holds a map of {@link Configuration} type configurations and ids. + */ public class RAMConfManager implements ConfManager { private Map configurations = Maps.newConcurrentMap(); private AtomicInteger newConfigId = new AtomicInteger(); + /** + * Public constructor which creates a default configuration with id of {@link ConfigResource#DEFAULT}. + */ public RAMConfManager() { configurations.put(ConfigResource.DEFAULT, NutchConfiguration.create()); } + /** + * Public constructor which accepts a configuration id and {@link Configuration} type configuration. + */ + public RAMConfManager(String confId, Configuration configuration) { +configurations.put(confId, configuration); + } + + /** + * Lists configuration keys. 
+ * + * @return Set of configuration keys + */ public Set list() { return configurations.keySet(); } + /** + * Returns configuration map for given configuration id. + * + * @param confId Configuration id. + * @return Configuration for given configuration id. + * {@link ConfigResource#DEFAULT} is used if given configuration id is null. + */ public Configuration get(String confId) { if (confId == null) { return configurations.get(ConfigResource.DEFAULT); @@ -53,6 +78,13 @@ public class RAMConfManager implements ConfManager { return configurations.get(confId); } + /** + * Returns configuration map for given configuration id. + * An empty map is returned if a configuration could not be retrieved for given configuration id. + * + * @param confId Configuration id + * @return map of configurations + */ public Map getAsMap(String confId) { Configuration configuration = configurations.get(confId); if (configuration == null) { @@ -68,6 +100,13 @@ public class RAMConfManager implements ConfManager { return configMap; } + /** + * Sets a property for the configuration which has given configuration id. + * + * @param confId Configuration id + * @param propName property name to set + * @param propValue property value to set + */ public void setProperty(String confId, String propName, String propValue) { if (!configurations.containsKey(confId)) { throw new IllegalArgumentException("Unknown configId '" + confId + "'"); @@ -76,10 +115,23 @@ public class RAMConfManager implements ConfManager { conf.set(propName, propValue); } + /** + * Deletes configuration for given configuration id. + * + * @param confId Configuration id + */ public void delete(String confId) { configurations.remove(confId); } + /** + * Creates hadoop configuration for given Nutch configuration. + * Checks whether it can create a Nutch configuration or not before it creates. + * Throws {@link IllegalArgumentException} if can not pass {@link #canCreate(NutchConfig)}. 
+ * + * @param nutchConfig Nutch configuration + * @return created configuration id + */ @Override public String create(NutchConfig nutchConfig) { if (StringUtils.isBlank(nutchConfig.getConfigId())) { @@ -94,6 +146,14 @@ public class RAMConfManager implements ConfManager { return nutchConfig.getConfigId(); } + /** + * Checks can create a Nutch configuration or not. + * + * @param nutchConfig Nutch configuration + * @return True if forcing is enabled at Nutch configuration. + * Otherwise makes a check based on whether there is an existing configuration at configuration set
[2/2] nutch git commit: Merge branch '2.x' of https://git-wip-us.apache.org/repos/asf/nutch into 2.x
Merge branch '2.x' of https://git-wip-us.apache.org/repos/asf/nutch into 2.x Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/22683a1d Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/22683a1d Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/22683a1d Branch: refs/heads/2.x Commit: 22683a1df83fb8100acdda388382e181c1b5b43d Parents: fd722c8 5c3a381 Author: Lewis John McGibbney Authored: Mon Aug 22 20:23:55 2016 -0700 Committer: Lewis John McGibbney Committed: Mon Aug 22 20:23:55 2016 -0700 -- conf/nutch-default.xml | 7 +++ src/java/org/apache/nutch/crawl/GeneratorMapper.java | 7 +++ 2 files changed, 14 insertions(+) --