[
https://issues.apache.org/jira/browse/NUTCH-2543?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16415731#comment-16415731
]
ASF GitHub Bot commented on NUTCH-2543:
---
sebastian-nagel closed pull request #303: fix for NUTCH-2543 contributed by
Jurian Broertjes
URL: https://github.com/apache/nutch/pull/303
This is a PR merged from a forked repository. Because GitHub hides the
original diff of a foreign (forked) pull request once it is merged, the
diff is reproduced below for the sake of provenance:
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 9be246a58..ee1b4ba97 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -70,12 +70,13 @@
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.AbstractChecker;
import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.SegmentReaderUtil;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
-import org.apache.nutch.util.SegmentReaderUtil;
import org.apache.commons.jexl2.Expression;
/**
@@ -84,13 +85,15 @@
* @author Andrzej Bialecki
*
*/
-public class CrawlDbReader extends Configured implements Closeable, Tool {
+public class CrawlDbReader extends AbstractChecker implements Closeable {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private MapFile.Reader[] readers = null;
+ protected String crawlDb;
+
private void openReaders(String crawlDb, Configuration config)
throws IOException {
if (readers != null)
@@ -110,6 +113,7 @@ private void closeReaders() {
}
}
+readers = null;
}
public static class CrawlDatumCsvOutputFormat extends
@@ -593,15 +597,25 @@ public CrawlDatum get(String crawlDb, String url,
Configuration config)
return res;
}
- public void readUrl(String crawlDb, String url, Configuration config)
+ protected int process(String line, StringBuilder output) throws Exception {
+Job job = NutchJob.getInstance(getConf());
+Configuration config = job.getConfiguration();
+// Close readers, so we know we're not working on stale data
+closeReaders();
+readUrl(this.crawlDb, line, config, output);
+return 0;
+ }
+
+ public void readUrl(String crawlDb, String url, Configuration config,
StringBuilder output)
throws IOException {
CrawlDatum res = get(crawlDb, url, config);
-System.out.println("URL: " + url);
+output.append("URL: " + url + "\n");
if (res != null) {
- System.out.println(res);
+ output.append(res);
} else {
- System.out.println("not found");
+ output.append("not found");
}
+output.append("\n");
}
public void processDumpJob(String crawlDb, String output,
@@ -792,7 +806,8 @@ public void processTopNJob(String crawlDb, long topN, float
min,
}
- public int run(String[] args) throws IOException, InterruptedException,
ClassNotFoundException {
+
+ public int run(String[] args) throws IOException, InterruptedException,
ClassNotFoundException, Exception {
@SuppressWarnings("resource")
CrawlDbReader dbr = new CrawlDbReader();
@@ -827,8 +842,11 @@ public int run(String[] args) throws IOException,
InterruptedException, ClassNot
}
String param = null;
String crawlDb = args[0];
+this.crawlDb = crawlDb;
+int numConsumed = 0;
Job job = NutchJob.getInstance(getConf());
Configuration config = job.getConfiguration();
+
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-stats")) {
boolean toSort = false;
@@ -874,7 +892,9 @@ public int run(String[] args) throws IOException,
InterruptedException, ClassNot
dbr.processDumpJob(crawlDb, param, config, format, regex, status,
retry, expr, sample);
} else if (args[i].equals("-url")) {
param = args[++i];
-dbr.readUrl(crawlDb, param, config);
+StringBuilder output = new StringBuilder();
+dbr.readUrl(crawlDb, param, config, output);
+System.out.print(output);
} else if (args[i].equals("-topN")) {
param = args[++i];
long topN = Long.parseLong(param);
@@ -884,11 +904,18 @@ public int run(String[] args) throws IOException,
InterruptedException, ClassNot
min = Float.parseFloat(args[++i]);
}
dbr.processTopNJob(crawlDb, topN, min, param, config);
+ } else if ((numConsumed = super.parseArgs(args, i)) > 0) {
+i += numConsumed - 1;
} else {
System.err.printl