[ https://issues.apache.org/jira/browse/NUTCH-2012?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16433812#comment-16433812 ]

ASF GitHub Bot commented on NUTCH-2012:
---------------------------------------

sebastian-nagel closed pull request #310: NUTCH-2012 Merge parsechecker and indexchecker
URL: https://github.com/apache/nutch/pull/310

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index e83bf3e0a..284d4adc7 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -36,13 +36,11 @@
 import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.AbstractChecker;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.AbstractChecker;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -50,12 +48,12 @@
  * Reads and parses a URL and run the indexers on it. Displays the fields
  * obtained and the first 100 characters of their value
  * 
- * Tested with e.g. ./nutch org.apache.nutch.indexer.IndexingFiltersChecker
- * http://www.lemonde.fr
+ * Tested with e.g.
  * 
- * @author Julien Nioche
+ * <pre>
+    echo "http://www.lemonde.fr" | $NUTCH_HOME/bin/nutch indexchecker -stdin
+ * </pre>
  **/
-
 public class IndexingFiltersChecker extends AbstractChecker {
 
   protected URLNormalizers normalizers = null;
@@ -69,6 +67,7 @@
 
   public int run(String[] args) throws Exception {
     String url = null;
+
     usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] (-stdin | -listen <port> [-keepClientCnxOpen])";
 
     // Print help when no args given
@@ -113,8 +112,7 @@ public int run(String[] args) throws Exception {
       return super.run();
     }
   }
-    
-  
+
   protected int process(String url, StringBuilder output) throws Exception {
     if (normalizers != null) {
       url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
@@ -133,33 +131,48 @@ protected int process(String url, StringBuilder output) throws Exception {
       datum.getMetaData().put(new Text(key), new Text(value));
     }
 
-    IndexingFilters indexers = new IndexingFilters(getConf());
-    
-    int maxRedirects = 3;
+    int maxRedirects = getConf().getInt("http.redirect.max", 3);
+    if (followRedirects) {
+      if (maxRedirects == 0) {
+        LOG.info("Following max. 3 redirects (ignored http.redirect.max == 0)");
+        maxRedirects = 3;
+      } else {
+        LOG.info("Following max. {} redirects", maxRedirects);
+      }
+    }
 
     ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
     Text turl = new Text(url);
     
     // Following redirects and not reached maxRedirects?
-    while (!protocolOutput.getStatus().isSuccess() && followRedirects && protocolOutput.getStatus().isRedirect() && maxRedirects != 0) {
+    int numRedirects = 0;
+    while (!protocolOutput.getStatus().isSuccess() && followRedirects
+        && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
       String[] stuff = protocolOutput.getStatus().getArgs();
       url = stuff[0];
-      
+      LOG.info("Follow redirect to {}", url);
+
       if (normalizers != null) {
         url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
       }
-    
+
       turl.set(url);
-      
+
       // try again
       protocolOutput = getProtocolOutput(url, datum);
-      maxRedirects--;
+      numRedirects++;
     }
 
     if (!protocolOutput.getStatus().isSuccess()) {
-      output.append("Fetch failed with protocol status: "
-          + protocolOutput.getStatus() + "\n");
-      return 0;
+      System.err.println("Fetch failed with protocol status: "
+          + protocolOutput.getStatus());
+
+      if (protocolOutput.getStatus().isRedirect()) {
+          System.err.println("Redirect(s) not handled due to configuration.");
+          System.err.println("Max Redirects to handle per config: " + maxRedirects);
+          System.err.println("Number of Redirects handled: " + numRedirects);
+      }
+      return -1;
     }
 
     Content content = protocolOutput.getContent();
@@ -172,6 +185,7 @@ protected int process(String url, StringBuilder output) throws Exception {
     String contentType = content.getContentType();
 
     if (contentType == null) {
+      LOG.error("Failed to determine content type!");
       return -1;
     }
 
@@ -228,6 +242,8 @@ protected int process(String url, StringBuilder output) throws Exception {
       LOG.warn("Couldn't pass score, url {} ({})", turl, e);
     }
 
+    IndexingFilters indexers = new IndexingFilters(getConf());
+
     try {
       doc = indexers.filter(doc, parse, urlText, datum, inlinks);
     } catch (IndexingException e) {
@@ -262,17 +278,10 @@ protected int process(String url, StringBuilder output) throws Exception {
     return 0;
   }
   
-  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
-    ProtocolFactory factory = new ProtocolFactory(getConf());
-    Protocol protocol = factory.getProtocol(url);
-    Text turl = new Text(url);
-    ProtocolOutput protocolOutput = protocol.getProtocolOutput(turl, datum);
-    return protocolOutput;
-  }
-
   public static void main(String[] args) throws Exception {
     final int res = ToolRunner.run(NutchConfiguration.create(),
         new IndexingFiltersChecker(), args);
     System.exit(res);
   }
+
 }
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 9dee311e2..b0f71d47e 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -22,24 +22,21 @@
 import java.util.Iterator;
 import java.util.Map;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.AbstractChecker;
 import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.URLUtil;
 import org.apache.nutch.util.StringUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Parser checker, useful for testing parser. It also accurately reports
@@ -66,38 +63,39 @@
  * on <code>content.length</code> configuration.</li>
  * </ol>
  * 
- * @author John Xing
  */
 
-public class ParserChecker implements Tool {
+public class ParserChecker extends AbstractChecker {
+
+  protected URLNormalizers normalizers = null;
+  protected boolean dumpText = false;
+  protected boolean followRedirects = false;
+  // used to simulate the metadata propagated from injection
+  protected HashMap<String, String> metadata = new HashMap<>();
+  protected String forceAsContentType = null;
 
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
-  private Configuration conf;
-
-  public ParserChecker() {
-  }
 
   public int run(String[] args) throws Exception {
-    boolean dumpText = false;
-    boolean force = false;
-    String contentType = null;
     String url = null;
 
-    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
+    String usage = "Usage: ParserChecker [-normalize] [-followRedirects] [-dumpText] [-forceAs mimeType] [-md key=value] (-stdin | -listen <port> [-keepClientCnxOpen])";
 
-    if (args.length == 0) {
-      LOG.error(usage);
-      return (-1);
+    // Print help when no args given
+    if (args.length < 1) {
+      System.err.println(usage);
+      System.exit(-1);
     }
 
-    // used to simulate the metadata propagated from injection
-    HashMap<String, String> metadata = new HashMap<>();
-
+    int numConsumed;
     for (int i = 0; i < args.length; i++) {
-      if (args[i].equals("-forceAs")) {
-        force = true;
-        contentType = args[++i];
+      if (args[i].equals("-normalize")) {
+        normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+      } else if (args[i].equals("-followRedirects")) {
+        followRedirects = true;
+      } else if (args[i].equals("-forceAs")) {
+        forceAsContentType = args[++i];
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
       } else if (args[i].equals("-md")) {
@@ -110,19 +108,33 @@ public int run(String[] args) throws Exception {
         } else
           k = nextOne;
         metadata.put(k, v);
+      } else if ((numConsumed = super.parseArgs(args, i)) > 0) {
+        i += numConsumed - 1;
       } else if (i != args.length - 1) {
-        LOG.error(usage);
+        System.err.println("ERR: Not a recognized argument: " + args[i]);
+        System.err.println(usage);
         System.exit(-1);
       } else {
-        url = URLUtil.toASCII(args[i]);
+        url = args[i];
       }
     }
+    
+    if (url != null) {
+      return super.processSingle(url);
+    } else {
+      // Start listening
+      return super.run();
+    }
+  }
 
-    if (LOG.isInfoEnabled()) {
-      LOG.info("fetching: " + url);
+  protected int process(String url, StringBuilder output) throws Exception {
+    if (normalizers != null) {
+      url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
     }
 
-    CrawlDatum cd = new CrawlDatum();
+    LOG.info("fetching: " + url);
+
+    CrawlDatum datum = new CrawlDatum();
 
     Iterator<String> iter = metadata.keySet().iterator();
     while (iter.hasNext()) {
@@ -130,67 +142,85 @@ public int run(String[] args) throws Exception {
       String value = metadata.get(key);
       if (value == null)
         value = "";
-      cd.getMetaData().put(new Text(key), new Text(value));
+      datum.getMetaData().put(new Text(key), new Text(value));
     }
 
-    ProtocolFactory factory = new ProtocolFactory(conf);
-    Protocol protocol = factory.getProtocol(url);
-    Text turl = new Text(url);
-    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
+    int maxRedirects = getConf().getInt("http.redirect.max", 3);
+    if (followRedirects) {
+      if (maxRedirects == 0) {
+        LOG.info("Following max. 3 redirects (ignored http.redirect.max == 0)");
+        maxRedirects = 3;
+      } else {
+        LOG.info("Following max. {} redirects", maxRedirects);
+      }
+    }
 
-    // if the configuration permits, handle redirects until we either run
-    // out of allowed redirects or we stop getting redirect statuses.
-    int maxRedirects = conf.getInt("http.redirect.max", 0);
+    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+    Text turl = new Text(url);
+    
+    // Following redirects and not reached maxRedirects?
     int numRedirects = 0;
-    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
-        String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
-        LOG.info("Handling redirect to " + newURL);
+    while (!protocolOutput.getStatus().isSuccess() && followRedirects
+        && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+      String[] stuff = protocolOutput.getStatus().getArgs();
+      url = stuff[0];
+      LOG.info("Follow redirect to {}", url);
+
+      if (normalizers != null) {
+        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+      }
 
-        protocol = factory.getProtocol(newURL);
-        turl = new Text(newURL);
-        output = protocol.getProtocolOutput(turl, cd);
+      turl.set(url);
 
-        numRedirects++;
+      // try again
+      protocolOutput = getProtocolOutput(url, datum);
+      numRedirects++;
     }
 
-    if (!output.getStatus().isSuccess()) {
+    if (!protocolOutput.getStatus().isSuccess()) {
       System.err.println("Fetch failed with protocol status: "
-          + output.getStatus());
+          + protocolOutput.getStatus());
 
-      if (output.getStatus().isRedirect()) {
+      if (protocolOutput.getStatus().isRedirect()) {
           System.err.println("Redirect(s) not handled due to configuration.");
           System.err.println("Max Redirects to handle per config: " + maxRedirects);
           System.err.println("Number of Redirects handled: " + numRedirects);
       }
-      return (-1);
+      return -1;
     }
 
-    Content content = output.getContent();
+    Content content = protocolOutput.getContent();
 
     if (content == null) {
-      LOG.error("No content for " + url);
-      return (-1);
+      output.append("No content for " + url + "\n");
+      return 0;
     }
 
-    if (force) {
-      content.setContentType(contentType);
+    String contentType;
+    if (forceAsContentType != null) {
+      content.setContentType(forceAsContentType);
+      contentType = forceAsContentType;
     } else {
       contentType = content.getContentType();
     }
 
     if (contentType == null) {
       LOG.error("Failed to determine content type!");
-      return (-1);
+      return -1;
     }
 
+    // store the guessed content type in the crawldatum
+    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
+        new Text(contentType));
+
     if (ParseSegment.isTruncated(content)) {
       LOG.warn("Content is truncated, parse may fail!");
     }
 
-    ScoringFilters scfilters = new ScoringFilters(conf);
+    ScoringFilters scfilters = new ScoringFilters(getConf());
     // call the scoring filters
     try {
-      scfilters.passScoreBeforeParsing(turl, cd, content);
+      scfilters.passScoreBeforeParsing(turl, datum, content);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
         LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e
@@ -199,7 +229,7 @@ public int run(String[] args) throws Exception {
       }
     }
 
-    ParseResult parseResult = new ParseUtil(conf).parse(content);
+    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
 
     if (parseResult == null) {
       LOG.error("Parsing content failed!");
@@ -254,16 +284,6 @@ public int run(String[] args) throws Exception {
     return 0;
   }
 
-  @Override
-  public Configuration getConf() {
-    return conf;
-  }
-
-  @Override
-  public void setConf(Configuration c) {
-    conf = c;
-  }
-
   public static void main(String[] args) throws Exception {
     int res = ToolRunner.run(NutchConfiguration.create(), new ParserChecker(),
         args);
diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java
index 84877d7e6..8d365ecaf 100644
--- a/src/java/org/apache/nutch/util/AbstractChecker.java
+++ b/src/java/org/apache/nutch/util/AbstractChecker.java
@@ -28,8 +28,12 @@
 import java.nio.charset.StandardCharsets;
 
 import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.util.Tool;
-
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -182,4 +186,12 @@ protected boolean readWrite(BufferedReader in, OutputStream out) throws Exceptio
       return true;
     }
   }
+
+  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+    ProtocolFactory factory = new ProtocolFactory(getConf());
+    Protocol protocol = factory.getProtocol(url);
+    Text turl = new Text(url);
+    return protocol.getProtocolOutput(turl, datum);
+  }
+
 }
\ No newline at end of file
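For readers skimming the diff: the effect of the change is that the protocol fetch and the -stdin/-listen plumbing now live in AbstractChecker, so a checker subclass only has to parse its own flags and implement process(String url, StringBuilder output). Below is a minimal sketch of a hypothetical subclass built on the API visible above; the class name, usage string, and output format are illustrative only, not part of this PR:

    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.crawl.CrawlDatum;
    import org.apache.nutch.protocol.ProtocolOutput;
    import org.apache.nutch.util.AbstractChecker;
    import org.apache.nutch.util.NutchConfiguration;

    /** Hypothetical checker that only reports the protocol status of a URL. */
    public class StatusChecker extends AbstractChecker {

      public int run(String[] args) throws Exception {
        usage = "Usage: StatusChecker (-stdin | -listen <port> [-keepClientCnxOpen]) | <url>";
        if (args.length < 1) {
          System.err.println(usage);
          return -1;
        }
        String url = null;
        int numConsumed;
        for (int i = 0; i < args.length; i++) {
          // let AbstractChecker consume -stdin, -listen, -keepClientCnxOpen
          if ((numConsumed = super.parseArgs(args, i)) > 0) {
            i += numConsumed - 1;
          } else if (i == args.length - 1) {
            url = args[i];
          }
        }
        if (url != null) {
          return super.processSingle(url);
        }
        return super.run(); // read URLs from stdin or listen on a socket
      }

      protected int process(String url, StringBuilder output) throws Exception {
        // getProtocolOutput() is the helper this PR moves into AbstractChecker
        ProtocolOutput protocolOutput = getProtocolOutput(url, new CrawlDatum());
        output.append(url + "\t" + protocolOutput.getStatus() + "\n");
        return 0;
      }

      public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(),
            new StatusChecker(), args);
        System.exit(res);
      }
    }

IndexingFiltersChecker and ParserChecker above follow exactly this pattern, differing only in what process() does with the fetched content.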


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Merge parsechecker and indexchecker
> -----------------------------------
>
>                 Key: NUTCH-2012
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2012
>             Project: Nutch
>          Issue Type: Improvement
>    Affects Versions: 1.10
>            Reporter: Sebastian Nagel
>            Priority: Minor
>
> ParserChecker and IndexingFiltersChecker have evolved from simple tools to 
> check parsers and parse filters (resp. indexing filters) into powerful 
> tools that emulate the crawling of a single URL/document:
> - check robots.txt (NUTCH-2002)
> - follow redirects (NUTCH-2004)
> Keeping both tools in sync takes extra work (cf. NUTCH-1757/NUTCH-2006; 
> note that NUTCH-2002 and NUTCH-2004 were done only for parsechecker). 
> It's time to either
> * merge them into one general debugging tool, keeping parsechecker and 
> indexchecker as aliases, or
> * centralize common code in one utility class.
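To make the synchronization cost concrete: even after this PR, both checkers carry an identical redirect-handling block in their process() methods; only the getProtocolOutput() helper moved into AbstractChecker. The shared shape, condensed from the diff above (no new code, variable names as in the PR):

    int maxRedirects = getConf().getInt("http.redirect.max", 3);
    if (followRedirects) {
      if (maxRedirects == 0) {
        // a checker should follow redirects even if the crawl config does not
        maxRedirects = 3;
      }
    }

    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
    int numRedirects = 0;
    while (!protocolOutput.getStatus().isSuccess() && followRedirects
        && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
      url = protocolOutput.getStatus().getArgs()[0]; // redirect target
      if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
      }
      protocolOutput = getProtocolOutput(url, datum); // try again
      numRedirects++;
    }

Hoisting this loop into AbstractChecker as well would be a natural follow-up if the "one utility class" option is taken further.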



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
