Author: lewismc
Date: Wed Apr 17 23:20:36 2013
New Revision: 1469099

URL: http://svn.apache.org/r1469099
Log:
NUTCH-1501 Harmonize behavior of parsechecker and indexchecker

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/log4j.properties
    
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Apr 17 23:20:36 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + 
lewismc)
+
 * NUTCH-1551 Improve WebTableReader field order and display batchId (lewismc)
 
 * NUTCH-1552 possibility of a NPE in index-more plugin (kaveh minooie via 
lewismc)

Modified: nutch/branches/2.x/conf/log4j.properties
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/log4j.properties?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
--- nutch/branches/2.x/conf/log4j.properties (original)
+++ nutch/branches/2.x/conf/log4j.properties Wed Apr 17 23:20:36 2013
@@ -38,6 +38,8 @@ log4j.logger.org.apache.nutch.indexer.so
 log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
 log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout
+log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout
 
 log4j.logger.org.apache.nutch=INFO
 log4j.logger.org.apache.hadoop=WARN

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 (original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
 Wed Apr 17 23:20:36 2013
@@ -37,6 +37,7 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.ProtocolStatusUtils;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -67,7 +68,7 @@ public class IndexingFiltersChecker exte
       return -1;
     }
 
-    url = args[0];
+    url = URLUtil.toASCII(args[0]);
 
     if (LOG.isInfoEnabled()) {
       LOG.info("fetching: " + url);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Wed 
Apr 17 23:20:36 2013
@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
@@ -36,10 +37,32 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.URLUtil;
 
 /**
  * Parser checker, useful for testing parser.
- * 
+ * It also accurately reports possible fetching and 
+ * parsing failures and presents protocol status signals to aid 
+ * debugging. The tool enables us to retrieve the following data from 
+ * any url:
+ * <ol>
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} 
type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) 
and is used to remove
+ * duplicates during the dedup procedure. 
+ * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or
+ * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
+ * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Title</tt>: of the URL</li>
+ * <li><tt>Outlinks</tt>: associated with the URL</li>
+ * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, 
<i>Cache-Control</i>, etc.</li>
+ * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
+ * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length 
depending on 
+ * <code>content.length</code> configuration.</li>
+ * </ol>
  * @author John Xing
  */
 
@@ -60,7 +83,7 @@ public class ParserChecker implements To
     String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";
 
     if (args.length == 0) {
-      System.err.println(usage);
+      LOG.error(usage);
       return (-1);
     }
 
@@ -71,10 +94,10 @@ public class ParserChecker implements To
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
       } else if (i != args.length - 1) {
-        System.err.println(usage);
+        LOG.error(usage);
         System.exit(-1);
       } else {
-        url = args[i];
+        url = URLUtil.toASCII(args[i]);
       }
     }
 
@@ -110,15 +133,10 @@ public class ParserChecker implements To
     }
 
     if (contentType == null) {
-      System.err.println("");
+      LOG.error("Failed to determine content type!");
       return (-1);
     }
 
-    if (LOG.isInfoEnabled()) {
-      LOG.info("parsing: " + url);
-      LOG.info("contentType: " + contentType);
-    }
-
     page.setContentType(new Utf8(contentType));
 
     if (ParserJob.isTruncated(url, page)) {
@@ -128,13 +146,23 @@ public class ParserChecker implements To
     Parse parse = new ParseUtil(conf).parse(url, page);
 
     if (parse == null) {
-      System.err.println("Problem with parse - check log");
+      LOG.error("Problem with parse - check log");
       return (-1);
     }
+    
+    // Calculate the signature
+    byte[] signature = 
SignatureFactory.getSignature(getConf()).calculate(page);
+    
+    if (LOG.isInfoEnabled()) {
+      LOG.info("parsing: " + url);
+      LOG.info("contentType: " + contentType);
+      LOG.info("signature: " + StringUtil.toHexString(signature));
+    }
+
 
-    System.out.print("---------\nUrl\n---------------\n");
+    LOG.info("---------\nUrl\n---------------\n");
     System.out.print(url + "\n");
-    System.out.print("---------\nMetadata\n---------\n");
+    LOG.info("---------\nMetadata\n---------\n");
     Map<Utf8, ByteBuffer> metadata = page.getMetadata();
     StringBuffer sb = new StringBuffer();
     if (metadata != null) {
@@ -148,7 +176,7 @@ public class ParserChecker implements To
       System.out.print(sb.toString());
     }
     if (dumpText) {
-      System.out.print("---------\nParseText\n---------\n");
+      LOG.info("---------\nParseText\n---------\n");
       System.out.print(parse.getText());
     }
 
@@ -170,4 +198,5 @@ public class ParserChecker implements To
         args);
     System.exit(res);
   }
+
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Wed Apr 17 
23:20:36 2013
@@ -18,7 +18,7 @@
 package org.apache.nutch.util;
 
 import java.net.MalformedURLException;
-import java.net.URL;
+import java.net.*;
 import java.util.regex.Pattern;
 
 import org.apache.nutch.util.domain.DomainSuffix;
@@ -333,6 +333,43 @@ public class URLUtil {
     }
   }
   
+  public static String toASCII(String url) {
+    try {
+      URL u = new URL(url);
+      URI p = new URI(u.getProtocol(),
+        null,
+        IDN.toASCII(u.getHost()),
+        u.getPort(),
+        u.getPath(),
+        u.getQuery(),
+        u.getRef());
+
+      return p.toString();
+    }
+    catch (Exception e) {
+      return null;
+    }
+  }
+
+  public static String toUNICODE(String url) {
+    try {
+      URL u = new URL(url);
+      URI p = new URI(u.getProtocol(),
+        null,
+        IDN.toUnicode(u.getHost()),
+        u.getPort(),
+        u.getPath(),
+        u.getQuery(),
+        u.getRef());
+
+      return p.toString();
+    }
+    catch (Exception e) {
+      return null;
+    }
+  }
+
+
   /** For testing */
   public static void main(String[] args){
     


Reply via email to