Author: lewismc
Date: Wed Apr 17 23:22:59 2013
New Revision: 1469100

URL: http://svn.apache.org/r1469100
Log:
NUTCH-1501 Harmonize behavior of parsechecker and indexchecker

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1469100&r1=1469099&r2=1469100&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Apr 17 23:22:59 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + 
lewismc)
+
 * NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)
 
 * NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)

Modified: 
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1469100&r1=1469099&r2=1469100&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java 
Wed Apr 17 23:22:59 2013
@@ -68,7 +68,7 @@ public class IndexingFiltersChecker exte
 
     if (args.length != 1) {
       System.err.println(usage);
-      System.exit(-1);
+      return -1;
     }
 
     url = URLUtil.toASCII(args[0]);

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1469100&r1=1469099&r2=1469100&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Apr 17 
23:22:59 2013
@@ -35,19 +35,38 @@ import org.apache.nutch.util.StringUtil;
 
 /**
  * Parser checker, useful for testing parser.
- * 
+ * It also accurately reports possible fetching and 
+ * parsing failures and presents protocol status signals to aid 
+ * debugging. The tool enables us to retrieve the following data from 
+ * any url:
+ * <ol>
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} 
type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) 
and is used to remove
+ * duplicates during the dedup procedure. 
+ * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or
+ * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
+ * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Title</tt>: of the URL</li>
+ * <li><tt>Outlinks</tt>: associated with the URL</li>
+ * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, 
<i>Cache-Control</i>, etc.</li>
+ * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
+ * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length 
depending on 
+ * <code>content.length</code> configuration.</li>
+ * </ol>
  * @author John Xing
  */
 
 public class ParserChecker implements Tool {
 
   public static final Logger LOG = 
LoggerFactory.getLogger(ParserChecker.class);
+  private Configuration conf;
 
   public ParserChecker() {
   }
 
-  Configuration conf = null;
-
   public int run(String[] args) throws Exception {
     boolean dumpText = false;
     boolean force = false;
@@ -57,8 +76,8 @@ public class ParserChecker implements To
     String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";
 
     if (args.length == 0) {
-      System.err.println(usage);
-      System.exit(-1);
+      LOG.error(usage);
+      return (-1);
     }
 
     for (int i = 0; i < args.length; i++) {
@@ -68,7 +87,7 @@ public class ParserChecker implements To
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
       } else if (i != args.length - 1) {
-        System.err.println(usage);
+        LOG.error(usage);
         System.exit(-1);
       } else {
         url = URLUtil.toASCII(args[i]);
@@ -102,7 +121,7 @@ public class ParserChecker implements To
     }
 
     if (contentType == null) {
-      System.err.println("");
+      LOG.error("Failed to determine content type!");
       return (-1);
     }
 
@@ -112,9 +131,14 @@ public class ParserChecker implements To
 
     ParseResult parseResult = new ParseUtil(conf).parse(content);
 
+    if (parseResult == null) {
+      LOG.error("Problem with parse - check log");
+      return (-1);
+    }
+
     // Calculate the signature
     byte[] signature = 
SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new 
Text(url)));
-
+    
     if (LOG.isInfoEnabled()) {
       LOG.info("parsing: " + url);
       LOG.info("contentType: " + contentType);
@@ -123,12 +147,12 @@ public class ParserChecker implements To
 
     for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
       Parse parse = entry.getValue();
-      System.out.print("---------\nUrl\n---------------\n");
+      LOG.info("---------\nUrl\n---------------\n");
       System.out.print(entry.getKey());
-      System.out.print("\n---------\nParseData\n---------\n");
+      LOG.info("\n---------\nParseData\n---------\n");
       System.out.print(parse.getData().toString());
       if (dumpText) {
-        System.out.print("---------\nParseText\n---------\n");
+        LOG.info("---------\nParseText\n---------\n");
         System.out.print(parse.getText());
       }
     }


Reply via email to