Author: pkosiorowski
Date: Mon Aug  8 12:59:56 2005
New Revision: 230870

URL: http://svn.apache.org/viewcvs?rev=230870&view=rev
Log:
NUTCH-7. Relative links from identical (MD5) pages were treated incorrectly.

Modified:
    
lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java?rev=230870&r1=230869&r2=230870&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java 
Mon Aug  8 12:59:56 2005
@@ -69,6 +69,9 @@
     final private static float DECAY_VALUE = 0.85f;
 
     public static final Logger LOG = 
LogFormatter.getLogger("org.apache.nutch.tools.DistributedAnalysisTool");
+    
+    public final static long OUTLINK_LIMIT = 10000;
+    
 
     /**
      * The EditSet inner class represents all of the sorted edits
@@ -343,8 +346,10 @@
         try {
             // Iterate through all items in the webdb, sorted by URL
             long curIndex = 0;
+            long linkCount = 0;
             ScoreValue score = new ScoreValue();
             IWebDBReader reader = new WebDBReader(nfs, dbDir);
+            MD5Hash lastHash = null;
             try {
                 for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); 
curIndex++) {
                     //
@@ -366,7 +371,25 @@
                     // OK, do some analysis!
                     //
                     Page curPage = (Page) e.nextElement();
+                    
+                    // Process only one page from each set of pages having the
+                    // same MD5. Otherwise all links from these pages would be
+                    // processed multiple times.
+                    MD5Hash newHash = curPage.getMD5();
+                    if (newHash.equals(lastHash)) {
+                        continue;
+                    }
+                    lastHash = newHash;
+                    
                     Link outLinks[] = reader.getLinks(curPage.getMD5());
+                    linkCount += outLinks.length;
+                    
+                    if (outLinks.length > OUTLINK_LIMIT) {
+                        LOG.info("Suspicious outlink count = "
+                                + outLinks.length + " for ["
+                                + curPage.getURL().toString() + "].");
+                    }
+                    
                     int targetOutlinkers = 0;
                     for (int i = 0; i < outLinks.length; i++) {
                         if (outLinks[i].targetHasOutlink()) {
@@ -402,7 +425,9 @@
                     }
 
                     if (((curIndex - startIndex) % 5000) == 0) {
-                        LOG.info("Pages consumed: " + (curIndex - startIndex) 
+ " (at index " + curIndex + ")");
+                        LOG.info("Pages consumed: " + (curIndex - startIndex)
+                                + " (at index " + curIndex
+                                + "). Links fetched: " + linkCount + ".");
                     }
                 }
             } finally {


Reply via email to