Author: pkosiorowski
Date: Mon Aug 8 12:59:56 2005
New Revision: 230870

URL: http://svn.apache.org/viewcvs?rev=230870&view=rev
Log:
NUTCH-7. Relative links from identical (MD5) pages were treated incorrectly.
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java?rev=230870&r1=230869&r2=230870&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java Mon Aug 8 12:59:56 2005 @@ -69,6 +69,9 @@ final private static float DECAY_VALUE = 0.85f; public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.tools.DistributedAnalysisTool"); + + public final static long OUTLINK_LIMIT = 10000; + /** * The EditSet inner class represents all of the sorted edits @@ -343,8 +346,10 @@ try { // Iterate through all items in the webdb, sorted by URL long curIndex = 0; + long linkCount = 0; ScoreValue score = new ScoreValue(); IWebDBReader reader = new WebDBReader(nfs, dbDir); + MD5Hash lastHash = null; try { for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); curIndex++) { // @@ -366,7 +371,25 @@ // OK, do some analysis! // Page curPage = (Page) e.nextElement(); + + // Process only one page from set of pages having the same + // MD5. Otherwise all links from these pages would be processed + // multiple times. 
+ MD5Hash newHash = curPage.getMD5(); + if (newHash.equals(lastHash)) { + continue; + } + lastHash = newHash; + Link outLinks[] = reader.getLinks(curPage.getMD5()); + linkCount += outLinks.length; + + if (outLinks.length > OUTLINK_LIMIT) { + LOG.info("Suspicious outlink count = " + + outLinks.length + " for [" + + curPage.getURL().toString() + "]."); + } + int targetOutlinkers = 0; for (int i = 0; i < outLinks.length; i++) { if (outLinks[i].targetHasOutlink()) { @@ -402,7 +425,9 @@ } if (((curIndex - startIndex) % 5000) == 0) { - LOG.info("Pages consumed: " + (curIndex - startIndex) + " (at index " + curIndex + ")"); + LOG.info("Pages consumed: " + (curIndex - startIndex) + + " (at index " + curIndex + + "). Links fetched: " + linkCount + "."); } } } finally {