Author: ab Date: Sat Sep 23 10:11:58 2006 New Revision: 449274 URL: http://svn.apache.org/viewvc?view=rev&rev=449274 Log: NUTCH-336: differentiate between newly discovered pages (known value through inlink contributions) and newly injected pages (arbitrarily defined initial value).
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?view=diff&rev=449274&r1=449273&r2=449274 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Sat Sep 23 10:11:58 2006 @@ -80,10 +80,10 @@ CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, interval); datum.setScore(scoreInjected); try { - scfilters.initialScore(value, datum); + scfilters.injectedScore(value, datum); } catch (ScoringFilterException e) { if (LOG.isWarnEnabled()) { - LOG.warn("Cannot filter init score for url " + url + + LOG.warn("Cannot filter injected score for url " + url + ", using default (" + e.getMessage() + ")"); } datum.setScore(scoreInjected); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java?view=diff&rev=449274&r1=449273&r2=449274 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java Sat Sep 23 10:11:58 2006 @@ -41,7 +41,21 @@ public final static String X_POINT_ID = ScoringFilter.class.getName(); /** - * Set an initial score for newly injected pages. + * Set an initial score for newly injected pages. 
Note: newly injected pages + * may have no inlinks, so filter implementations may wish to set this + * score to a non-zero value, to give newly injected pages some initial + * credit. + * @param url url of the page + * @param datum new datum. Filters will modify it in-place. + * @throws ScoringFilterException + */ + public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException; + + /** + * Set an initial score for newly discovered pages. Note: newly discovered pages + * have at least one inlink with its score contribution, so filter implementations + * may choose to set initial score to zero (unknown value), and then the inlink + * score contribution will set the "real" value of the new page. * @param url url of the page * @param datum new datum. Filters will modify it in-place. * @throws ScoringFilterException Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java?view=diff&rev=449274&r1=449273&r2=449274 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java Sat Sep 23 10:11:58 2006 @@ -92,10 +92,17 @@ return initSort; } - /** Calculate a new initial score, used when adding new pages. */ + /** Calculate a new initial score, used when adding newly discovered pages. */ public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { this.filters[i].initialScore(url, datum); + } + } + + /** Calculate a new initial score, used when injecting new pages. 
*/ + public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { + for (int i = 0; i < this.filters.length; i++) { + this.filters[i].injectedScore(url, datum); } } Modified: lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?view=diff&rev=449274&r1=449273&r2=449274 ============================================================================== --- lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original) +++ lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Sat Sep 23 10:11:58 2006 @@ -73,8 +73,14 @@ } /** Set to the value defined in config, 1.0f by default. */ - public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { + public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { datum.setScore(scoreInjected); + } + + /** Set to 0.0f (unknown value) - inlink contributions will bring it to + * a correct level. Newly discovered pages have at least one inlink. */ + public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { + datum.setScore(0.0f); } /** Use {@link CrawlDatum#getScore()}. */