Author: ab
Date: Sat Sep 23 10:11:58 2006
New Revision: 449274

URL: http://svn.apache.org/viewvc?view=rev&rev=449274
Log:
NUTCH-336: differentiate between newly discovered pages (known value through
inlink contributions) and newly injected pages (arbitrarily defined initial
value).

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
    lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java
    
lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?view=diff&rev=449274&r1=449273&r2=449274
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Sat Sep 23 
10:11:58 2006
@@ -80,10 +80,10 @@
         CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 
interval);
         datum.setScore(scoreInjected);
         try {
-          scfilters.initialScore(value, datum);
+          scfilters.injectedScore(value, datum);
         } catch (ScoringFilterException e) {
           if (LOG.isWarnEnabled()) {
-            LOG.warn("Cannot filter init score for url " + url +
+            LOG.warn("Cannot filter injected score for url " + url +
                      ", using default (" + e.getMessage() + ")");
           }
           datum.setScore(scoreInjected);

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java?view=diff&rev=449274&r1=449273&r2=449274
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java Sat 
Sep 23 10:11:58 2006
@@ -41,7 +41,21 @@
   public final static String X_POINT_ID = ScoringFilter.class.getName();
   
   /**
-   * Set an initial score for newly injected pages.
+   * Set an initial score for newly injected pages. Note: newly injected pages
+   * may have no inlinks, so filter implementations may wish to set this 
+   * score to a non-zero value, to give newly injected pages some initial
+   * credit.
+   * @param url url of the page
+   * @param datum new datum. Filters will modify it in-place.
+   * @throws ScoringFilterException
+   */
+  public void injectedScore(UTF8 url, CrawlDatum datum) throws 
ScoringFilterException;
+  
+  /**
+   * Set an initial score for newly discovered pages. Note: newly discovered 
pages
+   * have at least one inlink with its score contribution, so filter 
implementations
+   * may choose to set initial score to zero (unknown value), and then the 
inlink
+   * score contribution will set the "real" value of the new page.
    * @param url url of the page
    * @param datum new datum. Filters will modify it in-place.
    * @throws ScoringFilterException

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java?view=diff&rev=449274&r1=449273&r2=449274
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java 
Sat Sep 23 10:11:58 2006
@@ -92,10 +92,17 @@
     return initSort;
   }
 
-  /** Calculate a new initial score, used when adding new pages. */
+  /** Calculate a new initial score, used when adding newly discovered pages. 
*/
   public void initialScore(UTF8 url, CrawlDatum datum) throws 
ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       this.filters[i].initialScore(url, datum);
+    }
+  }
+
+  /** Calculate a new initial score, used when injecting new pages. */
+  public void injectedScore(UTF8 url, CrawlDatum datum) throws 
ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      this.filters[i].injectedScore(url, datum);
     }
   }
 

Modified: 
lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?view=diff&rev=449274&r1=449273&r2=449274
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
 Sat Sep 23 10:11:58 2006
@@ -73,8 +73,14 @@
   }
 
   /** Set to the value defined in config, 1.0f by default. */
-  public void initialScore(UTF8 url, CrawlDatum datum) throws 
ScoringFilterException {
+  public void injectedScore(UTF8 url, CrawlDatum datum) throws 
ScoringFilterException {
     datum.setScore(scoreInjected);
+  }
+
+  /** Set to 0.0f (unknown value) - inlink contributions will bring it to
+   * a correct level. Newly discovered pages have at least one inlink. */
+  public void initialScore(UTF8 url, CrawlDatum datum) throws 
ScoringFilterException {
+    datum.setScore(0.0f);
   }
 
   /** Use {@link CrawlDatum#getScore()}. */


Reply via email to