Author: ab
Date: Sat Sep 23 12:36:47 2006
New Revision: 449293

URL: http://svn.apache.org/viewvc?view=rev&rev=449293
Log:
NUTCH-350: URLs blocked by http.max.delays were incorrectly marked as
STATUS_FETCH_GONE. They are now marked as STATUS_FETCH_RETRY instead. Since
this is an intermittent problem related to the Fetcher implementation, we
don't increase the retry counter.

Added:
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java   (with props)
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=449293&r1=449292&r2=449293
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Sep 23 12:36:47 2006
@@ -32,6 +32,18 @@
 11. NUTCH-332 - Fix the problem of doubling scores caused by links pointing
     to the current page (e.g. anchors). (Stefan Groschupf via ab)
 
+12. NUTCH-365 - Flexible URL normalization (ab)
+
+13. NUTCH-336 - Differentiate between newly discovered pages and newly
+    injected pages (Chris Schneider via ab) NOTE: this changes the
+    scoring API, filter implementations need to be updated.
+
+14. NUTCH-337 - Fetcher ignores the fetcher.parse value (Stefan Groschupf
+    via ab)
+
+15. NUTCH-350 - Urls blocked by http.max.delays incorrectly marked as GONE
+    (Stefan Groschupf via ab)
+
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=449293&r1=449292&r2=449293
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Sep 23 12:36:47 2006
@@ -189,18 +189,24 @@
                 }
                 break;
 
+              // failures - increase the retry counter
               case ProtocolStatus.EXCEPTION:
                 logError(url, status.getMessage());
+              /* FALLTHROUGH */
               case ProtocolStatus.RETRY:          // retry
                 datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
+              /* FALLTHROUGH */
+              // intermittent blocking - retry without increasing the counter
+              case ProtocolStatus.WOULDBLOCK:
+              case ProtocolStatus.BLOCKED:
                 output(url, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
                 break;
                 
+              // permanent failures
               case ProtocolStatus.GONE:           // gone
               case ProtocolStatus.NOTFOUND:
               case ProtocolStatus.ACCESS_DENIED:
               case ProtocolStatus.ROBOTS_DENIED:
-              case ProtocolStatus.WOULDBLOCK:
               case ProtocolStatus.NOTMODIFIED:
                 output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
                 break;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?view=diff&rev=449293&r1=449292&r2=449293
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Sat Sep 23 12:36:47 2006
@@ -64,7 +64,9 @@
    * The expected number of milliseconds to wait before retry may be provided
    * in args. */
   public static final int WOULDBLOCK           = 22;
-  
+  /** Thread was blocked http.max.delays times during fetching. */
+  public static final int BLOCKED              = 23;
+   
   // Useful static instances for status codes that don't usually require any
   // additional arguments.
   public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(SUCCESS);
@@ -77,6 +79,7 @@
   public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING);
   public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED);
   public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus(WOULDBLOCK);
+  public static final ProtocolStatus STATUS_BLOCKED = new ProtocolStatus(BLOCKED);
   
   private int code;
   private long lastModified;
@@ -99,6 +102,7 @@
     codeToName.put(new Integer(NOTFETCHING), "notfetching");
     codeToName.put(new Integer(NOTMODIFIED), "notmodified");
     codeToName.put(new Integer(WOULDBLOCK), "wouldblock");
+    codeToName.put(new Integer(BLOCKED), "blocked");
   }
   
   public ProtocolStatus() {

Added: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java?view=auto&rev=449293
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java Sat Sep 23 12:36:47 2006
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+public class BlockedException extends HttpException {
+  
+  public BlockedException(String msg) {
+    super(msg);
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?view=diff&rev=449293&r1=449292&r2=449293
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Sat Sep 23 12:36:47 2006
@@ -197,7 +197,12 @@
                 null, null, this.conf);
         return new ProtocolOutput(c, ProtocolStatus.STATUS_WOULDBLOCK);
       }
-      String host = blockAddr(u, delay);
+      String host;
+      try {
+        host = blockAddr(u, delay);
+      } catch (BlockedException be) {
+        return new ProtocolOutput(null, ProtocolStatus.STATUS_BLOCKED);
+      }
       Response response;
       try {
         response = getResponse(u, datum, false); // make a request
@@ -354,7 +359,7 @@
       }
       
       if (delays == maxDelays)
-        throw new HttpException("Exceeded http.max.delays: retry later.");
+        throw new BlockedException("Exceeded http.max.delays: retry later.");
       
       long done = time.longValue();
       long now = System.currentTimeMillis();


Reply via email to