Author: ab Date: Sat Sep 23 12:36:47 2006 New Revision: 449293 URL: http://svn.apache.org/viewvc?view=rev&rev=449293 Log: NUTCH-350: URLs were incorrectly marked as STATUS_FETCH_GONE when blocked by http.max.delays. The status is now set to STATUS_FETCH_RETRY instead. Since this is an intermittent problem related to the Fetcher implementation, the retry counter is not increased in this case.
Added: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=449293&r1=449292&r2=449293 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Sep 23 12:36:47 2006 @@ -32,6 +32,18 @@ 11. NUTCH-332 - Fix the problem of doubling scores caused by links pointing to the current page (e.g. anchors). (Stefan Groschupf via ab) +12. NUTCH-365 - Flexible URL normalization (ab) + +13. NUTCH-336 - Differentiate between newly discovered pages and newly + injected pages (Chris Schneider via ab) NOTE: this changes the + scoring API, filter implementations need to be updated. + +14. NUTCH-337 - Fetcher ignores the fetcher.parse value (Stefan Groschupf + via ab) + +15. 
NUTCH-350 - Urls blocked by http.max.delays incorrectly marked as GONE + (Stefan Groschupf via ab) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=449293&r1=449292&r2=449293 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Sep 23 12:36:47 2006 @@ -189,18 +189,24 @@ } break; + // failures - increase the retry counter case ProtocolStatus.EXCEPTION: logError(url, status.getMessage()); + /* FALLTHROUGH */ case ProtocolStatus.RETRY: // retry datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1); + /* FALLTHROUGH */ + // intermittent blocking - retry without increasing the counter + case ProtocolStatus.WOULDBLOCK: + case ProtocolStatus.BLOCKED: output(url, datum, null, CrawlDatum.STATUS_FETCH_RETRY); break; + // permanent failures case ProtocolStatus.GONE: // gone case ProtocolStatus.NOTFOUND: case ProtocolStatus.ACCESS_DENIED: case ProtocolStatus.ROBOTS_DENIED: - case ProtocolStatus.WOULDBLOCK: case ProtocolStatus.NOTMODIFIED: output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE); break; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?view=diff&rev=449293&r1=449292&r2=449293 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Sat Sep 23 12:36:47 2006 @@ -64,7 +64,9 @@ * The expected number of milliseconds to wait before retry may be provided * in args. 
*/ public static final int WOULDBLOCK = 22; - + /** Thread was blocked http.max.delays times during fetching. */ + public static final int BLOCKED = 23; + // Useful static instances for status codes that don't usually require any // additional arguments. public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(SUCCESS); @@ -77,6 +79,7 @@ public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING); public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED); public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus(WOULDBLOCK); + public static final ProtocolStatus STATUS_BLOCKED = new ProtocolStatus(BLOCKED); private int code; private long lastModified; @@ -99,6 +102,7 @@ codeToName.put(new Integer(NOTFETCHING), "notfetching"); codeToName.put(new Integer(NOTMODIFIED), "notmodified"); codeToName.put(new Integer(WOULDBLOCK), "wouldblock"); + codeToName.put(new Integer(BLOCKED), "blocked"); } public ProtocolStatus() { Added: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java?view=auto&rev=449293 ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (added) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java Sat Sep 23 12:36:47 2006 @@ -0,0 +1,25 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.http.api; + +public class BlockedException extends HttpException { + + public BlockedException(String msg) { + super(msg); + } + +} Propchange: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?view=diff&rev=449293&r1=449292&r2=449293 ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Sat Sep 23 12:36:47 2006 @@ -197,7 +197,12 @@ null, null, this.conf); return new ProtocolOutput(c, ProtocolStatus.STATUS_WOULDBLOCK); } - String host = blockAddr(u, delay); + String host; + try { + host = blockAddr(u, delay); + } catch (BlockedException be) { + return new ProtocolOutput(null, ProtocolStatus.STATUS_BLOCKED); + } Response response; try { response = getResponse(u, datum, false); // make a request @@ -354,7 +359,7 @@ } if (delays == maxDelays) - throw new HttpException("Exceeded http.max.delays: retry later."); + throw new BlockedException("Exceeded http.max.delays: retry later."); long 
done = time.longValue(); long now = System.currentTimeMillis();