[ https://issues.apache.org/jira/browse/NUTCH-1736?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
ysc updated NUTCH-1736: ----------------------- Comment: was deleted (was: 1. For Nutch 1.x, use the patch below: #P nutch1.7 Index: src/java/org/apache/nutch/metadata/HttpHeaders.java =================================================================== --- src/java/org/apache/nutch/metadata/HttpHeaders.java (revision 1573324) +++ src/java/org/apache/nutch/metadata/HttpHeaders.java (working copy) @@ -26,6 +26,8 @@ */ public interface HttpHeaders { + public final static String TRANSFER_ENCODING = "Transfer-Encoding"; + public final static String CONTENT_ENCODING = "Content-Encoding"; public final static String CONTENT_LANGUAGE = "Content-Language"; Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java =================================================================== --- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (revision 1573324) +++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (working copy) @@ -156,9 +156,13 @@ parseHeaders(in, line); haveSeenNonContinueStatus= code != 100; // 100 is "Continue" } + String transferEncoding = getHeader(Response.TRANSFER_ENCODING); + if(transferEncoding != null && "chunked".equalsIgnoreCase(transferEncoding.trim())){ + readChunkedContent(in, line); + }else{ + readPlainContent(in); + } - readPlainContent(in); - String contentEncoding = getHeader(Response.CONTENT_ENCODING); if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { content = http.processGzipEncoded(content, url); @@ -432,5 +436,4 @@ in.unread(value); return value; } - } 2. For Nutch 2.x, use the patch below: #P nutch-2.2.1 Index: src/java/org/apache/nutch/metadata/HttpHeaders.java =================================================================== --- src/java/org/apache/nutch/metadata/HttpHeaders.java (revision 1523958) +++ src/java/org/apache/nutch/metadata/HttpHeaders.java (working copy) @@ -28,6 +28,7 @@ * @author Jérôme Charron */ 
public interface HttpHeaders { + public final static String TRANSFER_ENCODING = "Transfer-Encoding"; public final static String CONTENT_ENCODING = "Content-Encoding"; Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java =================================================================== --- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (revision 1523958) +++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (working copy) @@ -150,7 +150,12 @@ haveSeenNonContinueStatus= code != 100; // 100 is "Continue" } - readPlainContent(in); + String transferEncoding = getHeader(Response.TRANSFER_ENCODING); + if(transferEncoding != null && "chunked".equalsIgnoreCase(transferEncoding.trim())){ + readChunkedContent(in, line); + }else{ + readPlainContent(in); + } String contentEncoding = getHeader(Response.CONTENT_ENCODING); if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { @@ -234,7 +239,92 @@ } content = out.toByteArray(); } + /** + * + * @param in + * @param line + * @throws HttpException + * @throws IOException + */ + @SuppressWarnings("unused") + private void readChunkedContent(PushbackInputStream in, + StringBuffer line) + throws HttpException, IOException { + boolean doneChunks= false; + int contentBytesRead= 0; + byte[] bytes = new byte[Http.BUFFER_SIZE]; + ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); + while (!doneChunks) { + if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace("Http: starting chunk"); + } + + readLine(in, line, false); + + String chunkLenStr; + // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); } + + int pos= line.indexOf(";"); + if (pos < 0) { + chunkLenStr= line.toString(); + } else { + chunkLenStr= line.substring(0, pos); + // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + line.substring(pos+1)); } + } + chunkLenStr= chunkLenStr.trim(); + int chunkLen; + try { + 
chunkLen= Integer.parseInt(chunkLenStr, 16); + } catch (NumberFormatException e){ + throw new HttpException("bad chunk length: "+line.toString()); + } + + if (chunkLen == 0) { + doneChunks= true; + break; + } + + if ( (contentBytesRead + chunkLen) > http.getMaxContent() ) + chunkLen= http.getMaxContent() - contentBytesRead; + + // read one chunk + int chunkBytesRead= 0; + while (chunkBytesRead < chunkLen) { + + int toRead= (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? + (chunkLen - chunkBytesRead) : Http.BUFFER_SIZE; + int len= in.read(bytes, 0, toRead); + + if (len == -1) + throw new HttpException("chunk eof after " + contentBytesRead + + " bytes in successful chunks" + + " and " + chunkBytesRead + + " in current chunk"); + + // DANGER!!! Will printed GZIPed stuff right to your + // terminal! + // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0, len)); } + + out.write(bytes, 0, len); + chunkBytesRead+= len; + } + + readLine(in, line, false); + + } + + if (!doneChunks) { + if (contentBytesRead != http.getMaxContent()) + throw new HttpException("chunk eof: !doneChunk && didn't max out"); + return; + } + + content = out.toByteArray(); + parseHeaders(in, line); + + } + private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { readLine(in, line, false); ) > can't fetch page if http response header contains Transfer-Encoding:chunked > --------------------------------------------------------------------------- > > Key: NUTCH-1736 > URL: https://issues.apache.org/jira/browse/NUTCH-1736 > Project: Nutch > Issue Type: Bug > Components: protocol > Affects Versions: 1.6, 2.1, 1.7, 2.2, 2.3, 1.8, 2.4, 1.9, 2.2.1 > Reporter: ysc > Priority: Critical > Original Estimate: 24h > Remaining Estimate: 24h > > fetching: > http://szs.mof.gov.cn/zhengwuxinxi/zhengcefabu/201402/t20140224_1046354.html > Fetch failed with protocol status: EXCEPTION: java.io.IOException: > unzipBestEffort returned null -- This message 
was sent by Atlassian JIRA (v6.2#6252)