Author: jnioche Date: Wed Jun 11 15:56:20 2014 New Revision: 1601937 URL: http://svn.apache.org/r1601937 Log: NUTCH-1736 Can't fetch page if http response header contains Transfer-Encoding:chunked
Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1601937&r1=1601936&r2=1601937&view=diff ============================================================================== --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Wed Jun 11 15:56:20 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1736 Can't fetch page if http response header contains Transfer-Encoding:chunked (ysc via jnioche) + * NUTCH-1782 NodeWalker to return current node (markus) * NUTCH-1781 Update gora-*-mapping.xml and gora.proeprties to reflect Gora 0.4 (lewismc) Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1601937&r1=1601936&r2=1601937&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java Wed Jun 11 15:56:20 2014 @@ -28,6 +28,7 @@ package org.apache.nutch.metadata; * @author Jérôme Charron */ public interface HttpHeaders { + public final static String TRANSFER_ENCODING = "Transfer-Encoding"; public final static String CONTENT_ENCODING = "Content-Encoding"; Modified: nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1601937&r1=1601936&r2=1601937&view=diff ============================================================================== --- 
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Wed Jun 11 15:56:20 2014 @@ -196,7 +196,13 @@ public class HttpResponse implements Res haveSeenNonContinueStatus= code != 100; // 100 is "Continue" } - readPlainContent(in); + String transferEncoding = getHeader(Response.TRANSFER_ENCODING); + if (transferEncoding != null + && "chunked".equalsIgnoreCase(transferEncoding.trim())) { + readChunkedContent(in, line); + } else { + readPlainContent(in); + } String contentEncoding = getHeader(Response.CONTENT_ENCODING); if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { @@ -281,6 +287,93 @@ public class HttpResponse implements Res content = out.toByteArray(); } + /** + * + * @param in + * @param line + * @throws HttpException + * @throws IOException + */ + @SuppressWarnings("unused") + private void readChunkedContent(PushbackInputStream in, StringBuffer line) + throws HttpException, IOException { + boolean doneChunks = false; + int contentBytesRead = 0; + byte[] bytes = new byte[Http.BUFFER_SIZE]; + ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); + + while (!doneChunks) { + if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace("Http: starting chunk"); + } + + readLine(in, line, false); + + String chunkLenStr; + // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); + // } + + int pos = line.indexOf(";"); + if (pos < 0) { + chunkLenStr = line.toString(); + } else { + chunkLenStr = line.substring(0, pos); + // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + + // line.substring(pos+1)); } + } + chunkLenStr = chunkLenStr.trim(); + int chunkLen; + try { + chunkLen = Integer.parseInt(chunkLenStr, 16); + } catch (NumberFormatException e) { + throw new HttpException("bad chunk length: " + line.toString()); + } + + if (chunkLen == 0) { + 
doneChunks = true; + break; + } + + if (http.getMaxContent() >= 0 + && (contentBytesRead + chunkLen) > http.getMaxContent()) + chunkLen = http.getMaxContent() - contentBytesRead; + + // read one chunk + int chunkBytesRead = 0; + while (chunkBytesRead < chunkLen) { + + int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? (chunkLen - chunkBytesRead) + : Http.BUFFER_SIZE; + int len = in.read(bytes, 0, toRead); + + if (len == -1) + throw new HttpException("chunk eof after " + contentBytesRead + + " bytes in successful chunks" + " and " + chunkBytesRead + + " in current chunk"); + + // DANGER!!! Will printed GZIPed stuff right to your + // terminal! + // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0, + // len)); } + + out.write(bytes, 0, len); + chunkBytesRead += len; + } + + readLine(in, line, false); + } + + if (!doneChunks) { + if (contentBytesRead != http.getMaxContent()) + throw new HttpException("chunk eof: !doneChunk && didn't max out"); + return; + } + + content = out.toByteArray(); + parseHeaders(in, line); + + } + private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { readLine(in, line, false);