Author: jnioche
Date: Wed Jun 11 15:56:20 2014
New Revision: 1601937

URL: http://svn.apache.org/r1601937
Log:
NUTCH-1736 Can't fetch page if http response header contains Transfer-Encoding:chunked

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java
    nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1601937&r1=1601936&r2=1601937&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Jun 11 15:56:20 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1736 Can't fetch page if http response header contains Transfer-Encoding:chunked (ysc via jnioche)
+
 * NUTCH-1782 NodeWalker to return current node (markus)
 
 * NUTCH-1781 Update gora-*-mapping.xml and gora.proeprties to reflect Gora 0.4 (lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1601937&r1=1601936&r2=1601937&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java Wed Jun 11 15:56:20 2014
@@ -28,6 +28,7 @@ package org.apache.nutch.metadata;
  * @author Jérôme Charron
  */
 public interface HttpHeaders {
+  public final static String TRANSFER_ENCODING = "Transfer-Encoding";
 
   public final static String CONTENT_ENCODING = "Content-Encoding";
 

Modified: nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1601937&r1=1601936&r2=1601937&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Wed Jun 11 15:56:20 2014
@@ -196,7 +196,13 @@ public class HttpResponse implements Res
         haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
       }
 
-      readPlainContent(in);
+      String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+      if (transferEncoding != null
+          && "chunked".equalsIgnoreCase(transferEncoding.trim())) {
+        readChunkedContent(in, line);
+      } else {
+        readPlainContent(in);
+      }
 
       String contentEncoding = getHeader(Response.CONTENT_ENCODING);
       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
@@ -281,6 +287,93 @@ public class HttpResponse implements Res
     content = out.toByteArray();
   }
 
+  /**
+   * 
+   * @param in
+   * @param line
+   * @throws HttpException
+   * @throws IOException
+   */
+  @SuppressWarnings("unused")
+  private void readChunkedContent(PushbackInputStream in, StringBuffer line)
+      throws HttpException, IOException {
+    boolean doneChunks = false;
+    int contentBytesRead = 0;
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+
+    while (!doneChunks) {
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace("Http: starting chunk");
+      }
+
+      readLine(in, line, false);
+
+      String chunkLenStr;
+      // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
+      // }
+
+      int pos = line.indexOf(";");
+      if (pos < 0) {
+        chunkLenStr = line.toString();
+      } else {
+        chunkLenStr = line.substring(0, pos);
+        // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
+        // line.substring(pos+1)); }
+      }
+      chunkLenStr = chunkLenStr.trim();
+      int chunkLen;
+      try {
+        chunkLen = Integer.parseInt(chunkLenStr, 16);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad chunk length: " + line.toString());
+      }
+
+      if (chunkLen == 0) {
+        doneChunks = true;
+        break;
+      }
+
+      if (http.getMaxContent() >= 0
+          && (contentBytesRead + chunkLen) > http.getMaxContent())
+        chunkLen = http.getMaxContent() - contentBytesRead;
+
+      // read one chunk
+      int chunkBytesRead = 0;
+      while (chunkBytesRead < chunkLen) {
+
+        int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? (chunkLen - chunkBytesRead)
+            : Http.BUFFER_SIZE;
+        int len = in.read(bytes, 0, toRead);
+
+        if (len == -1)
+          throw new HttpException("chunk eof after " + contentBytesRead
+              + " bytes in successful chunks" + " and " + chunkBytesRead
+              + " in current chunk");
+
+        // DANGER!!! Will printed GZIPed stuff right to your
+        // terminal!
+        // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
+        // len)); }
+
+        out.write(bytes, 0, len);
+        chunkBytesRead += len;
+      }
+
+      readLine(in, line, false);
+    }
+
+    if (!doneChunks) {
+      if (contentBytesRead != http.getMaxContent())
+        throw new HttpException("chunk eof: !doneChunk && didn't max out");
+      return;
+    }
+
+    content = out.toByteArray();
+    parseHeaders(in, line);
+
+  }
+  
   private int parseStatusLine(PushbackInputStream in, StringBuffer line)
     throws IOException, HttpException {
     readLine(in, line, false);


Reply via email to