[ 
https://issues.apache.org/jira/browse/NUTCH-1736?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ysc updated NUTCH-1736:
-----------------------

    Comment: was deleted

(was: 1. For Nutch 1.x, you can use the patch below:

#P nutch1.7
Index: src/java/org/apache/nutch/metadata/HttpHeaders.java
===================================================================
--- src/java/org/apache/nutch/metadata/HttpHeaders.java (revision 1573324)
+++ src/java/org/apache/nutch/metadata/HttpHeaders.java (working copy)
@@ -26,6 +26,8 @@
  */
 public interface HttpHeaders {
 
+  public final static String TRANSFER_ENCODING = "Transfer-Encoding";
+       
   public final static String CONTENT_ENCODING = "Content-Encoding";
   
   public final static String CONTENT_LANGUAGE = "Content-Language";
Index: 
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- 
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
  (revision 1573324)
+++ 
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
  (working copy)
@@ -156,9 +156,13 @@
         parseHeaders(in, line);
         haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
       }
+      String transferEncoding = getHeader(Response.TRANSFER_ENCODING); 
+      if(transferEncoding != null && 
"chunked".equalsIgnoreCase(transferEncoding.trim())){       
+         readChunkedContent(in, line);  
+      }else{
+         readPlainContent(in);  
+      }
 
-      readPlainContent(in);
-
       String contentEncoding = getHeader(Response.CONTENT_ENCODING);
       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
         content = http.processGzipEncoded(content, url);
@@ -432,5 +436,4 @@
     in.unread(value);
     return value;
   }
-
 }

2. For Nutch 2.x, you can use the patch below:

#P nutch-2.2.1
Index: src/java/org/apache/nutch/metadata/HttpHeaders.java
===================================================================
--- src/java/org/apache/nutch/metadata/HttpHeaders.java (revision 1523958)
+++ src/java/org/apache/nutch/metadata/HttpHeaders.java (working copy)
@@ -28,6 +28,7 @@
  * @author Jérôme Charron
  */
 public interface HttpHeaders {
+  public final static String TRANSFER_ENCODING = "Transfer-Encoding";
 
   public final static String CONTENT_ENCODING = "Content-Encoding";
 
Index: 
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- 
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
  (revision 1523958)
+++ 
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
  (working copy)
@@ -150,7 +150,12 @@
         haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
       }
 
-      readPlainContent(in);
+         String transferEncoding = getHeader(Response.TRANSFER_ENCODING); 
+         if(transferEncoding != null && 
"chunked".equalsIgnoreCase(transferEncoding.trim())){            
+                readChunkedContent(in, line);  
+         }else{
+                readPlainContent(in);  
+         }
 
       String contentEncoding = getHeader(Response.CONTENT_ENCODING);
       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
@@ -234,7 +239,92 @@
     }
     content = out.toByteArray();
   }
+  /**
+   * 
+   * @param in
+   * @param line
+   * @throws HttpException
+   * @throws IOException
+   */
+  @SuppressWarnings("unused")
+  private void readChunkedContent(PushbackInputStream in,  
+                                  StringBuffer line) 
+    throws HttpException, IOException {
+    boolean doneChunks= false;
+    int contentBytesRead= 0;
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
 
+    while (!doneChunks) {
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace("Http: starting chunk");
+      }
+
+      readLine(in, line, false);
+
+      String chunkLenStr;
+      // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + 
"'"); }
+
+      int pos= line.indexOf(";");
+      if (pos < 0) {
+        chunkLenStr= line.toString();
+      } else {
+        chunkLenStr= line.substring(0, pos);
+        // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + 
line.substring(pos+1)); }
+      }
+      chunkLenStr= chunkLenStr.trim();
+      int chunkLen;
+      try {
+        chunkLen= Integer.parseInt(chunkLenStr, 16);
+      } catch (NumberFormatException e){ 
+        throw new HttpException("bad chunk length: "+line.toString());
+      }
+
+      if (chunkLen == 0) {
+        doneChunks= true;
+        break;
+      }
+
+      if ( (contentBytesRead + chunkLen) > http.getMaxContent() )
+        chunkLen= http.getMaxContent() - contentBytesRead;
+
+      // read one chunk
+      int chunkBytesRead= 0;
+      while (chunkBytesRead < chunkLen) {
+
+        int toRead= (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
+                    (chunkLen - chunkBytesRead) : Http.BUFFER_SIZE;
+        int len= in.read(bytes, 0, toRead);
+
+        if (len == -1) 
+          throw new HttpException("chunk eof after " + contentBytesRead
+                                      + " bytes in successful chunks"
+                                      + " and " + chunkBytesRead 
+                                      + " in current chunk");
+
+        // DANGER!!! Will printed GZIPed stuff right to your
+        // terminal!
+        // if (LOG.isTraceEnabled()) { LOG.trace("read: " +  new String(bytes, 
0, len)); }
+
+        out.write(bytes, 0, len);
+        chunkBytesRead+= len;  
+      }
+
+      readLine(in, line, false);
+
+    }
+
+    if (!doneChunks) {
+      if (contentBytesRead != http.getMaxContent()) 
+        throw new HttpException("chunk eof: !doneChunk && didn't max out");
+      return;
+    }
+
+    content = out.toByteArray();
+    parseHeaders(in, line);
+
+  }
+  
   private int parseStatusLine(PushbackInputStream in, StringBuffer line)
     throws IOException, HttpException {
     readLine(in, line, false);
)

> can't fetch page if http response header contains Transfer-Encoding:chunked
> ---------------------------------------------------------------------------
>
>                 Key: NUTCH-1736
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1736
>             Project: Nutch
>          Issue Type: Bug
>          Components: protocol
>    Affects Versions: 1.6, 2.1, 1.7, 2.2, 2.3, 1.8, 2.4, 1.9, 2.2.1
>            Reporter: ysc
>            Priority: Critical
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> fetching: 
> http://szs.mof.gov.cn/zhengwuxinxi/zhengcefabu/201402/t20140224_1046354.html
> Fetch failed with protocol status: EXCEPTION: java.io.IOException: 
> unzipBestEffort returned null



--
This message was sent by Atlassian JIRA
(v6.2#6252)

Reply via email to