Author: etnu
Date: Wed Nov 26 11:10:41 2008
New Revision: 720947

URL: http://svn.apache.org/viewvc?rev=720947&view=rev
Log:
Created an optional short circuit for performing encoding detection. This makes 
a huge difference in CPU usage when dealing with large numbers of responses 
that do not specify their encoding. The biggest gains are found when using the 
optional flag that skips ICU. However, if remote sites frequently fail to return 
Content-Type headers and serve content that is neither UTF-8 nor ISO-8859-1, the 
result will be garbage.


Added:
    
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/
    
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
    
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/
    
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
Modified:
    incubator/shindig/trunk/java/common/conf/shindig.properties
    
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
    
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java

Modified: incubator/shindig/trunk/java/common/conf/shindig.properties
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/common/conf/shindig.properties?rev=720947&r1=720946&r2=720947&view=diff
==============================================================================
--- incubator/shindig/trunk/java/common/conf/shindig.properties (original)
+++ incubator/shindig/trunk/java/common/conf/shindig.properties Wed Nov 26 
11:10:41 2008
@@ -47,3 +47,7 @@
 
 # true to enable JMX stats.
 shindig.cache.ehcache.jmx.stats=true
+
+# true to skip expensive encoding detection.
+# If true, only attempts to validate UTF-8 and assumes that everything else is ISO-8859-1.
+shindig.http.fast-encoding-detection=true

Added: 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java?rev=720947&view=auto
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
 (added)
+++ 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
 Wed Nov 26 11:10:41 2008
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.shindig.gadgets.encoding;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
+/**
+ * Attempts to determine the character encoding of a given byte array.
+ *
+ * Highly skewed towards common encodings (UTF-8 and Latin-1).
+ */
+public class EncodingDetector {
+
+  /**
+   * Returns the detected encoding of the given byte array.
+   *
+   * @param input The data to detect the encoding for. Must not be null.
+   * @param assume88591IfNotUtf8 True to assume that the encoding is ISO-8859-1 (the standard
+   *     encoding for HTTP) if the bytes are not valid UTF-8. Only recommended if you can
+   *     reasonably expect that other encodings are going to be specified. Full encoding
+   *     detection is very expensive!
+   * @return The detected encoding, in upper case.
+   */
+  public static String detectEncoding(byte[] input, boolean assume88591IfNotUtf8) {
+    if (looksLikeValidUtf8(input)) {
+      return "UTF-8";
+    }
+
+    if (assume88591IfNotUtf8) {
+      return "ISO-8859-1";
+    }
+
+    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
+    CharsetDetector detector = new CharsetDetector();
+    detector.setText(input);
+    CharsetMatch match = detector.detect();
+    if (match == null) {
+      // ICU found no candidate at all. ISO-8859-1 can decode any byte sequence, so use it
+      // rather than failing with a NullPointerException.
+      return "ISO-8859-1";
+    }
+    // Upper-case with a fixed locale: default-locale casing rules (e.g. the Turkish dotted I)
+    // would otherwise corrupt names such as "windows-1252".
+    return match.getName().toUpperCase(java.util.Locale.ENGLISH);
+  }
+
+  /**
+   * A pretty good test that something is UTF-8. There are many sequences that will pass here
+   * that aren't valid UTF-8 due to the requirement that the shortest possible sequence always
+   * be used. We're ok with this behavior because the main goal is speed.
+   *
+   * Input that ends in the middle of a multi-byte sequence is rejected.
+   *
+   * @param input The bytes to examine. An optional leading UTF-8 byte order mark is skipped.
+   * @return True if every byte is ASCII or part of a structurally complete multi-byte sequence.
+   */
+  private static boolean looksLikeValidUtf8(byte[] input) {
+    int i = 0;
+    if (input.length >= 3 &&
+        (input[0] & 0xFF) == 0xEF &&
+        (input[1] & 0xFF) == 0xBB &&
+        (input[2] & 0xFF) == 0xBF) {
+      // Skip BOM.
+      i = 3;
+    }
+
+    int endOfSequence;
+    for (int j = input.length; i < j; ++i) {
+      int bite = input[i];
+      if ((bite & 0x80) == 0) {
+        continue; // ASCII
+      }
+
+      // Determine the index of the last byte in the sequence from the lead byte.
+      if ((bite & 0x0E0) == 0x0C0) {
+        endOfSequence = i + 1;
+      } else if ((bite & 0x0F0) == 0x0E0) {
+        endOfSequence = i + 2;
+      } else if ((bite & 0x0F8) == 0xF0) {
+        endOfSequence = i + 3;
+      } else {
+        // Not a valid utf-8 lead byte.
+        return false;
+      }
+
+      if (endOfSequence >= j) {
+        // The input ends before the sequence is complete. Without this check the loop below
+        // would read past the end of the array (e.g. Latin-1 text ending in 0xE9).
+        return false;
+      }
+
+      while (i < endOfSequence) {
+        i++;
+        bite = input[i];
+        if ((bite & 0xC0) != 0x80) {
+          // Not a continuation byte (10xxxxxx), so not a valid sequence.
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+}

Modified: 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java?rev=720947&r1=720946&r2=720947&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
 Wed Nov 26 11:10:41 2008
@@ -18,14 +18,13 @@
 package org.apache.shindig.gadgets.http;
 
 import org.apache.shindig.common.util.DateUtil;
+import org.apache.shindig.gadgets.encoding.EncodingDetector;
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import com.ibm.icu.text.CharsetDetector;
-import com.ibm.icu.text.CharsetMatch;
 
 import java.io.ByteArrayInputStream;
 import java.io.Externalizable;
@@ -119,12 +118,15 @@
 
   static final String DEFAULT_ENCODING = "UTF-8";
 
-  @Inject @Named("shindig.cache.http.negativeCacheTtl")
+  @Inject(optional = true) @Named("shindig.cache.http.negativeCacheTtl")
   private static long negativeCacheTtl = DEFAULT_NEGATIVE_CACHE_TTL;
 
-  @Inject @Named("shindig.cache.http.defaultTtl")
+  @Inject(optional = true) @Named("shindig.cache.http.defaultTtl")
   private static long defaultTtl = DEFAULT_TTL;
 
+  @Inject(optional = true) @Named("shindig.http.fast-encoding-detection")
+  private static boolean fastEncodingDetection = true;
+
   // Holds character sets for fast conversion
   private static final Map<String, Charset> encodingToCharset = 
Maps.newConcurrentHashMap();
 
@@ -412,6 +414,10 @@
    * @return The detected encoding or DEFAULT_ENCODING.
    */
   private static String getAndUpdateEncoding(Map<String, List<String>> 
headers, byte[] body) {
+    if (body == null || body.length == 0) {
+      return DEFAULT_ENCODING;
+    }
+
     List<String> values = headers.get("Content-Type");
     String contentType = values == null ? null : values.isEmpty() ? null : 
values.get(0);
     if (contentType != null) {
@@ -431,24 +437,16 @@
           return charset;
         }
       }
-    }
-
-    if (body == null || body.length == 0) {
-      return DEFAULT_ENCODING;
-    }
-
-    // If the header doesn't specify the charset, try to determine it by 
examining the content.
-    CharsetDetector detector = new CharsetDetector();
-    detector.setText(body);
-    CharsetMatch match = detector.detect();
-
-    if (contentType != null) {
+      String encoding = EncodingDetector.detectEncoding(body, 
fastEncodingDetection);
       // Record the charset in the content-type header so that its value can 
be cached
       // and re-used. This is a BIG performance win.
-      headers.put("Content-Type",
-          Lists.newArrayList(contentType + "; charset=" + 
match.getName().toUpperCase()));
+      headers.put("Content-Type", Lists.newArrayList(contentType + "; 
charset=" + encoding));
+      return encoding;
+    } else {
+      // If no content type was specified, we'll assume an unknown binary type.
+      contentType = "application/octet-stream";
+      return DEFAULT_ENCODING;
     }
-    return match.getName().toUpperCase();
   }
 
   @Override

Added: 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java?rev=720947&view=auto
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
 (added)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
 Wed Nov 26 11:10:41 2008
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.shindig.gadgets.encoding;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+/**
+ * Tests for {@link EncodingDetector#detectEncoding(byte[], boolean)}.
+ *
+ * The boolean argument is the "assume ISO-8859-1 if not UTF-8" flag; when it is false the
+ * detector falls back to full (ICU based) detection for input that is not UTF-8.
+ */
+public class EncodingDetectorTest {
+
+  // Pure ASCII input is expected to be reported as UTF-8.
+  @Test
+  public void asciiAssumesUtf8() throws Exception {
+    byte[] data = "Hello, world".getBytes("US-ASCII");
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+  }
+
+  // Input that starts with the UTF-8 byte order mark (EF BB BF) followed by ASCII is expected
+  // to be reported as UTF-8.
+  @Test
+  public void detectedUtf8WithByteOrderMark() {
+    byte[] data = new byte[] {
+        (byte)0xEF, (byte)0xBB, (byte)0xBF, 'h', 'e', 'l', 'l', 'o'
+    };
+
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+  }
+
+  // BIG5 encoded text with the flag set: expected to be reported as ISO-8859-1.
+  @Test
+  public void assumeLatin1OnInvalidUtf8() throws Exception {
+    byte[] data = "\u4F60\u597D".getBytes("BIG5");
+
+    assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, true));
+  }
+
+  // GB18030 encoded text with the flag cleared: expected to be identified as GB18030.
+  @Test
+  public void doNotAssumeLatin1OnInvalidUtf8() throws Exception {
+    byte[] data = 
("\u6211\u662F\u4E00\u4E2A\u4E0D\u5584\u4E8E\u8BB2\u8BDD\u7684\u4EBA\uFF0C" +
+                   
"\u552F\u5176\u4E0D\u5584\u4E8E\u8BB2\u8BDD\uFF0C\u6709\u601D\u60F3\u8868" +
+                   
"\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
+                   .getBytes("GB18030");
+
+    assertEquals("GB18030", EncodingDetector.detectEncoding(data, false));
+  }
+
+  // The same text as above, but encoded as UTF-8: expected to be reported as UTF-8.
+  @Test
+  public void longUtf8StringIsUtf8() throws Exception {
+    byte[] data = 
("\u6211\u662F\u4E00\u4E2A\u4E0D\u5584\u4E8E\u8BB2\u8BDD\u7684\u4EBA\uFF0C" +
+                   
"\u552F\u5176\u4E0D\u5584\u4E8E\u8BB2\u8BDD\uFF0C\u6709\u601D\u60F3\u8868" +
+                   
"\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
+                   .getBytes("UTF-8");
+
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+  }
+
+  // Mostly ASCII text containing a single non-ASCII character (U+00E1), encoded as UTF-8:
+  // expected to be reported as UTF-8.
+  @Test
+  public void shortUtf8StringIsUtf8() throws Exception {
+    byte[] data = "Games, HQ, Mang\u00E1, Anime e tudo que um bom nerd 
ama".getBytes("UTF-8");
+
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+  }
+}

Modified: 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java?rev=720947&r1=720946&r2=720947&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
 Wed Nov 26 11:10:41 2008
@@ -58,6 +58,7 @@
   public void testGetEncoding() throws Exception {
     HttpResponse response = new HttpResponseBuilder()
         .addHeader("Content-Type", "text/plain; charset=TEST-CHARACTER-SET")
+        .setResponse(new byte[] {'j', 'u', 'n', 'k'})
         .create();
     assertEquals("TEST-CHARACTER-SET", response.getEncoding());
   }
@@ -65,6 +66,7 @@
   public void testGetEncodingQuotes() throws Exception {
     HttpResponse response = new HttpResponseBuilder()
         .addHeader("Content-Type", "text/plain; 
charset=\"TEST-CHARACTER-SET\"")
+        .setResponse(new byte[] {'j', 'u', 'n', 'k'})
         .create();
     assertEquals("TEST-CHARACTER-SET", response.getEncoding());
   }
@@ -124,8 +126,7 @@
      HttpResponse response = new HttpResponseBuilder()
         .setResponse(LATIN1_DATA)
         .create();
-    assertEquals("ISO-8859-1", response.getEncoding().toUpperCase());
-    assertEquals(LATIN1_STRING, response.getResponseAsString());
+    assertEquals(HttpResponse.DEFAULT_ENCODING, response.getEncoding());
   }
 
   public void testGetEncodingForImageContentType() throws Exception {


Reply via email to