I couldn't find a better option for encoding detection, and the ICU library is generally well regarded. Do you have a better suggestion?
On Mon, Jun 23, 2008 at 1:32 PM, David Primmer <[EMAIL PROTECTED]> wrote: > This is 4+megs of jars. Is an addition of this size really necessary > for this task? > > davep > > On Wed, Jun 18, 2008 at 1:20 AM, <[EMAIL PROTECTED]> wrote: > > Author: etnu > > Date: Wed Jun 18 01:19:58 2008 > > New Revision: 669108 > > > > URL: http://svn.apache.org/viewvc?rev=669108&view=rev > > Log: > > Applied SHINDIG-391, which adds more robust character encoding detection > for HttpResponse using ICU, from Patrick Fairbank. > > > > > > Modified: > > incubator/shindig/trunk/java/gadgets/pom.xml > > > > incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java > > > > incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java > > incubator/shindig/trunk/pom.xml > > > > Modified: incubator/shindig/trunk/java/gadgets/pom.xml > > URL: > http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/pom.xml?rev=669108&r1=669107&r2=669108&view=diff > > > ============================================================================== > > --- incubator/shindig/trunk/java/gadgets/pom.xml (original) > > +++ incubator/shindig/trunk/java/gadgets/pom.xml Wed Jun 18 01:19:58 2008 > > @@ -135,7 +135,7 @@ > > <artifactId>core</artifactId> > > <scope>compile</scope> > > </dependency> > > - <dependency> > > + <dependency> > > <groupId>com.google.code.google-collections</groupId> > > <artifactId>google-collect</artifactId> > > </dependency> > > @@ -153,9 +153,14 @@ > > <artifactId>jetty</artifactId> > > <scope>test</scope> > > </dependency> > > - <dependency> > > + <dependency> > > <groupId>rome</groupId> > > <artifactId>rome</artifactId> > > </dependency> > > + <dependency> > > + <groupId>com.ibm.icu</groupId> > > + <artifactId>icu4j</artifactId> > > + <scope>compile</scope> > > + </dependency> > > </dependencies> > > </project> > > > > Modified: > 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java > > URL: > http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java?rev=669108&r1=669107&r2=669108&view=diff > > > ============================================================================== > > --- > incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java > (original) > > +++ > incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java > Wed Jun 18 01:19:58 2008 > > @@ -19,6 +19,9 @@ > > > > import org.apache.shindig.common.util.DateUtil; > > > > +import com.ibm.icu.text.CharsetDetector; > > +import com.ibm.icu.text.CharsetMatch; > > + > > import java.io.ByteArrayInputStream; > > import java.io.InputStream; > > import java.nio.ByteBuffer; > > @@ -28,8 +31,10 @@ > > import java.util.Collections; > > import java.util.Date; > > import java.util.HashMap; > > +import java.util.HashSet; > > import java.util.List; > > import java.util.Map; > > +import java.util.Set; > > import java.util.TreeMap; > > import java.util.concurrent.ConcurrentHashMap; > > > > @@ -42,6 +47,9 @@ > > public final static int SC_NOT_FOUND = 404; > > public final static int SC_INTERNAL_SERVER_ERROR = 500; > > public final static int SC_TIMEOUT = 504; > > + private final static Set<String> BINARY_CONTENT_TYPES = new > HashSet<String>(Arrays.asList( > > + "image/jpeg", "image/png", "image/gif", "image/jpg", > "application/x-shockwave-flash" > > + )); > > > > private final int httpStatusCode; > > private static final String DEFAULT_ENCODING = "UTF-8"; > > @@ -142,6 +150,9 @@ > > String contentType = getHeader("Content-Type"); > > if (contentType != null) { > > String[] parts = contentType.split(";"); > > + if (BINARY_CONTENT_TYPES.contains(parts[0])) { > > + return DEFAULT_ENCODING; > > + } > > if (parts.length == 2) { > > int offset = 
parts[1].indexOf("charset="); > > if (offset != -1) { > > @@ -149,7 +160,12 @@ > > } > > } > > } > > - return DEFAULT_ENCODING; > > + > > + // If the header doesn't specify the charset, try to determine it by > examining the content. > > + CharsetDetector detector = new CharsetDetector(); > > + detector.setText(responseBytes); > > + CharsetMatch match = detector.detect(); > > + return match.getName().toUpperCase(); > > } > > > > public int getHttpStatusCode() { > > > > Modified: > incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java > > URL: > http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java?rev=669108&r1=669107&r2=669108&view=diff > > > ============================================================================== > > --- > incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java > (original) > > +++ > incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java > Wed Jun 18 01:19:58 2008 > > @@ -25,12 +25,30 @@ > > > > import java.util.Arrays; > > import java.util.HashMap; > > -import java.util.Iterator; > > import java.util.LinkedList; > > import java.util.List; > > import java.util.Map; > > > > public class HttpResponseTest extends TestCase { > > + private final static byte[] UTF8_DATA = new byte[] { > > + (byte)0xEF, (byte)0xBB, (byte)0xBF, 'h', 'e', 'l', 'l', 'o' > > + }; > > + private final static String UTF8_STRING = "hello"; > > + > > + // A large string is needed for accurate charset detection. 
> > + private final static byte[] LATIN1_DATA = new byte[] { > > + 'G', 'a', 'm', 'e', 's', ',', ' ', 'H', 'Q', ',', ' ', 'M', 'a', > 'n', 'g', (byte)0xE1, ',', ' ', > > + 'A', 'n', 'i', 'm', 'e', ' ', 'e', ' ', 't', 'u', 'd', 'o', ' ', > 'q', 'u', 'e', ' ', 'u', 'm', > > + ' ', 'b', 'o', 'm', ' ', 'n', 'e', 'r', 'd', ' ', 'a', 'm', 'a' > > + }; > > + private final static String LATIN1_STRING > > + = "Games, HQ, Mang\u00E1, Anime e tudo que um bom nerd ama"; > > + > > + private final static byte[] BIG5_DATA = new byte[] { > > + (byte)0xa7, (byte)0x41, (byte)0xa6, (byte)0x6e > > + }; > > + private final static String BIG5_STRING = "\u4F60\u597D"; > > + > > private Map<String, List<String>> headers; > > > > @Override > > @@ -56,32 +74,60 @@ > > > > public void testEncodingDetectionUtf8WithBom() throws Exception { > > // Input is UTF-8 with BOM. > > - byte[] data = new byte[] { > > - (byte)0xEF, (byte)0xBB, (byte)0xBF, 'h', 'e', 'l', 'l', 'o' > > - }; > > addHeader("Content-Type", "text/plain; charset=UTF-8"); > > - HttpResponse response = new HttpResponse(200, data, headers); > > - assertEquals("hello", response.getResponseAsString()); > > + HttpResponse response = new HttpResponse(200, UTF8_DATA, headers); > > + assertEquals(UTF8_STRING, response.getResponseAsString()); > > } > > > > public void testEncodingDetectionLatin1() throws Exception { > > // Input is a basic latin-1 string with 1 non-UTF8 compatible char. 
> > - byte[] data = new byte[] { > > - 'h', (byte)0xE9, 'l', 'l', 'o' > > - }; > > addHeader("Content-Type", "text/plain; charset=iso-8859-1"); > > - HttpResponse response = new HttpResponse(200, data, headers); > > - assertEquals("h\u00E9llo", response.getResponseAsString()); > > + HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers); > > + assertEquals(LATIN1_STRING, response.getResponseAsString()); > > } > > > > public void testEncodingDetectionBig5() throws Exception { > > - byte[] data = new byte[] { > > - (byte)0xa7, (byte)0x41, (byte)0xa6, (byte)0x6e > > - }; > > addHeader("Content-Type", "text/plain; charset=BIG5"); > > - HttpResponse response = new HttpResponse(200, data, headers); > > - String resp = response.getResponseAsString(); > > - assertEquals("\u4F60\u597D", response.getResponseAsString()); > > + HttpResponse response = new HttpResponse(200, BIG5_DATA, headers); > > + assertEquals(BIG5_STRING, response.getResponseAsString()); > > + } > > + > > + public void testEncodingDetectionUtf8WithBomNoCharsetSpecified() > throws Exception { > > + addHeader("Content-Type", "text/plain"); > > + HttpResponse response = new HttpResponse(200, UTF8_DATA, headers); > > + assertEquals("UTF-8", response.getEncoding().toUpperCase()); > > + assertEquals(UTF8_STRING, response.getResponseAsString()); > > + } > > + > > + public void testEncodingDetectionLatin1NoCharsetSpecified() throws > Exception { > > + addHeader("Content-Type", "text/plain;"); > > + HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers); > > + assertEquals("ISO-8859-1", response.getEncoding().toUpperCase()); > > + assertEquals(LATIN1_STRING, response.getResponseAsString()); > > + } > > + > > + public void testEncodingDetectionUtf8WithBomNoContentHeader() throws > Exception { > > + HttpResponse response = new HttpResponse(200, UTF8_DATA, headers); > > + assertEquals("UTF-8", response.getEncoding().toUpperCase()); > > + assertEquals(UTF8_STRING, 
response.getResponseAsString()); > > + } > > + > > + public void testEncodingDetectionLatin1NoContentHeader() throws > Exception { > > + HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers); > > + assertEquals("ISO-8859-1", response.getEncoding().toUpperCase()); > > + assertEquals(LATIN1_STRING, response.getResponseAsString()); > > + } > > + > > + public void testGetEncodingForImageContentType() throws Exception { > > + addHeader("Content-Type", "image/png; charset=iso-8859-1"); > > + HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers); > > + assertEquals("UTF-8", response.getEncoding().toUpperCase()); > > + } > > + > > + public void testGetEncodingForFlashContentType() throws Exception { > > + addHeader("Content-Type", "application/x-shockwave-flash; > charset=iso-8859-1"); > > + HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers); > > + assertEquals("UTF-8", response.getEncoding().toUpperCase()); > > } > > > > public void testPreserveBinaryData() throws Exception { > > @@ -98,12 +144,14 @@ > > addHeader("Cache-Control", "no-cache"); > > HttpResponse response = new HttpResponse(200, new byte[0], headers); > > assertTrue(response.isStrictNoCache()); > > + assertEquals(-1, response.getCacheExpiration()); > > } > > > > public void testStrictPragmaNoCache() throws Exception { > > addHeader("Pragma", "no-cache"); > > HttpResponse response = new HttpResponse(200, new byte[0], headers); > > assertTrue(response.isStrictNoCache()); > > + assertEquals(-1, response.getCacheExpiration()); > > } > > > > public void testStrictPragmaJunk() throws Exception { > > @@ -118,7 +166,17 @@ > > long time = ((System.currentTimeMillis() / 1000) * 1000) + 10000L; > > addHeader("Expires", DateUtil.formatDate(time)); > > HttpResponse response = new HttpResponse(200, new byte[0], headers); > > - assertEquals(time, response.getExpiration()); > > + assertEquals(time, response.getCacheExpiration()); > > + } > > + > > + public void testMaxAge() 
throws Exception { > > + int maxAge = 10; > > + long expected = ((System.currentTimeMillis() / 1000) * 1000) + > (maxAge * 1000); > > + addHeader("Cache-Control", "public, max-age=" + maxAge); > > + HttpResponse response = new HttpResponse(200, new byte[0], headers); > > + long expiration = response.getCacheExpiration(); > > + assertTrue("getExpiration is less than start time + maxAge", > expiration >= expected); > > + assertTrue("getExpiration is too high.", expiration <= expected + > 1000); > > } > > > > public void testNegativeCaching() { > > > > Modified: incubator/shindig/trunk/pom.xml > > URL: > http://svn.apache.org/viewvc/incubator/shindig/trunk/pom.xml?rev=669108&r1=669107&r2=669108&view=diff > > > ============================================================================== > > --- incubator/shindig/trunk/pom.xml (original) > > +++ incubator/shindig/trunk/pom.xml Wed Jun 18 01:19:58 2008 > > @@ -832,11 +832,16 @@ > > <artifactId>joda-time</artifactId> > > <version>1.5.2</version> > > </dependency> > > - <dependency> > > + <dependency> > > <groupId>rome</groupId> > > <artifactId>rome</artifactId> > > <version>0.9</version> > > - </dependency> > > + </dependency> > > + <dependency> > > + <groupId>com.ibm.icu</groupId> > > + <artifactId>icu4j</artifactId> > > + <version>3.8</version> > > + </dependency> > > </dependencies> > > </dependencyManagement> > > </project> > > > > > > >

