I couldn't find a better option for doing encoding detection, and the ICU
library is generally well regarded. Do you have a better suggestion?

On Mon, Jun 23, 2008 at 1:32 PM, David Primmer <[EMAIL PROTECTED]> wrote:

> This is 4+megs of jars. Is an addition of this size really necessary
> for this task?
>
> davep
>
> On Wed, Jun 18, 2008 at 1:20 AM,  <[EMAIL PROTECTED]> wrote:
> > Author: etnu
> > Date: Wed Jun 18 01:19:58 2008
> > New Revision: 669108
> >
> > URL: http://svn.apache.org/viewvc?rev=669108&view=rev
> > Log:
> > Applied SHINDIG-391, which adds more robust character encoding detection for HttpResponse using ICU, from Patrick Fairbank.
> >
> >
> > Modified:
> >    incubator/shindig/trunk/java/gadgets/pom.xml
> >    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
> >    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
> >    incubator/shindig/trunk/pom.xml
> >
> > Modified: incubator/shindig/trunk/java/gadgets/pom.xml
> > URL:
> http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/pom.xml?rev=669108&r1=669107&r2=669108&view=diff
> >
> ==============================================================================
> > --- incubator/shindig/trunk/java/gadgets/pom.xml (original)
> > +++ incubator/shindig/trunk/java/gadgets/pom.xml Wed Jun 18 01:19:58 2008
> > @@ -135,7 +135,7 @@
> >       <artifactId>core</artifactId>
> >       <scope>compile</scope>
> >     </dependency>
> > -               <dependency>
> > +    <dependency>
> >       <groupId>com.google.code.google-collections</groupId>
> >       <artifactId>google-collect</artifactId>
> >     </dependency>
> > @@ -153,9 +153,14 @@
> >       <artifactId>jetty</artifactId>
> >       <scope>test</scope>
> >     </dependency>
> > -               <dependency>
> > +    <dependency>
> >       <groupId>rome</groupId>
> >       <artifactId>rome</artifactId>
> >     </dependency>
> > +    <dependency>
> > +      <groupId>com.ibm.icu</groupId>
> > +      <artifactId>icu4j</artifactId>
> > +      <scope>compile</scope>
> > +    </dependency>
> >   </dependencies>
> >  </project>
> >
> > Modified:
> incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
> > URL:
> http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java?rev=669108&r1=669107&r2=669108&view=diff
> >
> ==============================================================================
> > ---
> incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
> (original)
> > +++
> incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
> Wed Jun 18 01:19:58 2008
> > @@ -19,6 +19,9 @@
> >
> >  import org.apache.shindig.common.util.DateUtil;
> >
> > +import com.ibm.icu.text.CharsetDetector;
> > +import com.ibm.icu.text.CharsetMatch;
> > +
> >  import java.io.ByteArrayInputStream;
> >  import java.io.InputStream;
> >  import java.nio.ByteBuffer;
> > @@ -28,8 +31,10 @@
> >  import java.util.Collections;
> >  import java.util.Date;
> >  import java.util.HashMap;
> > +import java.util.HashSet;
> >  import java.util.List;
> >  import java.util.Map;
> > +import java.util.Set;
> >  import java.util.TreeMap;
> >  import java.util.concurrent.ConcurrentHashMap;
> >
> > @@ -42,6 +47,9 @@
> >   public final static int SC_NOT_FOUND = 404;
> >   public final static int SC_INTERNAL_SERVER_ERROR = 500;
> >   public final static int SC_TIMEOUT = 504;
> > +  private final static Set<String> BINARY_CONTENT_TYPES = new
> HashSet<String>(Arrays.asList(
> > +    "image/jpeg", "image/png", "image/gif", "image/jpg",
> "application/x-shockwave-flash"
> > +  ));
> >
> >   private final int httpStatusCode;
> >   private static final String DEFAULT_ENCODING = "UTF-8";
> > @@ -142,6 +150,9 @@
> >     String contentType = getHeader("Content-Type");
> >     if (contentType != null) {
> >       String[] parts = contentType.split(";");
> > +      if (BINARY_CONTENT_TYPES.contains(parts[0])) {
> > +        return DEFAULT_ENCODING;
> > +      }
> >       if (parts.length == 2) {
> >         int offset = parts[1].indexOf("charset=");
> >         if (offset != -1) {
> > @@ -149,7 +160,12 @@
> >         }
> >       }
> >     }
> > -    return DEFAULT_ENCODING;
> > +
> > +    // If the header doesn't specify the charset, try to determine it by
> examining the content.
> > +    CharsetDetector detector = new CharsetDetector();
> > +    detector.setText(responseBytes);
> > +    CharsetMatch match = detector.detect();
> > +    return match.getName().toUpperCase();
> >   }
> >
> >   public int getHttpStatusCode() {
> >
> > Modified:
> incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
> > URL:
> http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java?rev=669108&r1=669107&r2=669108&view=diff
> >
> ==============================================================================
> > ---
> incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
> (original)
> > +++
> incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
> Wed Jun 18 01:19:58 2008
> > @@ -25,12 +25,30 @@
> >
> >  import java.util.Arrays;
> >  import java.util.HashMap;
> > -import java.util.Iterator;
> >  import java.util.LinkedList;
> >  import java.util.List;
> >  import java.util.Map;
> >
> >  public class HttpResponseTest extends TestCase {
> > +  private final static byte[] UTF8_DATA = new byte[] {
> > +    (byte)0xEF, (byte)0xBB, (byte)0xBF, 'h', 'e', 'l', 'l', 'o'
> > +  };
> > +  private final static String UTF8_STRING = "hello";
> > +
> > +  // A large string is needed for accurate charset detection.
> > +  private final static byte[] LATIN1_DATA = new byte[] {
> > +    'G', 'a', 'm', 'e', 's', ',', ' ', 'H', 'Q', ',', ' ', 'M', 'a',
> 'n', 'g', (byte)0xE1, ',', ' ',
> > +    'A', 'n', 'i', 'm', 'e', ' ', 'e', ' ', 't', 'u', 'd', 'o', ' ',
> 'q', 'u', 'e', ' ', 'u', 'm',
> > +    ' ', 'b', 'o', 'm', ' ', 'n', 'e', 'r', 'd', ' ', 'a', 'm', 'a'
> > +  };
> > +  private final static String LATIN1_STRING
> > +      = "Games, HQ, Mang\u00E1, Anime e tudo que um bom nerd ama";
> > +
> > +  private final static byte[] BIG5_DATA = new byte[] {
> > +    (byte)0xa7, (byte)0x41, (byte)0xa6, (byte)0x6e
> > +  };
> > +  private final static String BIG5_STRING = "\u4F60\u597D";
> > +
> >   private Map<String, List<String>> headers;
> >
> >   @Override
> > @@ -56,32 +74,60 @@
> >
> >   public void testEncodingDetectionUtf8WithBom() throws Exception {
> >     // Input is UTF-8 with BOM.
> > -    byte[] data = new byte[] {
> > -      (byte)0xEF, (byte)0xBB, (byte)0xBF, 'h', 'e', 'l', 'l', 'o'
> > -    };
> >     addHeader("Content-Type", "text/plain; charset=UTF-8");
> > -    HttpResponse response = new HttpResponse(200, data, headers);
> > -    assertEquals("hello", response.getResponseAsString());
> > +    HttpResponse response = new HttpResponse(200, UTF8_DATA, headers);
> > +    assertEquals(UTF8_STRING, response.getResponseAsString());
> >   }
> >
> >   public void testEncodingDetectionLatin1() throws Exception {
> >     // Input is a basic latin-1 string with 1 non-UTF8 compatible char.
> > -    byte[] data = new byte[] {
> > -      'h', (byte)0xE9, 'l', 'l', 'o'
> > -    };
> >     addHeader("Content-Type", "text/plain; charset=iso-8859-1");
> > -    HttpResponse response = new HttpResponse(200, data, headers);
> > -    assertEquals("h\u00E9llo", response.getResponseAsString());
> > +    HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers);
> > +    assertEquals(LATIN1_STRING, response.getResponseAsString());
> >   }
> >
> >   public void testEncodingDetectionBig5() throws Exception {
> > -    byte[] data = new byte[] {
> > -      (byte)0xa7, (byte)0x41, (byte)0xa6, (byte)0x6e
> > -    };
> >     addHeader("Content-Type", "text/plain; charset=BIG5");
> > -    HttpResponse response = new HttpResponse(200, data, headers);
> > -    String resp = response.getResponseAsString();
> > -    assertEquals("\u4F60\u597D", response.getResponseAsString());
> > +    HttpResponse response = new HttpResponse(200, BIG5_DATA, headers);
> > +    assertEquals(BIG5_STRING, response.getResponseAsString());
> > +  }
> > +
> > +  public void testEncodingDetectionUtf8WithBomNoCharsetSpecified()
> throws Exception {
> > +    addHeader("Content-Type", "text/plain");
> > +    HttpResponse response = new HttpResponse(200, UTF8_DATA, headers);
> > +    assertEquals("UTF-8", response.getEncoding().toUpperCase());
> > +    assertEquals(UTF8_STRING, response.getResponseAsString());
> > +  }
> > +
> > +  public void testEncodingDetectionLatin1NoCharsetSpecified() throws
> Exception {
> > +    addHeader("Content-Type", "text/plain;");
> > +    HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers);
> > +    assertEquals("ISO-8859-1", response.getEncoding().toUpperCase());
> > +    assertEquals(LATIN1_STRING, response.getResponseAsString());
> > +  }
> > +
> > +  public void testEncodingDetectionUtf8WithBomNoContentHeader() throws
> Exception {
> > +    HttpResponse response = new HttpResponse(200, UTF8_DATA, headers);
> > +    assertEquals("UTF-8", response.getEncoding().toUpperCase());
> > +    assertEquals(UTF8_STRING, response.getResponseAsString());
> > +  }
> > +
> > +  public void testEncodingDetectionLatin1NoContentHeader() throws
> Exception {
> > +    HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers);
> > +    assertEquals("ISO-8859-1", response.getEncoding().toUpperCase());
> > +    assertEquals(LATIN1_STRING, response.getResponseAsString());
> > +  }
> > +
> > +  public void testGetEncodingForImageContentType() throws Exception {
> > +    addHeader("Content-Type", "image/png; charset=iso-8859-1");
> > +    HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers);
> > +    assertEquals("UTF-8", response.getEncoding().toUpperCase());
> > +  }
> > +
> > +  public void testGetEncodingForFlashContentType() throws Exception {
> > +    addHeader("Content-Type", "application/x-shockwave-flash;
> charset=iso-8859-1");
> > +    HttpResponse response = new HttpResponse(200, LATIN1_DATA, headers);
> > +    assertEquals("UTF-8", response.getEncoding().toUpperCase());
> >   }
> >
> >   public void testPreserveBinaryData() throws Exception {
> > @@ -98,12 +144,14 @@
> >     addHeader("Cache-Control", "no-cache");
> >     HttpResponse response = new HttpResponse(200, new byte[0], headers);
> >     assertTrue(response.isStrictNoCache());
> > +    assertEquals(-1, response.getCacheExpiration());
> >   }
> >
> >   public void testStrictPragmaNoCache() throws Exception {
> >     addHeader("Pragma", "no-cache");
> >     HttpResponse response = new HttpResponse(200, new byte[0], headers);
> >     assertTrue(response.isStrictNoCache());
> > +    assertEquals(-1, response.getCacheExpiration());
> >   }
> >
> >   public void testStrictPragmaJunk() throws Exception {
> > @@ -118,7 +166,17 @@
> >     long time = ((System.currentTimeMillis() / 1000) * 1000) + 10000L;
> >     addHeader("Expires", DateUtil.formatDate(time));
> >     HttpResponse response = new HttpResponse(200, new byte[0], headers);
> > -    assertEquals(time, response.getExpiration());
> > +    assertEquals(time, response.getCacheExpiration());
> > +  }
> > +
> > +  public void testMaxAge() throws Exception {
> > +    int maxAge = 10;
> > +    long expected = ((System.currentTimeMillis() / 1000) * 1000) +
> (maxAge * 1000);
> > +    addHeader("Cache-Control", "public, max-age=" + maxAge);
> > +    HttpResponse response = new HttpResponse(200, new byte[0], headers);
> > +    long expiration = response.getCacheExpiration();
> > +    assertTrue("getExpiration is less than start time + maxAge",
> expiration >= expected);
> > +    assertTrue("getExpiration is too high.", expiration <= expected +
> 1000);
> >   }
> >
> >   public void testNegativeCaching() {
> >
> > Modified: incubator/shindig/trunk/pom.xml
> > URL:
> http://svn.apache.org/viewvc/incubator/shindig/trunk/pom.xml?rev=669108&r1=669107&r2=669108&view=diff
> >
> ==============================================================================
> > --- incubator/shindig/trunk/pom.xml (original)
> > +++ incubator/shindig/trunk/pom.xml Wed Jun 18 01:19:58 2008
> > @@ -832,11 +832,16 @@
> >         <artifactId>joda-time</artifactId>
> >         <version>1.5.2</version>
> >       </dependency>
> > -                       <dependency>
> > +      <dependency>
> >         <groupId>rome</groupId>
> >         <artifactId>rome</artifactId>
> >         <version>0.9</version>
> > -                 </dependency>
> > +      </dependency>
> > +      <dependency>
> > +        <groupId>com.ibm.icu</groupId>
> > +        <artifactId>icu4j</artifactId>
> > +        <version>3.8</version>
> > +      </dependency>
> >     </dependencies>
> >   </dependencyManagement>
> >  </project>
> >
> >
> >
>

Reply via email to