Author: ansell
Date: Mon Sep 3 23:21:44 2012
New Revision: 1380400
URL: http://svn.apache.org/viewvc?rev=1380400&view=rev
Log:
ANY23-118 : Split encoding detection out into its own module
Added:
incubator/any23/trunk/encoding/
incubator/any23/trunk/encoding/pom.xml
incubator/any23/trunk/encoding/src/
incubator/any23/trunk/encoding/src/main/
incubator/any23/trunk/encoding/src/main/java/
incubator/any23/trunk/encoding/src/main/java/org/
incubator/any23/trunk/encoding/src/main/java/org/apache/
incubator/any23/trunk/encoding/src/main/java/org/apache/any23/
incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/
incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
incubator/any23/trunk/encoding/src/test/
incubator/any23/trunk/encoding/src/test/java/
incubator/any23/trunk/encoding/src/test/java/org/
incubator/any23/trunk/encoding/src/test/java/org/apache/
incubator/any23/trunk/encoding/src/test/java/org/apache/any23/
incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/
incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
Removed:
incubator/any23/trunk/core/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
incubator/any23/trunk/core/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
Modified:
incubator/any23/trunk/core/pom.xml
incubator/any23/trunk/pom.xml
Modified: incubator/any23/trunk/core/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/any23/trunk/core/pom.xml?rev=1380400&r1=1380399&r2=1380400&view=diff
==============================================================================
--- incubator/any23/trunk/core/pom.xml (original)
+++ incubator/any23/trunk/core/pom.xml Mon Sep 3 23:21:44 2012
@@ -47,6 +47,11 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-encoding</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
<artifactId>apache-any23-nquads</artifactId>
<version>${project.version}</version>
<scope>test</scope>
Added: incubator/any23/trunk/encoding/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/any23/trunk/encoding/pom.xml?rev=1380400&view=auto
==============================================================================
--- incubator/any23/trunk/encoding/pom.xml (added)
+++ incubator/any23/trunk/encoding/pom.xml Mon Sep 3 23:21:44 2012
@@ -0,0 +1,40 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <artifactId>apache-any23</artifactId>
+ <groupId>org.apache.any23</groupId>
+ <version>0.7.1-incubating-SNAPSHOT</version>
+ <relativePath>..</relativePath>
+ </parent>
+ <artifactId>apache-any23-encoding</artifactId>
+ <name>Apache Any23 :: Encoding Detection</name>
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-api</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-test-resources</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
Added: incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
URL:
http://svn.apache.org/viewvc/incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java?rev=1380400&view=auto
==============================================================================
--- incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java (added)
+++ incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java Mon Sep 3 23:21:44 2012
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.encoding;
+
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * An implementation of {@link EncodingDetector} based on
+ * <a href="http://tika.apache.org/">Apache Tika</a>.
+ *
+ * @author Michele Mostarda ( [email protected] )
+ * @author Davide Palmisano ( [email protected] )
+ * @version $Id$
+ */
+public class TikaEncodingDetector implements EncodingDetector {
+
+    /**
+     * Guesses the character encoding of the given stream using Tika's
+     * {@link CharsetDetector} with the detector's input filter enabled.
+     *
+     * @param is the stream to inspect; it is wrapped in a
+     *           {@link BufferedInputStream} unless it already is one.
+     * @return the name of the best matching charset.
+     * @throws IOException if the stream cannot be read, or if the detector
+     *                     reports no matching charset.
+     */
+    public String guessEncoding(InputStream is) throws IOException {
+        final InputStream buffered =
+                is instanceof BufferedInputStream ? is : new BufferedInputStream(is);
+        CharsetDetector charsetDetector = new CharsetDetector();
+        charsetDetector.setText(buffered);
+        charsetDetector.enableInputFilter(true);
+        CharsetMatch cm = charsetDetector.detect();
+        // detect() may return null when no charset matches; fail with an
+        // explicit, declared exception instead of a bare NullPointerException.
+        if (cm == null) {
+            throw new IOException("Unable to detect the encoding of the input stream.");
+        }
+        return cm.getName();
+    }
+
+}
Added: incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
URL:
http://svn.apache.org/viewvc/incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java?rev=1380400&view=auto
==============================================================================
--- incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java (added)
+++ incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java Mon Sep 3 23:21:44 2012
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.encoding;
+
+import junit.framework.Assert;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Test case for {@link TikaEncodingDetector}.
+ *
+ * @author Michele Mostarda ( [email protected] )
+ * @author Davide Palmisano ( [email protected] )
+ * @version $Id$
+ */
+public class TikaEncodingDetectorTest {
+
+    // Detector under test; a fresh instance is created before each test method.
+    private TikaEncodingDetector detector;
+
+    @Before
+    public void setUp() {
+        detector = new TikaEncodingDetector();
+    }
+
+    @After
+    public void tearDown() {
+        detector = null;
+    }
+
+    @Test
+    public void testISO8859HTML() throws IOException {
+        assertEncoding( "ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.html" );
+    }
+
+    @Test
+    public void testISO8859XHTML() throws IOException {
+        assertEncoding( "ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.xhtml" );
+    }
+
+    @Test
+    public void testUTF8AfterTitle() throws IOException {
+        assertEncoding( "UTF-8", "/microformats/xfn/encoding-utf-8-after-title.html" );
+    }
+
+    @Test
+    public void testUTF8HTML() throws IOException {
+        assertEncoding( "UTF-8", "/microformats/xfn/encoding-utf-8.html" );
+    }
+
+    @Test
+    public void testUTF8XHTML() throws IOException {
+        assertEncoding( "UTF-8", "/microformats/xfn/encoding-utf-8.xhtml" );
+    }
+
+    @Test
+    public void testEncodingHTML() throws IOException {
+        assertEncoding( "UTF-8", "/html/encoding-test.html" );
+    }
+
+    /**
+     * Loads the given classpath resource, runs the detector over it and
+     * checks the guessed encoding.
+     *
+     * @param expected the encoding name the detector is expected to return.
+     * @param resource the classpath location of the document to inspect.
+     * @throws IOException if the resource cannot be read or closed.
+     */
+    private void assertEncoding(final String expected, final String resource) throws IOException {
+        InputStream fis = this.getClass().getResourceAsStream(resource);
+        // getResourceAsStream returns null for a missing resource; report it
+        // by name instead of failing later with a NullPointerException.
+        Assert.assertNotNull( "Test resource not found on classpath: " + resource, fis );
+        try {
+            String encoding = detector.guessEncoding(fis);
+            Assert.assertEquals( "Unexpected encoding", expected, encoding );
+        } finally {
+            fis.close();
+        }
+    }
+
+}
Modified: incubator/any23/trunk/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/any23/trunk/pom.xml?rev=1380400&r1=1380399&r2=1380400&view=diff
==============================================================================
--- incubator/any23/trunk/pom.xml (original)
+++ incubator/any23/trunk/pom.xml Mon Sep 3 23:21:44 2012
@@ -193,6 +193,7 @@
<module>nquads</module>
<module>csvutils</module>
<module>mime</module>
+ <module>encoding</module>
<module>core</module>
<module>plugins/basic-crawler</module>
<module>plugins/html-scraper</module>