Author: btellier
Date: Mon Jun 29 08:46:59 2015
New Revision: 1688147
URL: http://svn.apache.org/r1688147
Log:
MAILBOX-245 Add a Maven profile to avoid loading the Tika JAR
Added:
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractor.java
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/ParsedContent.java
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/TextExtractor.java
james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/extractor/
james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractorTest.java
james/mailbox/trunk/store/src/test/resources/
james/mailbox/trunk/store/src/test/resources/documents/
james/mailbox/trunk/store/src/test/resources/documents/Text.txt
james/mailbox/trunk/store/src/test/resources/documents/writter.docx
james/mailbox/trunk/tika/
james/mailbox/trunk/tika/pom.xml
james/mailbox/trunk/tika/src/
james/mailbox/trunk/tika/src/main/
james/mailbox/trunk/tika/src/main/java/
james/mailbox/trunk/tika/src/main/java/org/
james/mailbox/trunk/tika/src/main/java/org/apache/
james/mailbox/trunk/tika/src/main/java/org/apache/james/
james/mailbox/trunk/tika/src/main/java/org/apache/james/mailbox/
james/mailbox/trunk/tika/src/main/java/org/apache/james/mailbox/tika/
james/mailbox/trunk/tika/src/main/java/org/apache/james/mailbox/tika/extractor/
james/mailbox/trunk/tika/src/main/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractor.java
james/mailbox/trunk/tika/src/test/
james/mailbox/trunk/tika/src/test/java/
james/mailbox/trunk/tika/src/test/java/org/
james/mailbox/trunk/tika/src/test/java/org/apache/
james/mailbox/trunk/tika/src/test/java/org/apache/james/
james/mailbox/trunk/tika/src/test/java/org/apache/james/mailbox/
james/mailbox/trunk/tika/src/test/java/org/apache/james/mailbox/tika/
james/mailbox/trunk/tika/src/test/java/org/apache/james/mailbox/tika/extractor/
james/mailbox/trunk/tika/src/test/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractorTest.java
james/mailbox/trunk/tika/src/test/resources/
james/mailbox/trunk/tika/src/test/resources/documents/
james/mailbox/trunk/tika/src/test/resources/documents/PDF.pdf
james/mailbox/trunk/tika/src/test/resources/documents/Text.txt
james/mailbox/trunk/tika/src/test/resources/documents/calc.ods
james/mailbox/trunk/tika/src/test/resources/documents/calc.xlsx
james/mailbox/trunk/tika/src/test/resources/documents/fake.txt
james/mailbox/trunk/tika/src/test/resources/documents/slides.odp
james/mailbox/trunk/tika/src/test/resources/documents/slides.pptx
james/mailbox/trunk/tika/src/test/resources/documents/writter.docx
james/mailbox/trunk/tika/src/test/resources/documents/writter.odt
Removed:
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonUsingTika.java
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/
james/mailbox/trunk/elasticsearch/src/test/resources/documents/PDF.pdf
james/mailbox/trunk/elasticsearch/src/test/resources/documents/Text.txt
james/mailbox/trunk/elasticsearch/src/test/resources/documents/calc.ods
james/mailbox/trunk/elasticsearch/src/test/resources/documents/calc.xlsx
james/mailbox/trunk/elasticsearch/src/test/resources/documents/fake.txt
james/mailbox/trunk/elasticsearch/src/test/resources/documents/slides.odp
james/mailbox/trunk/elasticsearch/src/test/resources/documents/slides.pptx
james/mailbox/trunk/elasticsearch/src/test/resources/documents/writter.docx
james/mailbox/trunk/elasticsearch/src/test/resources/documents/writter.odt
Modified:
james/mailbox/trunk/elasticsearch/pom.xml
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/IndexableMessage.java
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJson.java
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePart.java
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartContainerBuilder.java
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/RootMimePartContainerBuilder.java
james/mailbox/trunk/elasticsearch/src/main/resources/META-INF/spring/mailbox-elasticsearch.xml
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/ElasticSearchIntegrationTest.java
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
james/mailbox/trunk/pom.xml
Modified: james/mailbox/trunk/elasticsearch/pom.xml
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/pom.xml?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/pom.xml (original)
+++ james/mailbox/trunk/elasticsearch/pom.xml Mon Jun 29 08:46:59 2015
@@ -29,7 +29,7 @@
<artifactId>apache-james-mailbox-elasticsearch</artifactId>
<description>Apache James Mailbox IMAP search implementation using
ElasticSearch</description>
- <name>Apache James :: Mailbox :: ElasticSearch</name>
+ <name>Apache James :: Mailbox :: Tika</name>
<properties>
<javax.mail.groupId>javax.mail</javax.mail.groupId>
@@ -52,6 +52,11 @@
<scope>test</scope>
</dependency>
<dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-james-mailbox-tika</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
@@ -103,16 +108,6 @@
<scope>test</scope>
</dependency>
<dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- <version>1.7</version>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
- <version>1.7</version>
- </dependency>
- <dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<version>3.0.0</version>
Modified:
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/IndexableMessage.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/IndexableMessage.java?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
---
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/IndexableMessage.java
(original)
+++
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/IndexableMessage.java
Mon Jun 29 08:46:59 2015
@@ -23,8 +23,8 @@ import com.fasterxml.jackson.annotation.
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.Multimap;
-import org.apache.james.mailbox.elasticsearch.json.extractor.TextExtractor;
import org.apache.james.mailbox.elasticsearch.query.DateResolutionFormater;
+import org.apache.james.mailbox.store.extractor.TextExtractor;
import org.apache.james.mailbox.store.mail.model.MailboxId;
import org.apache.james.mailbox.store.mail.model.Message;
import org.apache.james.mailbox.store.mail.model.Property;
Modified:
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJson.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJson.java?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
---
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJson.java
(original)
+++
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJson.java
Mon Jun 29 08:46:59 2015
@@ -26,7 +26,7 @@ import com.fasterxml.jackson.databind.Ob
import com.fasterxml.jackson.datatype.guava.GuavaModule;
import com.fasterxml.jackson.datatype.jdk8.Jdk8Module;
import com.google.common.base.Preconditions;
-import org.apache.james.mailbox.elasticsearch.json.extractor.TextExtractor;
+import org.apache.james.mailbox.store.extractor.TextExtractor;
import org.apache.james.mailbox.store.mail.model.Message;
public class MessageToElasticSearchJson {
@@ -41,6 +41,10 @@ public class MessageToElasticSearchJson
this.mapper.registerModule(new Jdk8Module());
}
+ public MessageToElasticSearchJson(TextExtractor textExtractor) {
+ this(textExtractor, ZoneId.systemDefault());
+ }
+
public String convertToJson(Message<?> message) throws
JsonProcessingException {
Preconditions.checkNotNull(message);
return mapper.writeValueAsString(IndexableMessage.from(message,
textExtractor));
Modified:
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePart.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePart.java?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
---
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePart.java
(original)
+++
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePart.java
Mon Jun 29 08:46:59 2015
@@ -26,9 +26,9 @@ import com.google.common.collect.Immutab
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import org.apache.commons.io.FilenameUtils;
-import
org.apache.james.mailbox.elasticsearch.json.extractor.DefaultTextExtractor;
-import org.apache.james.mailbox.elasticsearch.json.extractor.ParsedContent;
-import org.apache.james.mailbox.elasticsearch.json.extractor.TextExtractor;
+import org.apache.james.mailbox.store.extractor.DefaultTextExtractor;
+import org.apache.james.mailbox.store.extractor.ParsedContent;
+import org.apache.james.mailbox.store.extractor.TextExtractor;
import org.apache.james.mime4j.stream.Field;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -119,7 +119,7 @@ public class MimePart {
Optional<ParsedContent> parsedContent =
parseContent(textExtractor);
return new MimePart(
headerCollectionBuilder.build(),
- parsedContent.map(ParsedContent::getTextualContent)
+ parsedContent.map( x ->
Optional.ofNullable(x.getTextualContent()))
.orElse(Optional.empty())
,
mediaType,
@@ -128,15 +128,24 @@ public class MimePart {
fileExtension,
contentDisposition,
children,
- parsedContent.map(ParsedContent::getMetadata)
- .orElse(ImmutableMultimap.<String,
String>builder().build())
+ parsedContent
+ .map(x -> x.getMetadata()
+ .entrySet()
+ .stream()
+ .reduce(ImmutableMultimap.<String, String>builder(),
+ (builder, entry) -> builder.putAll(entry.getKey(),
entry.getValue()),
+ (builder1, builder2) ->
builder1.putAll(builder2.build())).build())
+ .orElse(ImmutableMultimap.of())
);
}
private Optional<ParsedContent> parseContent(TextExtractor
textExtractor) {
if (bodyContent.isPresent()) {
try {
- return
Optional.of(textExtractor.extractContent(bodyContent.get(),
computeContentType(), fileName));
+ return Optional.of(textExtractor.extractContent(
+ bodyContent.get(),
+ computeContentType().orElse(null),
+ fileName.orElse(null)));
} catch (Exception e) {
LOGGER.warn("Failed parsing attachment", e);
}
Modified:
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartContainerBuilder.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartContainerBuilder.java?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
---
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartContainerBuilder.java
(original)
+++
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartContainerBuilder.java
Mon Jun 29 08:46:59 2015
@@ -19,7 +19,7 @@
package org.apache.james.mailbox.elasticsearch.json;
-import org.apache.james.mailbox.elasticsearch.json.extractor.TextExtractor;
+import org.apache.james.mailbox.store.extractor.TextExtractor;
import org.apache.james.mime4j.stream.Field;
import java.io.InputStream;
Modified:
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
---
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
(original)
+++
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
Mon Jun 29 08:46:59 2015
@@ -20,7 +20,7 @@
package org.apache.james.mailbox.elasticsearch.json;
import com.google.common.base.Preconditions;
-import org.apache.james.mailbox.elasticsearch.json.extractor.TextExtractor;
+import org.apache.james.mailbox.store.extractor.TextExtractor;
import org.apache.james.mailbox.store.mail.model.MailboxId;
import org.apache.james.mailbox.store.mail.model.Message;
import org.apache.james.mime4j.MimeException;
Modified:
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/RootMimePartContainerBuilder.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/RootMimePartContainerBuilder.java?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
---
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/RootMimePartContainerBuilder.java
(original)
+++
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/RootMimePartContainerBuilder.java
Mon Jun 29 08:46:59 2015
@@ -19,7 +19,7 @@
package org.apache.james.mailbox.elasticsearch.json;
-import org.apache.james.mailbox.elasticsearch.json.extractor.TextExtractor;
+import org.apache.james.mailbox.store.extractor.TextExtractor;
import org.apache.james.mime4j.stream.Field;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Modified:
james/mailbox/trunk/elasticsearch/src/main/resources/META-INF/spring/mailbox-elasticsearch.xml
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/resources/META-INF/spring/mailbox-elasticsearch.xml?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
---
james/mailbox/trunk/elasticsearch/src/main/resources/META-INF/spring/mailbox-elasticsearch.xml
(original)
+++
james/mailbox/trunk/elasticsearch/src/main/resources/META-INF/spring/mailbox-elasticsearch.xml
Mon Jun 29 08:46:59 2015
@@ -39,7 +39,9 @@
<constructor-arg index="0" ref="elasticsearch-clientprovider-2"/>
</bean>
- <bean id="elasticsearch-json"
class="org.apache.james.mailbox.elasticsearch.json.MessageToElasticSearchJson"/>
+ <bean id="elasticsearch-json"
class="org.apache.james.mailbox.elasticsearch.json.MessageToElasticSearchJson">
+ <constructor-arg index="0" ref="text-extractor"/>
+ </bean>
<bean id="elasticsearch-searcher"
class="org.apache.james.mailbox.elasticsearch.search.ElasticSearchSearcher">
<constructor-arg index="0" ref="elasticsearch-clientprovider-2"/>
@@ -65,4 +67,6 @@
<bean id="criterion-converter"
class="org.apache.james.mailbox.elasticsearch.query.CriterionConverter"/>
+ <bean id="text-extractor"
class="org.apache.james.mailbox.tika.extractor.TikaTextExtractor"/>
+
</beans>
Modified:
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/ElasticSearchIntegrationTest.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/ElasticSearchIntegrationTest.java?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
---
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/ElasticSearchIntegrationTest.java
(original)
+++
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/ElasticSearchIntegrationTest.java
Mon Jun 29 08:46:59 2015
@@ -30,7 +30,7 @@ import org.apache.james.mailbox.acl.Simp
import org.apache.james.mailbox.acl.UnionMailboxACLResolver;
import
org.apache.james.mailbox.elasticsearch.events.ElasticSearchListeningMessageSearchIndex;
import org.apache.james.mailbox.elasticsearch.json.MessageToElasticSearchJson;
-import
org.apache.james.mailbox.elasticsearch.json.extractor.DefaultTextExtractor;
+import org.apache.james.mailbox.store.extractor.DefaultTextExtractor;
import org.apache.james.mailbox.elasticsearch.query.CriterionConverter;
import org.apache.james.mailbox.elasticsearch.query.QueryConverter;
import org.apache.james.mailbox.elasticsearch.search.ElasticSearchSearcher;
Modified:
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
---
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
(original)
+++
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
Mon Jun 29 08:46:59 2015
@@ -22,8 +22,8 @@ package org.apache.james.mailbox.elastic
import com.google.common.base.Throwables;
import org.apache.commons.io.IOUtils;
import org.apache.james.mailbox.FlagsBuilder;
-import
org.apache.james.mailbox.elasticsearch.json.extractor.DefaultTextExtractor;
-import org.apache.james.mailbox.elasticsearch.json.extractor.TikaTextExtractor;
+import org.apache.james.mailbox.tika.extractor.TikaTextExtractor;
+import org.apache.james.mailbox.store.extractor.DefaultTextExtractor;
import org.apache.james.mailbox.store.TestId;
import org.apache.james.mailbox.store.mail.model.Message;
import org.apache.james.mailbox.store.mail.model.impl.PropertyBuilder;
Modified: james/mailbox/trunk/pom.xml
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/pom.xml?rev=1688147&r1=1688146&r2=1688147&view=diff
==============================================================================
--- james/mailbox/trunk/pom.xml (original)
+++ james/mailbox/trunk/pom.xml Mon Jun 29 08:46:59 2015
@@ -60,6 +60,7 @@
<module>memory</module>
<module>store</module>
<module>spring</module>
+ <module>tika</module>
<module>tool</module>
<module>zoo-seq-provider</module>
</modules>
@@ -201,11 +202,16 @@
<artifactId>apache-james-mailbox-cassandra</artifactId>
<version>${project.version}</version>
</dependency>
- <dependency>
+ <dependency>
<groupId>org.apache.james</groupId>
<artifactId>apache-james-mailbox-elasticsearch</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-james-mailbox-tika</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>org.apache.james</groupId>
@@ -601,6 +607,30 @@
<profiles>
<profile>
+ <id>exclude-tika</id>
+ <activation>
+ <activeByDefault>false</activeByDefault>
+ </activation>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-james-mailbox-elasticsearch</artifactId>
+ <type>pom</type>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ </dependencies>
+ </profile>
+ <profile>
<id>geronimo</id>
<properties>
<javax.mail.groupId>org.apache.geronimo.javamail</javax.mail.groupId>
Added:
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractor.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractor.java?rev=1688147&view=auto
==============================================================================
---
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractor.java
(added)
+++
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractor.java
Mon Jun 29 08:46:59 2015
@@ -0,0 +1,43 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+
+package org.apache.james.mailbox.store.extractor;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+
+/**
+ * A default text extractor that is directly based on the input file provided.
+ *
+ * Requires less computation than TikaTextExtractor, but the result is not as good.
+ */
+public class DefaultTextExtractor implements TextExtractor {
+
+ @Override
+ public ParsedContent extractContent(InputStream inputStream, String
contentType, String fileName) throws Exception {
+ if(contentType != null && contentType.startsWith("text/") ) {
+ return new ParsedContent(IOUtils.toString(inputStream), new
HashMap<String, List<String>>());
+ } else {
+ return new ParsedContent(null, new HashMap<String,
List<String>>());
+ }
+ }
+}
Added:
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/ParsedContent.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/ParsedContent.java?rev=1688147&view=auto
==============================================================================
---
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/ParsedContent.java
(added)
+++
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/ParsedContent.java
Mon Jun 29 08:46:59 2015
@@ -0,0 +1,43 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+
+package org.apache.james.mailbox.store.extractor;
+
+import java.util.List;
+import java.util.Map;
+
+public class ParsedContent {
+
+ private final String textualContent;
+ private final Map<String, List<String>> metadata;
+
+ public ParsedContent(String textualContent, Map<String, List<String>>
metadata) {
+ this.textualContent = textualContent;
+ this.metadata = metadata;
+ }
+
+ public String getTextualContent() {
+ return textualContent;
+ }
+
+ public Map<String, List<String>> getMetadata() {
+ return metadata;
+ }
+
+}
Added:
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/TextExtractor.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/TextExtractor.java?rev=1688147&view=auto
==============================================================================
---
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/TextExtractor.java
(added)
+++
james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/extractor/TextExtractor.java
Mon Jun 29 08:46:59 2015
@@ -0,0 +1,28 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+
+package org.apache.james.mailbox.store.extractor;
+
+import java.io.InputStream;
+
+public interface TextExtractor {
+
+ ParsedContent extractContent(InputStream inputStream, String contentType,
String fileName) throws Exception;
+
+}
Added:
james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractorTest.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractorTest.java?rev=1688147&view=auto
==============================================================================
---
james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractorTest.java
(added)
+++
james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractorTest.java
Mon Jun 29 08:46:59 2015
@@ -0,0 +1,56 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+
+package org.apache.james.mailbox.store.extractor;
+
+import java.io.InputStream;
+
+import org.junit.Before;
+import org.junit.Test;
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class DefaultTextExtractorTest {
+ private TextExtractor textExtractor;
+
+ @Before
+ public void setUp() {
+ textExtractor = new DefaultTextExtractor();
+ }
+
+ @Test
+ public void textTest() throws Exception {
+ InputStream inputStream =
ClassLoader.getSystemResourceAsStream("documents/Text.txt");
+ assertThat(inputStream).isNotNull();
+ assertThat(textExtractor.extractContent(inputStream, "text/plain",
"Text.txt")
+ .getTextualContent())
+ .isEqualTo("This is some awesome text text.\n\n");
+ }
+
+ @Test
+ public void textMicrosoftWorldTest() throws Exception {
+ InputStream inputStream =
ClassLoader.getSystemResourceAsStream("documents/writter.docx");
+ assertThat(inputStream).isNotNull();
+ assertThat(textExtractor.extractContent(
+ inputStream,
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "writter.docx")
+ .getTextualContent())
+ .isNull();
+ }
+}
Added: james/mailbox/trunk/store/src/test/resources/documents/Text.txt
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/store/src/test/resources/documents/Text.txt?rev=1688147&view=auto
==============================================================================
--- james/mailbox/trunk/store/src/test/resources/documents/Text.txt (added)
+++ james/mailbox/trunk/store/src/test/resources/documents/Text.txt Mon Jun 29
08:46:59 2015
@@ -0,0 +1,2 @@
+This is some awesome text text.
+
Added: james/mailbox/trunk/store/src/test/resources/documents/writter.docx
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/store/src/test/resources/documents/writter.docx?rev=1688147&view=auto
==============================================================================
(empty)
Added: james/mailbox/trunk/tika/pom.xml
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/pom.xml?rev=1688147&view=auto
==============================================================================
--- james/mailbox/trunk/tika/pom.xml (added)
+++ james/mailbox/trunk/tika/pom.xml Mon Jun 29 08:46:59 2015
@@ -0,0 +1,230 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <artifactId>apache-james-mailbox</artifactId>
+ <groupId>org.apache.james</groupId>
+ <version>0.6-SNAPSHOT</version>
+
+ </parent>
+
+ <!-- Module identity: optional Tika-backed text extraction for the mailbox store. -->
+ <artifactId>apache-james-mailbox-tika</artifactId>
+ <description>Apache James Mailbox project for optional Tika dependency, to
extract attachment textual content before indexation</description>
+ <name>Apache James :: Mailbox :: Tika</name>
+
+ <dependencies>
+ <!-- Provides the TextExtractor / ParsedContent contracts implemented by this module.
+ No version is given here; presumably it is managed by the parent POM. -->
+ <dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-james-mailbox-store</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+ <!-- Tika itself: tika-core and tika-parsers are pinned to the same version; keep them in sync. -->
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>1.7</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>1.7</version>
+ </dependency>
+ <!-- Test-only libraries: scoped to test so they do not leak to consumers of this module. -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.assertj</groupId>
+ <artifactId>assertj-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <profiles>
+ <!-- remove-tika: declares no build changes and is explicitly not active by default.
+ NOTE(review): looks like a placeholder next to exclude-tika below; confirm whether
+ anything still selects this profile id. -->
+ <profile>
+ <id>remove-tika</id>
+ <activation>
+ <activeByDefault>false</activeByDefault>
+ </activation>
+ </profile>
+ <!-- exclude-tika: has no activation element, so it only applies when selected explicitly.
+ It configures maven-jar-plugin to exclude every file (**/**) from this module's jar. -->
+ <profile>
+ <id>exclude-tika</id>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <exclude>**/**</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ <!-- disable-build-for-older-jdk: activated when the running JDK is below 1.8 (range "(,1.8)").
+ Every execution listed below (jar, compile, test, sources, install, resources, site descriptor)
+ is rebound to the phase "none", the usual Maven way of keeping an execution from running,
+ so the module is effectively skipped on those JDKs. -->
+ <profile>
+ <id>disable-build-for-older-jdk</id>
+ <activation>
+ <jdk>(,1.8)</jdk>
+ </activation>
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>default-jar</id>
+ <phase>none</phase>
+ </execution>
+ <execution>
+ <id>jar</id>
+ <phase>none</phase>
+ </execution>
+ <execution>
+ <id>test-jar</id>
+ <phase>none</phase>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>default-compile</id>
+ <phase>none</phase>
+ </execution>
+ <execution>
+ <id>default-testCompile</id>
+ <phase>none</phase>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>default-test</id>
+ <phase>none</phase>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-source-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>attach-sources</id>
+ <phase>none</phase>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-install-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>default-install</id>
+ <phase>none</phase>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>default-resources</id>
+ <phase>none</phase>
+ </execution>
+ <execution>
+ <id>default-testResources</id>
+ <phase>none</phase>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-site-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>attach-descriptor</id>
+ <phase>none</phase>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ <!-- build-for-jdk-8: activated on JDK 1.8 and later (range "[1.8,)"). Sets the compiler
+ source/target level to 1.8 and configures maven-assembly-plugin with the
+ jar-with-dependencies descriptor.
+ NOTE(review): the manifest mainClass "fully.qualified.MainClass" looks like an unedited
+ placeholder; no such class is visible in this module - confirm or remove. -->
+ <profile>
+ <id>build-for-jdk-8</id>
+ <activation>
+ <jdk>[1.8,)</jdk>
+ </activation>
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifest>
+
<mainClass>fully.qualified.MainClass</mainClass>
+ </manifest>
+ </archive>
+ <descriptorRefs>
+
<descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <source>1.8</source>
+ <target>1.8</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ <!-- disable-animal-sniffer: activated on JDK 1.6 and later (range "[1.6,)"). Rebinds the
+ animal-sniffer execution "check_java_6" to the phase "none" so it does not run for this
+ module. The execution itself is not declared in this file; presumably it is inherited
+ from the parent POM - verify there. -->
+ <profile>
+ <id>disable-animal-sniffer</id>
+ <activation>
+ <jdk>[1.6,)</jdk>
+ </activation>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>animal-sniffer-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>check_java_6</id>
+ <phase>none</phase>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ </profiles>
+
+
+</project>
Added:
james/mailbox/trunk/tika/src/main/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractor.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/main/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractor.java?rev=1688147&view=auto
==============================================================================
---
james/mailbox/trunk/tika/src/main/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractor.java
(added)
+++
james/mailbox/trunk/tika/src/main/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractor.java
Mon Jun 29 08:46:59 2015
@@ -0,0 +1,98 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+
+package org.apache.james.mailbox.tika.extractor;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.ImmutableList;
+import org.apache.james.mailbox.store.extractor.ParsedContent;
+import org.apache.james.mailbox.store.extractor.TextExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+
+/**
+ * {@link TextExtractor} that delegates to Apache Tika's {@link AutoDetectParser}
+ * to obtain the body text and the metadata of a document.
+ */
+public class TikaTextExtractor implements TextExtractor {
+
+    private final Parser parser;
+
+    public TikaTextExtractor() {
+        parser = new AutoDetectParser();
+    }
+
+    /**
+     * Parses the given stream with Tika and returns its text and metadata.
+     *
+     * @param inputStream document to parse; this method does not close it itself
+     * @param contentType declared content type, passed to Tika as a hint; ignored when null
+     * @param fileName declared file name, passed to Tika as a hint; ignored when null
+     * @return the extracted body text together with every metadata name mapped to its values
+     * @throws Exception whatever the underlying parser throws while reading or parsing
+     */
+    @Override
+    public ParsedContent extractContent(InputStream inputStream, String contentType, String fileName) throws Exception {
+        Metadata metadata = createInitializedMetadata(contentType, fileName);
+
+        // The handler writes the document body into the writer while Tika fills in the metadata.
+        StringWriter stringWriter = new StringWriter();
+        BodyContentHandler bodyContentHandler = new BodyContentHandler(stringWriter);
+        parser.parse(inputStream, bodyContentHandler, metadata, new ParseContext());
+
+        return new ParsedContent(stringWriter.toString(), convertMetadataToMultimap(metadata));
+    }
+
+    /** Builds the metadata handed to the parser, setting only the hints that were actually supplied. */
+    private Metadata createInitializedMetadata(String contentType, String fileName) {
+        Metadata metadata = new Metadata();
+        if (contentType != null) {
+            metadata.set(Metadata.CONTENT_TYPE, contentType);
+        }
+        if (fileName != null) {
+            metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+        }
+        return metadata;
+    }
+
+    /**
+     * Copies the Tika metadata into a map from metadata name to an immutable list of its values.
+     *
+     * Uses a mutable reduction ({@code collect}) rather than {@code reduce}: the map is
+     * created per reduction by the supplier instead of being a shared identity value that
+     * gets mutated by the accumulator.
+     */
+    private Map<String, List<String>> convertMetadataToMultimap(Metadata metadata) {
+        return Arrays.stream(metadata.names())
+            .collect(HashMap::new,
+                (multimap, name) -> multimap.put(name, ImmutableList.copyOf(metadata.getValues(name))),
+                Map::putAll);
+    }
+
+}
Added:
james/mailbox/trunk/tika/src/test/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractorTest.java
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractorTest.java?rev=1688147&view=auto
==============================================================================
---
james/mailbox/trunk/tika/src/test/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractorTest.java
(added)
+++
james/mailbox/trunk/tika/src/test/java/org/apache/james/mailbox/tika/extractor/TikaTextExtractorTest.java
Mon Jun 29 08:46:59 2015
@@ -0,0 +1,125 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+
+package org.apache.james.mailbox.tika.extractor;
+
+import java.io.InputStream;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.apache.james.mailbox.store.extractor.TextExtractor;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Checks the text that {@link TikaTextExtractor} extracts from the sample documents
+ * stored under {@code documents/} on the test classpath.
+ *
+ * Several expected strings contain a no-break space (U+00A0) before the '!'. It is
+ * written as a unicode escape so that the expectation does not depend on the encoding
+ * used to read this source file.
+ */
+public class TikaTextExtractorTest {
+
+    private TextExtractor textExtractor;
+
+    @Before
+    public void setUp() {
+        textExtractor = new TikaTextExtractor();
+    }
+
+    @Test
+    public void textTest() throws Exception {
+        assertThat(extractText("documents/Text.txt", "text/plain", "Text.txt"))
+            .isEqualTo("This is some awesome text text.\n\n\n");
+    }
+
+    @Test
+    public void textMicrosoftWorldTest() throws Exception {
+        assertThat(extractText("documents/writter.docx",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                "writter.docx"))
+            .isEqualTo("This is an awesome document on libroffice writter\u00a0!\n");
+    }
+
+    @Test
+    public void textOdtTest() throws Exception {
+        assertThat(extractText("documents/writter.odt",
+                "application/vnd.oasis.opendocument.text",
+                "writter.odt"))
+            .isEqualTo("This is an awesome document on libroffice writter\u00a0!\n");
+    }
+
+    /** The declared content type and file name are those of an ODT document although the resource is fake.txt. */
+    @Test
+    public void documentWithBadDeclaredMetadataShouldBeWellHandled() throws Exception {
+        assertThat(extractText("documents/fake.txt",
+                "application/vnd.oasis.opendocument.text",
+                "writter.odt"))
+            .isEqualTo("This is an awesome document on libroffice writter\u00a0!\n");
+    }
+
+    @Test
+    public void slidePowerPointTest() throws Exception {
+        assertThat(extractText("documents/slides.pptx",
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                "slides.pptx"))
+            .isEqualTo("James is awesome\nIt manages attachments so well\u00a0!\n");
+    }
+
+    @Test
+    public void slideOdpTest() throws Exception {
+        assertThat(extractText("documents/slides.odp",
+                "application/vnd.oasis.opendocument.presentation",
+                "slides.odp"))
+            .isEqualTo("James is awesome\n\nIt manages attachments so well\u00a0!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n");
+    }
+
+    @Test
+    public void pdfTest() throws Exception {
+        assertThat(extractText("documents/PDF.pdf", "application/pdf", "PDF.pdf"))
+            .isEqualTo("\nThis is an awesome document on libroffice writter !\n\n\n");
+    }
+
+    @Test
+    public void odsTest() throws Exception {
+        assertThat(extractText("documents/calc.ods",
+                "application/vnd.oasis.opendocument.spreadsheet",
+                "calc.ods"))
+            .isEqualTo("\tThis is an aesome LibreOffice document !\n" +
+                "\n" +
+                "\n" +
+                "???\n" +
+                "Page \n" +
+                "??? (???)\n" +
+                "00/00/0000, 00:00:00\n" +
+                "Page / \n");
+    }
+
+    @Test
+    public void excelTest() throws Exception {
+        assertThat(extractText("documents/calc.xlsx",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                "calc.xlsx"))
+            .isEqualTo("Feuille1\n" +
+                "\tThis is an aesome LibreOffice document !\n" +
+                "\n" +
+                "&A\t\n" +
+                "\n" +
+                "Page &P\t\n" +
+                "\n" +
+                "\n");
+    }
+
+    /**
+     * Loads a classpath resource, checks that it exists, runs the extractor on it and
+     * returns the extracted text. The stream is closed once extraction is done, whether
+     * it succeeded or not.
+     *
+     * @param resourcePath classpath location of the sample document
+     * @param contentType content type declared to the extractor
+     * @param fileName file name declared to the extractor
+     * @return the textual content reported by the extractor
+     */
+    private String extractText(String resourcePath, String contentType, String fileName) throws Exception {
+        try (InputStream inputStream = ClassLoader.getSystemResourceAsStream(resourcePath)) {
+            assertThat(inputStream).isNotNull();
+            return textExtractor.extractContent(inputStream, contentType, fileName).getTextualContent();
+        }
+    }
+
+}
Added: james/mailbox/trunk/tika/src/test/resources/documents/PDF.pdf
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/resources/documents/PDF.pdf?rev=1688147&view=auto
==============================================================================
(empty)
Added: james/mailbox/trunk/tika/src/test/resources/documents/Text.txt
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/resources/documents/Text.txt?rev=1688147&view=auto
==============================================================================
--- james/mailbox/trunk/tika/src/test/resources/documents/Text.txt (added)
+++ james/mailbox/trunk/tika/src/test/resources/documents/Text.txt Mon Jun 29
08:46:59 2015
@@ -0,0 +1,2 @@
+This is some awesome text text.
+
Added: james/mailbox/trunk/tika/src/test/resources/documents/calc.ods
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/resources/documents/calc.ods?rev=1688147&view=auto
==============================================================================
(empty)
Added: james/mailbox/trunk/tika/src/test/resources/documents/calc.xlsx
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/resources/documents/calc.xlsx?rev=1688147&view=auto
==============================================================================
(empty)
Added: james/mailbox/trunk/tika/src/test/resources/documents/fake.txt
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/resources/documents/fake.txt?rev=1688147&view=auto
==============================================================================
(empty)
Added: james/mailbox/trunk/tika/src/test/resources/documents/slides.odp
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/resources/documents/slides.odp?rev=1688147&view=auto
==============================================================================
(empty)
Added: james/mailbox/trunk/tika/src/test/resources/documents/slides.pptx
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/resources/documents/slides.pptx?rev=1688147&view=auto
==============================================================================
(empty)
Added: james/mailbox/trunk/tika/src/test/resources/documents/writter.docx
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/resources/documents/writter.docx?rev=1688147&view=auto
==============================================================================
(empty)
Added: james/mailbox/trunk/tika/src/test/resources/documents/writter.odt
URL:
http://svn.apache.org/viewvc/james/mailbox/trunk/tika/src/test/resources/documents/writter.odt?rev=1688147&view=auto
==============================================================================
(empty)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]