Author: oleg
Date: Fri Jul 15 18:48:36 2011
New Revision: 1147277
URL: http://svn.apache.org/viewvc?rev=1147277&view=rev
Log:
Added the n-gram profiler and its tests; also added an option to TikaCLI.java
for language profile (lang.profile) creation, together with its test.
Added:
tika/trunk/tika-app/src/test/
tika/trunk/tika-app/src/test/java/
tika/trunk/tika-app/src/test/java/org/
tika/trunk/tika-app/src/test/java/org/apache/
tika/trunk/tika-app/src/test/java/org/apache/tika/
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
tika/trunk/tika-app/src/test/resources/
tika/trunk/tika-app/src/test/resources/test-data/
tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test
tika/trunk/tika-app/src/test/resources/test-data/welsh_corpus.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
tika/trunk/tika-core/src/test/resources/org/apache/tika/language/langbuilder/
tika/trunk/tika-core/src/test/resources/org/apache/tika/language/langbuilder/welsh_corpus.txt
Modified:
tika/trunk/tika-app/pom.xml
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-bundle-it/pom.xml
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
Modified: tika/trunk/tika-app/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/pom.xml?rev=1147277&r1=1147276&r2=1147277&view=diff
==============================================================================
--- tika/trunk/tika-app/pom.xml (original)
+++ tika/trunk/tika-app/pom.xml Fri Jul 15 18:48:36 2011
@@ -35,7 +35,13 @@
<url>http://tika.apache.org/</url>
<dependencies>
- <dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>3.8.2</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-parsers</artifactId>
<version>${project.version}</version>
@@ -53,6 +59,7 @@
<version>1.7.1</version>
<scope>provided</scope>
</dependency>
+
</dependencies>
<build>
@@ -65,6 +72,10 @@
<directory>src/main/resources-filtered</directory>
<filtering>true</filtering>
</resource>
+ <resource>
+ <directory>src/test/resources</directory>
+ <filtering>true</filtering>
+ </resource>
</resources>
<plugins>
<plugin>
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1147277&r1=1147276&r2=1147277&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Fri Jul
15 18:48:36 2011
@@ -61,6 +61,7 @@ import org.apache.tika.gui.TikaGUI;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.language.LanguageProfilerBuilder;
import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -225,6 +226,22 @@ public class TikaCLI {
writer.flush();
}
};
+
+
+    /**
+     * Output type that builds an n-gram language profile from the input
+     * stream and saves it to the file <code>profileName + ".ngp"</code>.
+     */
+    private final OutputType CREATE_PROFILE = new OutputType() {
+        /**
+         * Creates the profile via <code>LanguageProfilerBuilder.create</code>,
+         * saves it to disk and prints the location of the saved file.
+         *
+         * @param stream input handed to the profile builder
+         * @param output stream that receives the location message
+         * @throws Exception if the profile cannot be created or saved
+         */
+        @Override
+        public void process(InputStream stream, OutputStream output)
+                throws Exception {
+            File profileFile = new File(profileName + ".ngp");
+            ngp = LanguageProfilerBuilder.create(profileName, stream, encoding);
+            FileOutputStream fos = new FileOutputStream(profileFile);
+            try {
+                ngp.save(fos); // saves ngram profile
+            } finally {
+                // Release the file handle even when saving fails.
+                fos.close();
+            }
+            PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding));
+            // Report the file that was actually written (with its ".ngp"
+            // suffix) instead of a path derived from the profile's name.
+            writer.println("ngram profile location:=" + profileFile.getCanonicalPath());
+            writer.flush();
+        }
+    };
private ParseContext context;
@@ -235,6 +252,8 @@ public class TikaCLI {
private Metadata metadata;
private OutputType type = XML;
+
+ private LanguageProfilerBuilder ngp = null;
/**
* Output character encoding, or <code>null</code> for platform default
@@ -247,6 +266,8 @@ public class TikaCLI {
private boolean fork = false;
+ private String profileName = null;
+
public TikaCLI() throws Exception {
context = new ParseContext();
detector = new DefaultDetector();
@@ -313,6 +334,9 @@ public class TikaCLI {
} else if (arg.startsWith("--client=")) {
URI uri = new URI(arg.substring("--client=".length()));
parser = new NetworkParser(uri);
+ } else if(arg.startsWith("--create-profile=")){
+ profileName = arg.substring("--create-profile=".length());
+ type = CREATE_PROFILE;
} else {
pipeMode = false;
metadata = new Metadata();
@@ -367,6 +391,8 @@ public class TikaCLI {
out.println(" -eX or --encoding=X Use output encoding X");
out.println(" -z or --extract Extract all attachements into
current directory");
out.println();
+ out.println(" --create-profile=X");
+ out.println(" Create NGram profile, where X is a profile
name");
out.println(" --list-parsers");
out.println(" List the available document parsers");
out.println(" --list-parser-details");
Added: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1147277&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
(added)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Fri
Jul 15 18:48:36 2011
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.net.URI;
+import junit.framework.Assert;
+import junit.framework.TestCase;
+
+/**
+ * Exercises the command line options of {@link TikaCLI} by calling
+ * <code>TikaCLI.main</code> and inspecting what was written to
+ * <code>System.out</code>.
+ */
+public class TikaCLITest extends TestCase {
+
+    /** URI string of the test data directory; file names are appended to it. */
+    private final String testDataPrefix =
+            new File("src/test/resources/test-data/").toURI().toString();
+
+    /** File checked by the create-profile test and removed in tearDown. */
+    private File profile = null;
+
+    /** Buffer that collects everything printed to System.out during a test. */
+    private ByteArrayOutputStream outContent = null;
+
+    /** The System.out that was active before the test, restored in tearDown. */
+    private PrintStream stdout = null;
+
+    /**
+     * Redirects System.out into an in-memory buffer.
+     */
+    public void setUp() throws Exception {
+        profile = new File("welsh.ngp");
+        outContent = new ByteArrayOutputStream();
+        stdout = System.out;
+        PrintStream captured = new PrintStream(outContent);
+        System.setOut(captured);
+    }
+
+    /**
+     * Runs the CLI with the given arguments and returns the captured output.
+     *
+     * @param args command line arguments passed to <code>TikaCLI.main</code>
+     * @return the text collected in the output buffer so far
+     * @throws Exception if the CLI throws
+     */
+    private String captureOutputOf(String... args) throws Exception {
+        TikaCLI.main(args);
+        return outContent.toString();
+    }
+
+    /**
+     * Runs --create-profile=welsh on the welsh corpus and expects the
+     * profile file to exist afterwards.
+     *
+     * @throws Exception
+     */
+    public void testCreateProfile() throws Exception {
+        captureOutputOf("--create-profile=welsh", "-eUTF-8",
+                testDataPrefix + "welsh_corpus.txt");
+        assertTrue(profile.exists());
+    }
+
+    /**
+     * Runs --list-parser-detail and looks for a known media type.
+     *
+     * @throws Exception
+     */
+    public void testListParserDetail() throws Exception {
+        String printed = captureOutputOf("--list-parser-detail");
+        assertTrue(printed.contains("application/vnd.oasis.opendocument.text-web"));
+    }
+
+    /**
+     * Runs --list-parser and checks the amount of captured output.
+     *
+     * @throws Exception
+     */
+    public void testListParsers() throws Exception {
+        TikaCLI.main(new String[] {"--list-parser"});
+        assertTrue(outContent != null && outContent.size() >= 1447);
+    }
+
+    /**
+     * Runs -x on the sample document and looks for the XML declaration.
+     *
+     * @throws Exception
+     */
+    public void testXMLOutput() throws Exception {
+        String printed = captureOutputOf("-x", testDataPrefix + "alice.cli.test");
+        assertTrue(printed.contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
+    }
+
+    /**
+     * Runs -h on the sample document and looks for the XHTML root element.
+     *
+     * @throws Exception
+     */
+    public void testHTMLOutput() throws Exception {
+        String printed = captureOutputOf("-h", testDataPrefix + "alice.cli.test");
+        assertTrue(printed.contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
+    }
+
+    /**
+     * Runs -t on the sample document and looks for a phrase from its text.
+     *
+     * @throws Exception
+     */
+    public void testTextOutput() throws Exception {
+        String printed = captureOutputOf("-t", testDataPrefix + "alice.cli.test");
+        assertTrue(printed.contains("finished off the cake"));
+    }
+
+    /**
+     * Runs -m on the sample document and looks for its media type.
+     *
+     * @throws Exception
+     */
+    public void testMetadataOutput() throws Exception {
+        String printed = captureOutputOf("-m", testDataPrefix + "alice.cli.test");
+        assertTrue(printed.contains("text/plain"));
+    }
+
+    /**
+     * Runs -l on the sample document and looks for "en" in the output.
+     *
+     * @throws Exception
+     */
+    public void testLanguageOutput() throws Exception {
+        String printed = captureOutputOf("-l", testDataPrefix + "alice.cli.test");
+        assertTrue(printed.contains("en"));
+    }
+
+    /**
+     * Runs -d on the sample document and looks for its media type.
+     *
+     * @throws Exception
+     */
+    public void testDetectOutput() throws Exception {
+        String printed = captureOutputOf("-d", testDataPrefix + "alice.cli.test");
+        assertTrue(printed.contains("text/plain"));
+    }
+
+    /**
+     * Runs --list-met-models with the sample document and looks for
+     * "text/plain" in the output.
+     *
+     * @throws Exception
+     */
+    public void testListMetModels() throws Exception {
+        String printed = captureOutputOf("--list-met-models",
+                testDataPrefix + "alice.cli.test");
+        assertTrue(printed.contains("text/plain"));
+    }
+
+    /**
+     * Runs --list-supported-types with the sample document and looks for
+     * a supertype line.
+     *
+     * @throws Exception
+     */
+    public void testListSupportedTypes() throws Exception {
+        String printed = captureOutputOf("--list-supported-types",
+                testDataPrefix + "alice.cli.test");
+        assertTrue(printed.contains("supertype: application/octet-stream"));
+    }
+
+    /**
+     * Deletes the generated profile, if any, and restores the original
+     * System.out.
+     */
+    public void tearDown() throws Exception {
+        if (profile != null && profile.exists()) {
+            profile.delete();
+        }
+        System.setOut(stdout);
+    }
+}
Added: tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test?rev=1147277&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test (added)
+++ tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test Fri Jul 15
18:48:36 2011
@@ -0,0 +1,26 @@
+What a curious feeling!' said Alice; I must be shutting up like a telescope.'
+And so it was indeed: she was now only ten inches high, and her face
brightened up at the thought that
+she was now the right size for going through the little door into that lovely
garden. First, however,
+she waited for a few minutes to see if she was going to shrink any further:
she felt a little nervous about this;
+for it might end, you know,' said Alice to herself, `in my going out
altogether, like a candle. I wonder what I
+should be like then?' And she tried to fancy what the flame of a candle is
like after the candle is blown out,
+for she could not remember ever having seen such a thing.
+After a while, finding that nothing more happened, she decided on going into
the garden at once; but, alas for poor Alice!
+when she got to the door, she found she had forgotten the little golden key,
and when she went back to the table for it,
+she found she could not possibly reach it: she could see it quite plainly
through the glass, and she tried her best to
+climb up one of the legs of the table, but it was too slippery; and when she
had tired herself out with trying, the
+poor little thing sat down and cried.
+
+Come, there's no use in crying like that!' said Alice to herself, rather
sharply; I advise you to leave off this minute!'
+She generally gave herself very good advice, (though she very seldom followed
it), and sometimes she scolded herself so
+severely as to bring tears into her eyes; and once she remembered trying to
box her own ears for having cheated herself in a
+game of croquet she was playing against herself, for this curious child was
very fond of pretending to be two people.
+But it's no use now,' thought poor Alice, to pretend to be two people! Why,
there's hardly enough of me left to make one respectable person!'
+Soon her eye fell on a little glass box that was lying under the table: she
opened it, and found in it a very small cake, on which the words
+EAT ME' were beautifully marked in currants. `Well, I'll eat it,' said Alice,
`and if it makes me grow larger, I can reach the key; and
+if it makes me grow smaller, I can creep under the door; so either way I'll
get into the garden, and I don't care which happens!'
+She ate a little bit, and said anxiously to herself, `Which way? Which way?',
holding her hand on the top of her head to
+feel which way it was growing, and she was quite surprised to find that she
remained the same size: to be sure, this generally
+happens when one eats cake, but Alice had got so much into the way of
expecting nothing but out-of-the-way things to happen,
+that it seemed quite dull and stupid for life to go on in the common way.
+So she set to work, and very soon finished off the cake.
\ No newline at end of file