Author: oleg
Date: Fri Jul 15 18:48:36 2011
New Revision: 1147277

URL: http://svn.apache.org/viewvc?rev=1147277&view=rev
Log:
added ngram profiler and its tests, also added an option to the TikaCLI.java 
for lang.profile creation and its test

Added:
    tika/trunk/tika-app/src/test/
    tika/trunk/tika-app/src/test/java/
    tika/trunk/tika-app/src/test/java/org/
    tika/trunk/tika-app/src/test/java/org/apache/
    tika/trunk/tika-app/src/test/java/org/apache/tika/
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
    tika/trunk/tika-app/src/test/resources/
    tika/trunk/tika-app/src/test/resources/test-data/
    tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test
    tika/trunk/tika-app/src/test/resources/test-data/welsh_corpus.txt
    
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
    
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
    
tika/trunk/tika-core/src/test/resources/org/apache/tika/language/langbuilder/
    
tika/trunk/tika-core/src/test/resources/org/apache/tika/language/langbuilder/welsh_corpus.txt
Modified:
    tika/trunk/tika-app/pom.xml
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-bundle-it/pom.xml
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java

Modified: tika/trunk/tika-app/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/pom.xml?rev=1147277&r1=1147276&r2=1147277&view=diff
==============================================================================
--- tika/trunk/tika-app/pom.xml (original)
+++ tika/trunk/tika-app/pom.xml Fri Jul 15 18:48:36 2011
@@ -35,7 +35,13 @@
   <url>http://tika.apache.org/</url>
 
   <dependencies>
-    <dependency>
+       <dependency>
+               <groupId>junit</groupId>
+               <artifactId>junit</artifactId>
+               <version>3.8.2</version>
+               <scope>test</scope>
+       </dependency>
+       <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parsers</artifactId>
       <version>${project.version}</version>
@@ -53,6 +59,7 @@
       <version>1.7.1</version>
       <scope>provided</scope>
     </dependency>
+ 
   </dependencies>
 
   <build>
@@ -65,6 +72,10 @@
         <directory>src/main/resources-filtered</directory>
         <filtering>true</filtering>
       </resource>
+      <resource>
+        <directory>src/test/resources</directory>
+        <filtering>true</filtering>
+      </resource>
     </resources>
     <plugins>
       <plugin>

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1147277&r1=1147276&r2=1147277&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Fri Jul 
15 18:48:36 2011
@@ -61,6 +61,7 @@ import org.apache.tika.gui.TikaGUI;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.language.LanguageProfilerBuilder;
 import org.apache.tika.language.ProfilingHandler;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -225,6 +226,22 @@ public class TikaCLI {
             writer.flush();
         }
     };
+    
+    
+    /* Creates ngram profile */
+    private final OutputType CREATE_PROFILE = new OutputType() {
+        @Override
+        public void process(InputStream stream, OutputStream output)
+                throws Exception {
+            ngp = LanguageProfilerBuilder.create(profileName, stream, 
encoding);
+            FileOutputStream fos = new FileOutputStream(new File(profileName + 
".ngp"));
+            ngp.save(fos);//saves ngram profile
+            fos.close();
+            PrintWriter writer = new PrintWriter(getOutputWriter(output, 
encoding));
+            writer.println("ngram profile location:=" + new 
File(ngp.getName()).getCanonicalPath());
+            writer.flush();
+        }
+    };
 
     private ParseContext context;
     
@@ -235,6 +252,8 @@ public class TikaCLI {
     private Metadata metadata;
 
     private OutputType type = XML;
+    
+    private LanguageProfilerBuilder ngp = null;
 
     /**
      * Output character encoding, or <code>null</code> for platform default
@@ -247,6 +266,8 @@ public class TikaCLI {
 
     private boolean fork = false;
 
+    private String profileName = null;
+    
     public TikaCLI() throws Exception {
         context = new ParseContext();
         detector = new DefaultDetector();
@@ -313,6 +334,9 @@ public class TikaCLI {
         } else if (arg.startsWith("--client=")) {
             URI uri = new URI(arg.substring("--client=".length()));
             parser = new NetworkParser(uri);
+        } else if(arg.startsWith("--create-profile=")){
+            profileName = arg.substring("--create-profile=".length());
+            type = CREATE_PROFILE;
         } else {
             pipeMode = false;
             metadata = new Metadata();
@@ -367,6 +391,8 @@ public class TikaCLI {
         out.println("    -eX or --encoding=X  Use output encoding X");
         out.println("    -z  or --extract     Extract all attachements into 
current directory");        
         out.println();
+        out.println("    --create-profile=X");
+        out.println("         Create NGram profile, where X is a profile 
name");
         out.println("    --list-parsers");
         out.println("         List the available document parsers");
         out.println("    --list-parser-details");

Added: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1147277&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java 
(added)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Fri 
Jul 15 18:48:36 2011
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.net.URI;
+import junit.framework.Assert;
+import junit.framework.TestCase;
+
+/**
+ * Tests the Tika's cli
+ */
+public class TikaCLITest extends TestCase{
+    /* Test members */
+       private File profile = null;
+       private ByteArrayOutputStream outContent = null;
+       private PrintStream stdout = null;
+       private PrintStream reassign = null;
+       private URI testDataURI = new 
File("src/test/resources/test-data/").toURI();
+       private String resorcePrefix = testDataURI.toString();
+       
+       
+       public void setUp() throws Exception {
+               profile = new File("welsh.ngp");
+               outContent = new ByteArrayOutputStream();
+               stdout = System.out;
+               reassign = new PrintStream(outContent);
+               System.setOut(reassign);
+       }
+       
+       
+       /**
+        * Creates a welsh language profile
+        * 
+        * @throws Exception
+        */
+       public void testCreateProfile() throws Exception {
+               String[] params = {"--create-profile=welsh", "-eUTF-8", 
resorcePrefix + "welsh_corpus.txt"};
+               TikaCLI.main(params);
+               Assert.assertTrue(profile.exists());
+       }
+       
+       /**
+        * Tests --list-parser-detail option of the cli
+        * 
+        * @throws Exception
+        */
+       public void testListParserDetail() throws Exception{
+               String[] params = {"--list-parser-detail"};
+               TikaCLI.main(params);
+               
Assert.assertTrue(outContent.toString().contains("application/vnd.oasis.opendocument.text-web"));
+       }
+       
+       /**
+        * Tests --list-parser option of the cli
+        * 
+        * @throws Exception
+        */
+       public void testListParsers() throws Exception{
+               String[] params = {"--list-parser"};
+               TikaCLI.main(params);
+               Assert.assertTrue(outContent != null && outContent.size() >= 
1447);
+       }
+       
+       /**
+        * Tests -x option of the cli
+        * 
+        * @throws Exception
+        */
+       public void testXMLOutput() throws Exception{
+               String[] params = {"-x", resorcePrefix + "alice.cli.test"};
+               TikaCLI.main(params);
+               Assert.assertTrue(outContent.toString().contains("?xml 
version=\"1.0\" encoding=\"UTF-8\"?"));
+       }
+       
+       /**
+        * Tests a -h option of the cli
+        * 
+        * @throws Exception
+        */
+       public void testHTMLOutput() throws Exception{
+               String[] params = {"-h", resorcePrefix + "alice.cli.test"};
+               TikaCLI.main(params);
+               Assert.assertTrue(outContent.toString().contains("html 
xmlns=\"http://www.w3.org/1999/xhtml"));
+       }
+       
+       /**
+        * Tests -t option of the cli
+        * 
+        * @throws Exception
+        */
+       public void testTextOutput() throws Exception{
+               String[] params = {"-t", resorcePrefix + "alice.cli.test"};
+               TikaCLI.main(params);
+               Assert.assertTrue(outContent.toString().contains("finished off 
the cake"));
+       }
+       
+       /**
+        * Tests -m option of the cli
+        * @throws Exception
+        */
+       public void testMetadataOutput() throws Exception{
+               String[] params = {"-m", resorcePrefix + "alice.cli.test"};
+               TikaCLI.main(params);
+               Assert.assertTrue(outContent.toString().contains("text/plain"));
+       }
+       
+       /**
+        * Tests -l option of the cli
+        * 
+        * @throws Exception
+        */
+       public void testLanguageOutput() throws Exception{
+               String[] params = {"-l", resorcePrefix + "alice.cli.test"};
+               TikaCLI.main(params);
+               Assert.assertTrue(outContent.toString().contains("en"));
+       }
+       
+       /**
+        * Tests -d option of the cli
+        * 
+        * @throws Exception
+        */
+       public void testDetectOutput() throws Exception{
+               String[] params = {"-d", resorcePrefix + "alice.cli.test"};
+               TikaCLI.main(params);
+               Assert.assertTrue(outContent.toString().contains("text/plain"));
+       }
+       
+       /**
+        * Tests --list-met-models option of the cli
+        * 
+        * @throws Exception
+        */
+       public void testListMetModels() throws Exception{
+               String[] params = {"--list-met-models", resorcePrefix + 
"alice.cli.test"};
+               TikaCLI.main(params);
+               Assert.assertTrue(outContent.toString().contains("text/plain"));
+       }
+       
+       /**
+        * Tests --list-supported-types option of the cli
+        * 
+        * @throws Exception
+        */
+       public void testListSupportedTypes() throws Exception{
+               String[] params = {"--list-supported-types", resorcePrefix + 
"alice.cli.test"};
+               TikaCLI.main(params);
+               Assert.assertTrue(outContent.toString().contains("supertype: 
application/octet-stream"));
+       }
+       
+       /**
+        * Tears down the test. Returns the System.out
+        */
+       public void tearDown() throws Exception {
+               if(profile != null && profile.exists())
+                       profile.delete();
+               System.setOut(stdout);
+       }
+}

Added: tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test?rev=1147277&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test (added)
+++ tika/trunk/tika-app/src/test/resources/test-data/alice.cli.test Fri Jul 15 
18:48:36 2011
@@ -0,0 +1,26 @@
+What a curious feeling!' said Alice; I must be shutting up like a telescope.'
+And so it was indeed: she was now only ten inches high, and her face 
brightened up at the thought that 
+she was now the right size for going through the little door into that lovely 
garden. First, however, 
+she waited for a few minutes to see if she was going to shrink any further: 
she felt a little nervous about this;
+for it might end, you know,' said Alice to herself, `in my going out 
altogether, like a candle. I wonder what I 
+should be like then?' And she tried to fancy what the flame of a candle is 
like after the candle is blown out, 
+for she could not remember ever having seen such a thing.
+After a while, finding that nothing more happened, she decided on going into 
the garden at once; but, alas for poor Alice! 
+when she got to the door, she found she had forgotten the little golden key, 
and when she went back to the table for it, 
+she found she could not possibly reach it: she could see it quite plainly 
through the glass, and she tried her best to 
+climb up one of the legs of the table, but it was too slippery; and when she 
had tired herself out with trying, the 
+poor little thing sat down and cried.
+
+Come, there's no use in crying like that!' said Alice to herself, rather 
sharply; I advise you to leave off this minute!' 
+She generally gave herself very good advice, (though she very seldom followed 
it), and sometimes she scolded herself so 
+severely as to bring tears into her eyes; and once she remembered trying to 
box her own ears for having cheated herself in a 
+game of croquet she was playing against herself, for this curious child was 
very fond of pretending to be two people. 
+But it's no use now,' thought poor Alice, to pretend to be two people! Why, 
there's hardly enough of me left to make one respectable person!'
+Soon her eye fell on a little glass box that was lying under the table: she 
opened it, and found in it a very small cake, on which the words 
+EAT ME' were beautifully marked in currants. `Well, I'll eat it,' said Alice, 
`and if it makes me grow larger, I can reach the key; and 
+if it makes me grow smaller, I can creep under the door; so either way I'll 
get into the garden, and I don't care which happens!'
+She ate a little bit, and said anxiously to herself, `Which way? Which way?', 
holding her hand on the top of her head to 
+feel which way it was growing, and she was quite surprised to find that she 
remained the same size: to be sure, this generally 
+happens when one eats cake, but Alice had got so much into the way of 
expecting nothing but out-of-the-way things to happen, 
+that it seemed quite dull and stupid for life to go on in the common way.
+So she set to work, and very soon finished off the cake. 
\ No newline at end of file


Reply via email to