Author: ssc
Date: Fri Nov 26 22:20:48 2010
New Revision: 1039578

URL: http://svn.apache.org/viewvc?rev=1039578&view=rev
Log:
MAHOUT-540 Integration test for the DocumentProcessor

Added:
    mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java

Added: mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java?rev=1039578&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java Fri Nov 26 22:20:48 2010
@@ -0,0 +1,78 @@
+package org.apache.mahout.vectorizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.StringTuple;
+import org.junit.Test;
+
+import java.util.Arrays;
+
+/**
+ * Tests tokenizing of {@code <Text documentId, Text text>} {@link SequenceFile}s by the
+ * {@link DocumentProcessor} into {@code <Text documentId, StringTuple tokens>} sequence files.
+ */
+public class DocumentProcessorTest extends MahoutTestCase {
+
+  /**
+   * Writes two documents into a sequence file, runs {@link DocumentProcessor#tokenizeDocuments}
+   * on it and verifies that the single output file holds exactly the two expected token lists,
+   * in input order.
+   */
+  @Test
+  public void testTokenizeDocuments() throws Exception {
+    Configuration configuration = new Configuration();
+    FileSystem fs = FileSystem.get(configuration);
+    Path input = new Path(getTestTempDirPath(), "inputDir");
+    Path output = new Path(getTestTempDirPath(), "outputDir");
+
+    String documentId1 = "123";
+    String text1 = "A test for the document processor";
+    String documentId2 = "456";
+    String text2 = "and another one";
+
+    SequenceFile.Writer writer =
+        new SequenceFile.Writer(fs, configuration, input, Text.class, Text.class);
+    try {
+      writer.append(new Text(documentId1), new Text(text1));
+      writer.append(new Text(documentId2), new Text(text2));
+    } finally {
+      // release the file handle even when an append fails
+      writer.close();
+    }
+
+    DocumentProcessor.tokenizeDocuments(input, DefaultAnalyzer.class, output);
+
+    FileStatus[] statuses = fs.listStatus(output);
+    assertEquals(1, statuses.length);
+    Path filePath = statuses[0].getPath();
+
+    SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, configuration);
+    try {
+      // asSubclass() fails with a ClassCastException if the output file declares other types
+      Text key = reader.getKeyClass().asSubclass(Text.class).newInstance();
+      StringTuple value = reader.getValueClass().asSubclass(StringTuple.class).newInstance();
+
+      // next() returns false at end of file; asserting its result makes a missing record fail
+      // right here instead of silently comparing the previous key/value
+      assertTrue(reader.next(key, value));
+      assertEquals(documentId1, key.toString());
+      // "A", "for" and "the" are not expected in the output
+      assertEquals(Arrays.asList("test", "document", "processor"), value.getEntries());
+
+      assertTrue(reader.next(key, value));
+      assertEquals(documentId2, key.toString());
+      // "and" is not expected in the output
+      assertEquals(Arrays.asList("another", "one"), value.getEntries());
+
+      // the output must not contain anything beyond the two input documents
+      assertFalse(reader.next(key, value));
+    } finally {
+      reader.close();
+    }
+  }
+}


Reply via email to