Author: ssc
Date: Fri Nov 26 22:20:48 2010
New Revision: 1039578
URL: http://svn.apache.org/viewvc?rev=1039578&view=rev
Log:
MAHOUT-540 Integration test for the DocumentProcessor
Added:
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java
Added:
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java?rev=1039578&view=auto
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java
(added)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java
Fri Nov 26 22:20:48 2010
@@ -0,0 +1,54 @@
+package org.apache.mahout.vectorizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.StringTuple;
+import org.junit.Test;
+
+import java.util.Arrays;
+
+/**
+ * Tests tokenizing of <Text documentId, Text text> {@link SequenceFile}s by the {@link DocumentProcessor} into
+ * <Text documentId, StringTuple tokens> sequence files
+ */
+public class DocumentProcessorTest extends MahoutTestCase {
+
+  /**
+   * Writes two (documentId, text) records to a sequence file, runs
+   * DocumentProcessor.tokenizeDocuments over it and checks that the single
+   * output file contains exactly two (documentId, tokens) records, in the
+   * order the documents were written.
+   *
+   * @throws Exception if writing the input, running the tokenizer or reading the output fails
+   */
+  @Test
+  public void testTokenizeDocuments() throws Exception {
+    Configuration configuration = new Configuration();
+    FileSystem fs = FileSystem.get(configuration);
+    Path input = new Path(getTestTempDirPath(), "inputDir");
+    Path output = new Path(getTestTempDirPath(), "outputDir");
+
+    String documentId1 = "123";
+    String text1 = "A test for the document processor";
+    String documentId2 = "456";
+    String text2 = "and another one";
+
+    SequenceFile.Writer writer =
+        new SequenceFile.Writer(fs, configuration, input, Text.class, Text.class);
+    try {
+      writer.append(new Text(documentId1), new Text(text1));
+      writer.append(new Text(documentId2), new Text(text2));
+    } finally {
+      // release the file handle even when an append fails
+      writer.close();
+    }
+
+    DocumentProcessor.tokenizeDocuments(input, DefaultAnalyzer.class, output);
+
+    FileStatus[] statuses = fs.listStatus(output);
+    assertEquals(1, statuses.length);
+    Path filePath = statuses[0].getPath();
+
+    SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, configuration);
+    try {
+      // instantiate via the classes recorded in the file; asSubclass fails fast
+      // if the output was written with unexpected key/value types
+      Text key = reader.getKeyClass().asSubclass(Text.class).newInstance();
+      StringTuple value =
+          reader.getValueClass().asSubclass(StringTuple.class).newInstance();
+
+      // NOTE(review): the expected tokens omit "A", "for", "the" and "and" --
+      // presumably DefaultAnalyzer lower-cases and drops stop words; confirm there
+      assertTrue(reader.next(key, value));
+      assertEquals(documentId1, key.toString());
+      assertEquals(Arrays.asList("test", "document", "processor"), value.getEntries());
+
+      assertTrue(reader.next(key, value));
+      assertEquals(documentId2, key.toString());
+      assertEquals(Arrays.asList("another", "one"), value.getEntries());
+
+      // only two documents went in, so nothing else may come out
+      assertFalse(reader.next(key, value));
+    } finally {
+      reader.close();
+    }
+  }
+}