Author: gsingers
Date: Fri Mar 25 14:39:25 2011
New Revision: 1085408
URL: http://svn.apache.org/viewvc?rev=1085408&view=rev
Log:
MAHOUT-588: partial commit, not including shell script yet
Added:
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
(with props)
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
(with props)
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
(with props)
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
(with props)
Added:
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java?rev=1085408&view=auto
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
(added)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
Fri Mar 25 14:39:25 2011
@@ -0,0 +1,171 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LengthFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
+
+/**
+ * Custom Lucene Analyzer designed for aggressive feature reduction
+ * for clustering the ASF Mail Archives using an extended set of
+ * stop words, excluding non-alpha-numeric tokens, and porter stemming.
+ */
+public class MailArchivesClusteringAnalyzer extends Analyzer {
+
+ // Extended set of stop words composed of common mail terms like "hi",
+ // HTML tags, and Java keywords, as many of the messages in the archives
+ // are Subversion check-in notifications.
+ private static final String[] STOP_WORDS = new String[] {
+
"3d","7bit","a0","about","above","abstract","across","additional","after",
+ "afterwards","again","against","align","all","almost","alone","along",
+
"already","also","although","always","am","among","amongst","amoungst",
+
"amount","an","and","another","any","anybody","anyhow","anyone","anything",
+ "anyway","anywhere","are","arial","around","as","ascii","assert","at",
+
"back","background","base64","bcc","be","became","because","become","becomes",
+
"becoming","been","before","beforehand","behind","being","below","beside",
+
"besides","between","beyond","bgcolor","blank","blockquote","body","boolean",
+
"border","both","br","break","but","by","can","cannot","cant","case","catch",
+
"cc","cellpadding","cellspacing","center","char","charset","cheers","class",
+
"co","color","colspan","com","con","const","continue","could","couldnt",
+
"cry","css","de","dear","default","did","didnt","different","div","do",
+
"does","doesnt","done","dont","double","down","due","during","each","eg",
+
"eight","either","else","elsewhere","empty","encoding","enough","enum",
+ "etc","eu","even","ever","every","everyone","everything","everywhere",
+
"except","extends","face","family","few","ffffff","final","finally","float",
+
"font","for","former","formerly","fri","from","further","get","give","go",
+
"good","got","goto","gt","h1","ha","had","has","hasnt","have","he","head",
+
"height","hello","helvetica","hence","her","here","hereafter","hereby",
+ "herein","hereupon","hers","herself","hi","him","himself","his","how",
+ "however","hr","href","html","http","https","id","ie","if","ill","im",
+
"image","img","implements","import","in","inc","instanceof","int","interface",
+
"into","is","isnt","iso-8859-1","it","its","itself","ive","just","keep",
+
"last","latter","latterly","least","left","less","li","like","long","look",
+
"lt","ltd","mail","mailto","many","margin","may","me","meanwhile","message",
+
"meta","might","mill","mine","mon","more","moreover","most","mostly","mshtml",
+
"mso","much","must","my","myself","name","namely","native","nbsp","need",
+
"neither","never","nevertheless","new","next","nine","no","nobody","none",
+
"noone","nor","not","nothing","now","nowhere","null","of","off","often",
+
"ok","on","once","only","onto","or","org","other","others","otherwise",
+
"our","ours","ourselves","out","over","own","package","pad","per","perhaps",
+
"plain","please","pm","printable","private","protected","public","put",
+
"quot","quote","r1","r2","rather","re","really","regards","reply","return",
+
"right","said","same","sans","sat","say","saying","see","seem","seemed",
+
"seeming","seems","serif","serious","several","she","short","should","show",
+
"side","since","sincere","six","sixty","size","so","solid","some","somehow",
+ "someone","something","sometime","sometimes","somewhere","span","src",
+
"static","still","strictfp","string","strong","style","stylesheet","subject",
+
"such","sun","super","sure","switch","synchronized","table","take","target",
+
"td","text","th","than","thanks","that","the","their","them","themselves",
+
"then","thence","there","thereafter","thereby","therefore","therein","thereupon",
+ "these","they","thick","thin","think","third","this","those","though",
+
"three","through","throughout","throw","throws","thru","thu","thus","tm",
+
"to","together","too","top","toward","towards","tr","transfer","transient",
+
"try","tue","type","ul","un","under","unsubscribe","until","up","upon",
+
"us","use","used","uses","using","valign","verdana","very","via","void",
+
"volatile","want","was","we","wed","weight","well","were","what","whatever",
+
"when","whence","whenever","where","whereafter","whereas","whereby","wherein",
+
"whereupon","wherever","whether","which","while","whither","who","whoever",
+ "whole","whom","whose","why","width","will","with","within","without",
+
"wont","would","wrote","www","yes","yet","you","your","yours","yourself",
+ "yourselves"
+ };
+
+ // Regex used to exclude non-alpha-numeric tokens
+ private static final Pattern alphaNumeric =
Pattern.compile("^[a-z][a-z0-9_]+$");
+ private final CharArraySet stopSet;
+
+  /**
+   * Creates an analyzer that removes the built-in extended stop word list
+   * (see {@code STOP_WORDS}) from the token stream.
+   */
+  public MailArchivesClusteringAnalyzer() {
+    stopSet = (CharArraySet) StopFilter.makeStopSet(Arrays.asList(STOP_WORDS));
+  }
+
+  /**
+   * Creates an analyzer that removes the supplied stop words from the token
+   * stream instead of the built-in list.
+   *
+   * @param stopSet stop words to filter out; the set is used as-is, not copied
+   */
+  public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
+    this.stopSet = stopSet;
+  }
+
+  /**
+   * Builds the analysis chain for a field. In order: standard tokenization,
+   * {@link StandardFilter}, lower-casing, ASCII folding, the
+   * alpha-numeric/length filter, stop word removal and Porter stemming.
+   *
+   * @param fieldName name of the field being analyzed (not used by this analyzer)
+   * @param reader    source of the text to tokenize
+   * @return the fully filtered token stream
+   */
+  @Override
+  public TokenStream tokenStream(String fieldName, java.io.Reader reader) {
+    @SuppressWarnings("deprecation")
+    final TokenStream tokens = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
+    final TokenStream standardized = new StandardFilter(tokens);
+    final TokenStream lowerCased = new LowerCaseFilter(standardized);
+    final TokenStream asciiFolded = new ASCIIFoldingFilter(lowerCased);
+    final TokenStream alphaNumericOnly = new AlphaNumericMaxLengthFilter(asciiFolded);
+    // NOTE(review): the leading 'false' is presumably StopFilter's
+    // enablePositionIncrements flag - confirm against the Lucene version in use.
+    final TokenStream withoutStopWords = new StopFilter(false, alphaNumericOnly, stopSet);
+    return new PorterStemFilter(withoutStopWords);
+  }
+
+  /**
+   * Passes through only those tokens whose length is between
+   * {@code MIN_TOKEN_LENGTH} and {@code MAX_TOKEN_LENGTH} characters
+   * (inclusive, measured before apostrophes are removed) and which, once
+   * apostrophes are stripped, match the {@code alphaNumeric} pattern and do
+   * not start with "a0". Accepted tokens are emitted without apostrophes.
+   */
+  class AlphaNumericMaxLengthFilter extends TokenFilter {
+    /** Shortest token length that is considered. */
+    private static final int MIN_TOKEN_LENGTH = 2;
+    /** Longest token length that is considered; also sizes the scratch buffer. */
+    private static final int MAX_TOKEN_LENGTH = 28;
+
+    private final TermAttribute termAtt;
+    // Scratch buffer for the apostrophe-free copy of the current term. It can
+    // never overflow because only terms of at most MAX_TOKEN_LENGTH are copied.
+    private final char[] output = new char[MAX_TOKEN_LENGTH];
+    // Reused for every token to avoid allocating a Matcher per term.
+    private final Matcher matcher;
+
+    public AlphaNumericMaxLengthFilter(TokenStream in) {
+      super(in);
+      termAtt = addAttribute(TermAttribute.class);
+      // The initial input is a placeholder; the matcher is reset for each token.
+      matcher = alphaNumeric.matcher("");
+    }
+
+    /**
+     * Advances to the next acceptable token, skipping all rejected ones.
+     *
+     * @return {@code true} if an acceptable token was found, {@code false}
+     *         once the wrapped stream is exhausted
+     * @throws IOException if the wrapped stream fails
+     */
+    @Override
+    public final boolean incrementToken() throws IOException {
+      while (input.incrementToken()) {
+        final int length = termAtt.termLength();
+        if (length < MIN_TOKEN_LENGTH || length > MAX_TOKEN_LENGTH) {
+          continue;
+        }
+
+        // Copy the term without apostrophes (e.g. "we've" becomes "weve").
+        final char[] buf = termAtt.termBuffer();
+        int at = 0;
+        for (int c = 0; c < length; c++) {
+          final char ch = buf[c];
+          if (ch != '\'') {
+            output[at++] = ch;
+          }
+        }
+
+        final String term = new String(output, 0, at);
+        matcher.reset(term);
+        if (matcher.matches() && !term.startsWith("a0")) {
+          termAtt.setTermBuffer(term);
+          return true;
+        }
+      }
+      return false;
+    }
+  }
+}
Propchange:
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java?rev=1085408&view=auto
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
(added)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
Fri Mar 25 14:39:25 2011
@@ -0,0 +1,295 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.FileLineIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+
+/**
+ * Converts a directory of gzipped mail archives into SequenceFiles of
specified chunkSize.
+ * This class is similar to {@link SequenceFilesFromDirectory} except it uses
block-compressed
+ * {@link SequenceFile}s and parses out the subject and body text of each mail
message into
+ * a separate key/value pair.
+ */
+public final class SequenceFilesFromMailArchives {
+
+ private static final Logger log =
LoggerFactory.getLogger(SequenceFilesFromMailArchives.class);
+
+ /**
+ * Creates a new {@link ChunkedWriter} for the given chunk size and output
+ * directory.
+ *
+ * @param chunkSizeInMB chunk size, in megabytes, passed to the writer
+ * @param outputDir output directory passed to the writer
+ * @return the newly constructed writer
+ * @throws IOException if the {@link ChunkedWriter} constructor fails
+ */
+ private static ChunkedWriter createNewChunkedWriter(int chunkSizeInMB,
String outputDir) throws IOException {
+ return new ChunkedWriter(chunkSizeInMB, outputDir);
+ }
+
+  /**
+   * Walks {@code parentDir} recursively, parses every file found as a mail
+   * archive and writes one key/value pair per message to chunked
+   * SequenceFiles under {@code outputDir}.
+   *
+   * @param parentDir     root directory containing the mail archives
+   * @param outputDir     directory the chunk files are written to
+   * @param prefix        prefix prepended to every generated key
+   * @param chunkSizeInMB chunk size, in megabytes, passed to the writer
+   * @param charset       character encoding of the input files
+   * @throws IOException if the output writer cannot be created or closed
+   */
+  public void createSequenceFiles(File parentDir,
+                                  String outputDir,
+                                  String prefix,
+                                  int chunkSizeInMB,
+                                  Charset charset) throws IOException {
+    ChunkedWriter writer = createNewChunkedWriter(chunkSizeInMB, outputDir);
+    PrefixAdditionFilter filter;
+    try {
+      filter = new PrefixAdditionFilter(prefix, writer, charset);
+      // The filter parses each entry as a side effect of being asked to accept
+      // it and always answers false, so the returned array is of no interest.
+      parentDir.listFiles(filter);
+    } finally {
+      // Close even when parsing fails so the output file handle is not leaked.
+      writer.close();
+    }
+
+    log.info("Parsed {} messages from {}", filter.getMessageCount(), parentDir.getAbsolutePath());
+  }
+
+  /**
+   * Writes key/value pairs to a series of block-compressed SequenceFiles
+   * named {@code chunk-0}, {@code chunk-1}, ... under an output directory.
+   * A new chunk file is started once the key/value bytes written to the
+   * current one exceed the configured chunk size.
+   */
+  public static class ChunkedWriter implements Closeable {
+    private final int maxChunkSizeInBytes;
+    private final String outputDir;
+    private SequenceFile.Writer writer;
+    private int currentChunkID;
+    // long so that adding one more record to a nearly full chunk cannot overflow
+    private long currentChunkSize;
+    private final Configuration conf = new Configuration();
+    private final FileSystem fs;
+
+    /**
+     * Opens the first chunk file ({@code chunk-0}).
+     *
+     * @param chunkSizeInMB chunk size in megabytes; values above 1984 are capped
+     * @param outputDir     directory the chunk files are created in
+     * @throws IOException if the file system or the first chunk cannot be opened
+     */
+    public ChunkedWriter(int chunkSizeInMB, String outputDir) throws IOException {
+      // Cap the size so the byte count computed below stays within int range.
+      if (chunkSizeInMB > 1984) {
+        chunkSizeInMB = 1984;
+      }
+      maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
+      this.outputDir = outputDir;
+      fs = FileSystem.get(conf);
+      currentChunkID = 0;
+      writer = createWriter(currentChunkID);
+    }
+
+    private Path getPath(int chunkID) {
+      return new Path(outputDir + "/chunk-" + chunkID);
+    }
+
+    // Opens a block-compressed Text/Text SequenceFile for the given chunk.
+    private SequenceFile.Writer createWriter(int chunkID) throws IOException {
+      return SequenceFile.createWriter(fs, conf, getPath(chunkID), Text.class, Text.class,
+          SequenceFile.CompressionType.BLOCK);
+    }
+
+    /**
+     * Appends one key/value pair, first rolling over to a new chunk file if the
+     * current one has exceeded the maximum size.
+     *
+     * @throws IOException if closing, creating or appending to a chunk fails
+     */
+    public void write(String key, String value) throws IOException {
+      if (currentChunkSize > maxChunkSizeInBytes) {
+        writer.close();
+        // Advance the ID BEFORE building the path. A post-increment here would
+        // re-create chunk-0 and overwrite everything written so far.
+        currentChunkID++;
+        log.info("Chunk size ("+currentChunkSize+") reached MAX; creating new chunk "+currentChunkID);
+        writer = createWriter(currentChunkID);
+        currentChunkSize = 0;
+      }
+
+      Text keyT = new Text(key);
+      Text valueT = new Text(value);
+      // getLength() is the number of valid bytes; getBytes() exposes the backing
+      // array, whose length may be larger than the encoded data.
+      currentChunkSize += keyT.getLength() + valueT.getLength();
+      writer.append(keyT, valueT);
+    }
+
+    @Override
+    public void close() throws IOException {
+      writer.close();
+    }
+  }
+
+ // regular expressions used to parse individual messages
+ private static final Pattern MESSAGE_START =
+ Pattern.compile("^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
+ private static final Pattern MESSAGE_ID_PREFIX =
+ Pattern.compile("^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE);
+ private static final Pattern SUBJECT_PREFIX =
+ Pattern.compile("^subject: (.*)$", Pattern.CASE_INSENSITIVE);
+
+  /**
+   * A {@link FileFilter} used for its side effects rather than for filtering:
+   * every plain file it is asked about is parsed as a mail archive and each
+   * message found is written to the shared {@link ChunkedWriter}; directories
+   * are descended into with a nested filter whose key prefix is extended by
+   * the directory name. {@link #accept(File)} always returns {@code false}.
+   */
+  public class PrefixAdditionFilter implements FileFilter {
+    private final String prefix;
+    private final ChunkedWriter writer;
+    private final Charset charset;
+    // accumulates the subject and body text of the message being parsed
+    private final StringBuilder file;
+    private int messageCount;
+
+    public PrefixAdditionFilter(String prefix, ChunkedWriter writer, Charset charset) {
+      this.prefix = prefix;
+      this.writer = writer;
+      this.charset = charset;
+      this.file = new StringBuilder();
+      this.messageCount = 0;
+    }
+
+    /**
+     * @return the number of message IDs seen so far by this filter, including
+     *         those counted by nested filters for sub-directories
+     */
+    public int getMessageCount() {
+      return messageCount;
+    }
+
+    @Override
+    public boolean accept(File current) {
+      if (current.isDirectory()) {
+        log.info("At "+current.getAbsolutePath());
+        PrefixAdditionFilter nested =
+            new PrefixAdditionFilter(prefix + File.separator + current.getName(), writer, charset);
+        current.listFiles(nested);
+        int dirCount = nested.getMessageCount();
+        log.info("Parsed "+dirCount+" messages from directory "+current.getAbsolutePath());
+        messageCount += dirCount;
+      } else {
+        parseFileLineByLine(current);
+      }
+      return false;
+    }
+
+    // Writes the buffered text under the key <prefix>/<file name>/<message id>
+    // and clears the buffer for the next message.
+    private void writeMessage(File current, String messageId) throws IOException {
+      String key = prefix + File.separator + current.getName() + File.separator + messageId;
+      writer.write(key, file.toString());
+      file.setLength(0);
+    }
+
+    // Extracts the mail subject and body text from 0 or more mail messages
+    // embedded in the supplied file using simple pattern matching. A message
+    // is only written if a Message-ID header was found for it.
+    private void parseFileLineByLine(File current) {
+      try {
+        file.setLength(0); // reset the buffer
+
+        // state used during mail message parsing
+        String messageId = null;
+        boolean inBody = false;
+        Matcher subjectMatcher = SUBJECT_PREFIX.matcher("");
+        Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher("");
+        Matcher messageBoundaryMatcher = MESSAGE_START.matcher("");
+
+        for (String nextLine : new FileLineIterable(current, charset, false)) {
+
+          // A "From ..." line starts the next message. Flush the current one if
+          // it had a message ID; otherwise discard whatever was buffered so a
+          // subject from an ID-less message cannot leak into the next message.
+          messageBoundaryMatcher.reset(nextLine);
+          if (messageBoundaryMatcher.matches()) {
+            if (messageId != null) {
+              writeMessage(current, messageId);
+            }
+            file.setLength(0);
+            messageId = null;
+            inBody = false;
+            continue;
+          }
+
+          // The subject header may come before or after the message ID. It is
+          // only looked for in the headers: a body line that happens to start
+          // with "Subject: " is appended once, as body text, below.
+          if (!inBody) {
+            subjectMatcher.reset(nextLine);
+            if (subjectMatcher.matches()) {
+              file.append(subjectMatcher.group(1)).append('\n');
+            }
+          }
+
+          if (messageId == null) {
+            // cheap length check before running the message-id regex
+            if (nextLine.length() > 14) {
+              messageIdMatcher.reset(nextLine);
+              if (messageIdMatcher.matches()) {
+                messageId = messageIdMatcher.group(1);
+                ++messageCount;
+              }
+            }
+          } else if (inBody) {
+            // body content is only collected after a message ID has been seen
+            if (nextLine.length() > 0) {
+              file.append(nextLine).append('\n');
+            }
+          } else {
+            // first empty line we see after reading the message ID
+            // indicates that we are in the body ...
+            inBody = (nextLine.length() == 0);
+          }
+        }
+
+        // write the last message in the file if available
+        if (messageId != null) {
+          writeMessage(current, messageId);
+        }
+      } catch (FileNotFoundException e) {
+        // Best effort: skip this file, but leave a trace instead of failing silently.
+        log.warn("Skipping file that could not be opened: " + current.getAbsolutePath(), e);
+      } catch (IOException e) {
+        // TODO: report exceptions and continue;
+        throw new IllegalStateException(e);
+      }
+    }
+  }
+
+  /**
+   * Command-line entry point. Recognized options: {@code --input/-i}
+   * (required), {@code --output/-o} (required), {@code --charset/-c}
+   * (required), {@code --chunkSize/-chunk} (defaults to 64),
+   * {@code --keyPrefix/-prefix} (defaults to the empty string) and
+   * {@code --help/-h}. Help is printed when requested or when the options
+   * cannot be parsed.
+   */
+  public static void main(String[] args) throws Exception {
+    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    GroupBuilder groupBuilder = new GroupBuilder();
+
+    Option inputOpt = optionBuilder
+        .withLongName("input")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("input").withMinimum(1).withMaximum(1).create())
+        .withDescription("The input dir containing the documents")
+        .withShortName("i")
+        .create();
+
+    Option outputDirOpt = optionBuilder
+        .withLongName("output")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("output").withMinimum(1).withMaximum(1).create())
+        .withDescription("The output directory")
+        .withShortName("o")
+        .create();
+
+    Option chunkSizeOpt = optionBuilder
+        .withLongName("chunkSize")
+        .withArgument(argumentBuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
+        .withDescription("The chunkSize in MegaBytes. Defaults to 64")
+        .withShortName("chunk")
+        .create();
+
+    Option keyPrefixOpt = optionBuilder
+        .withLongName("keyPrefix")
+        .withArgument(argumentBuilder.withName("keyPrefix").withMinimum(1).withMaximum(1).create())
+        .withDescription("The prefix to be prepended to the key")
+        .withShortName("prefix")
+        .create();
+
+    Option charsetOpt = optionBuilder
+        .withLongName("charset")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("charset").withMinimum(1).withMaximum(1).create())
+        .withDescription("The name of the character encoding of the input files")
+        .withShortName("c")
+        .create();
+
+    Option helpOpt = optionBuilder
+        .withLongName("help")
+        .withDescription("Print out help")
+        .withShortName("h")
+        .create();
+
+    Group options = groupBuilder
+        .withName("Options")
+        .withOption(keyPrefixOpt)
+        .withOption(chunkSizeOpt)
+        .withOption(charsetOpt)
+        .withOption(outputDirOpt)
+        .withOption(helpOpt)
+        .withOption(inputOpt)
+        .create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(options);
+      parser.setHelpOption(helpOpt);
+      CommandLine cmdLine = parser.parse(args);
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(options);
+        return;
+      }
+
+      File inputDir = new File((String) cmdLine.getValue(inputOpt));
+      String outputDir = (String) cmdLine.getValue(outputDirOpt);
+
+      // optional settings fall back to their documented defaults
+      int chunkSizeInMB = cmdLine.hasOption(chunkSizeOpt)
+          ? Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt))
+          : 64;
+      String keyPrefix = cmdLine.hasOption(keyPrefixOpt)
+          ? (String) cmdLine.getValue(keyPrefixOpt)
+          : "";
+      Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
+
+      SequenceFilesFromMailArchives converter = new SequenceFilesFromMailArchives();
+      converter.createSequenceFiles(inputDir, outputDir, keyPrefix, chunkSizeInMB, charset);
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(options);
+    }
+  }
+}
Propchange:
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java?rev=1085408&view=auto
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
(added)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
Fri Mar 25 14:39:25 2011
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.StringReader;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for the MailArchivesClusteringAnalyzer text analyzer.
+ */
+public class MailArchivesClusteringAnalyzerTest {
+
+  /**
+   * Runs a small multi-line text through the analyzer and checks that exactly
+   * the expected stemmed tokens come out, in order.
+   */
+  @Test
+  public void testAnalysis() throws Exception {
+    MailArchivesClusteringAnalyzer analyzer = new MailArchivesClusteringAnalyzer();
+
+    String text = "A test message\n";
+    text += "atokenthatistoolongtobeusefulforclustertextanalysis\n";
+    text += "Mahout is a scalable, machine-learning LIBRARY\n";
+    text += "we've added some additional stopwords such as html, mailto, regards\t";
+    text += "apache_hadoop provides the foundation for scalability\n";
+    text += "www.nabble.com [email protected]\n";
+    text += "public void int protected package";
+    StringReader reader = new StringReader(text);
+
+    // if you change the text above, then you may need to change this as well
+    // order matters too
+    String[] expectedTokens = new String[] {
+      "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
+      "stopword", "apach", "hadoop", "provid", "foundat", "scalabl"
+    };
+
+    TokenStream tokenStream = analyzer.tokenStream("test", reader);
+    assertNotNull(tokenStream);
+    TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
+
+    // Collect everything first and compare whole lists, so that a stream which
+    // yields too few (or too many) tokens fails instead of passing silently.
+    java.util.List<String> actualTokens = new java.util.ArrayList<String>();
+    while (tokenStream.incrementToken()) {
+      actualTokens.add(termAtt.term());
+    }
+    assertEquals(java.util.Arrays.asList(expectedTokens), actualTokens);
+  }
+}
Propchange:
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java?rev=1085408&view=auto
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
(added)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
Fri Mar 25 14:39:25 2011
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Reader;
+import org.apache.hadoop.io.Text;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test case for the SequenceFilesFromMailArchives command-line application.
+ */
+public class SequenceFilesFromMailArchivesTest {
+
+ // TODO: Negative tests
+
+ private File inputDir = null;
+ private File outputDir = null;
+
+  /**
+   * Prepares the fixture: creates timestamped input and output temp
+   * directories and writes the test mail messages, gzipped, to
+   * {@code subdir/mail-messages.gz} inside the input directory.
+   */
+  @Before
+  public void setupBeforeTesting() throws IOException {
+    // timestamp the folder names so existing directories are left alone
+    String baseName = "mail-archives-" + System.currentTimeMillis();
+    inputDir = createTempDir(baseName + "-in");
+    outputDir = createTempDir(baseName + "-out");
+
+    // the archive lives in a nested directory to exercise recursion
+    File nestedDir = new File(inputDir, "subdir");
+    nestedDir.mkdir();
+    File archive = new File(nestedDir, "mail-messages.gz");
+
+    GZIPOutputStream gzip = null;
+    try {
+      gzip = new GZIPOutputStream(new FileOutputStream(archive));
+      gzip.write(testMailMessages.getBytes("UTF-8"));
+      gzip.finish();
+    } finally {
+      if (gzip != null) {
+        try {
+          gzip.close();
+        } catch (Exception ignore) {
+          // nothing useful can be done if closing fails
+        }
+      }
+    }
+  }
+
+  /**
+   * Runs the command-line application end to end against the fixture and
+   * verifies that a single {@code chunk-0} SequenceFile is produced holding
+   * exactly the two expected key/value pairs.
+   */
+  @Test
+  public void testMain() throws Exception {
+    String[] cliArgs = {
+      "--input", inputDir.getAbsolutePath(),
+      "--output", outputDir.getAbsolutePath(),
+      "--charset", "UTF-8",
+      "--keyPrefix", "TEST"
+    };
+    SequenceFilesFromMailArchives.main(cliArgs);
+
+    // the application should have created one file named "chunk-0"
+    File chunkFile = new File(outputDir, "chunk-0");
+    String chunkPath = chunkFile.getAbsolutePath();
+    assertTrue("Expected chunk file "+chunkPath+" not found!", chunkFile.isFile());
+
+    Text key = new Text();
+    Text value = new Text();
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+    String keyBase = "TEST/subdir/mail-messages.gz/";
+    SequenceFile.Reader chunkReader = null;
+    try {
+      chunkReader = new SequenceFile.Reader(fs, new Path(chunkPath), conf);
+
+      assertTrue("First key/value pair not found!", chunkReader.next(key, value));
+      assertEquals(keyBase + testVars[0][0], key.toString());
+      assertEquals(testVars[0][1] + testVars[0][2], value.toString());
+
+      assertTrue("Second key/value pair not found!", chunkReader.next(key, value));
+      assertEquals(keyBase + testVars[1][0], key.toString());
+      assertEquals(testVars[1][1] + testVars[1][2], value.toString());
+
+      assertFalse("Only two key/value pairs expected!", chunkReader.next(key, value));
+    } finally {
+      if (chunkReader != null) {
+        try {
+          chunkReader.close();
+        } catch (Exception ignore) {
+          // nothing useful can be done if closing fails
+        }
+      }
+    }
+  }
+
+  /** Removes whichever temp directories the setup managed to create. */
+  @After
+  public void cleanupAfterTesting() {
+    if (inputDir != null) {
+      rmdir(inputDir);
+    }
+    if (outputDir != null) {
+      rmdir(outputDir);
+    }
+  }
+
+  // Returns the directory <java.io.tmpdir>/<dirName>, creating it if needed.
+  // Fails the test if the directory does not exist afterwards.
+  private File createTempDir(String dirName) {
+    File tempDir = new File(System.getProperty("java.io.tmpdir"), dirName);
+    if (tempDir.isDirectory()) {
+      return tempDir;
+    }
+    tempDir.mkdirs();
+    if (!tempDir.isDirectory()) {
+      fail("Failed to create temp directory "+tempDir.getAbsolutePath());
+    }
+    return tempDir;
+  }
+
+  // Recursively deletes the given file or directory; used to remove the temp
+  // directories created by this test. Deletion is best effort: failures to
+  // delete individual entries are ignored.
+  private void rmdir(File dir) {
+    if (dir.isDirectory()) {
+      // listFiles() returns null on an I/O error or if the directory vanished
+      // after the check above; treat that as "nothing to delete".
+      File[] children = dir.listFiles();
+      if (children != null) {
+        for (File child : children) {
+          // the recursive call handles plain files as well as directories
+          rmdir(child);
+        }
+      }
+    }
+    dir.delete();
+  }
+
+ // Messages extracted and anonymized from the ASF mail archives
+ private static final String[][] testVars = new String[][] {
+ new String[] {
+ "[email protected]",
+ "Ant task for JDK1.1 collections build option",
+ "\nThis is just a test message\n--\nTesty McTester\n"
+ },
+ new String[] {
+ "[email protected]",
+ "Problem with build files in several directories",
+ "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n"
+ }
+ };
+
+ private static final String testMailMessages =
+ "From [email protected] Mon Jul 24 19:13:53 2000\n"+
+ "Return-Path: <[email protected]>\n"+
+ "Mailing-List: contact [email protected]; run by ezmlm\n"+
+ "Delivered-To: mailing list [email protected]\n"+
+ "Received: (qmail 49267 invoked from network); 24 Jul 2000 19:13:53
-0000\n"+
+ "Message-ID: <"+testVars[0][0]+">\n"+
+ "From: \"Testy McTester\" <[email protected]>\n"+
+ "To: <[email protected]>\n"+
+ "Subject: "+testVars[0][1]+"\n"+
+ "Date: Mon, 24 Jul 2000 12:24:56 -0700\n"+
+ "MIME-Version: 1.0\n"+
+ "Content-Type: text/plain;\n"+
+ " charset=\"Windows-1252\"\n"+
+ "Content-Transfer-Encoding: 7bit\n"+
+ "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"+
+ testVars[0][2]+
+ "\n"+
+ "From [email protected] Wed Jul 26 11:32:16 2000\n"+
+ "Return-Path: <[email protected]>\n"+
+ "Mailing-List: contact [email protected]; run by ezmlm\n"+
+ "Delivered-To: mailing list [email protected]\n"+
+ "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16
-0000\n"+
+ "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n"+
+ "Date: Wed, 26 Jul 2000 13:32:08 +0200\n"+
+ "Subject: "+testVars[1][1]+"\n"+
+ "From: Another Test <[email protected]>\n"+
+ "To: <[email protected]>\n"+
+ "Message-Id: <"+testVars[1][0]+">\n"+
+ "Mime-Version: 1.0\n"+
+ "Content-Type: text/plain; charset=\"US-ASCII\"\n"+
+ "Content-Transfer-Encoding: 7bit\n"+
+ "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"+
+ testVars[1][2];
+}
Propchange:
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
------------------------------------------------------------------------------
svn:eol-style = native