svn commit: r1085408 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/text/ test/java/org/apache/mahout/text/

gsingers Fri, 25 Mar 2011 07:39:50 -0700

Author: gsingers
Date: Fri Mar 25 14:39:25 2011
New Revision: 1085408

URL: http://svn.apache.org/viewvc?rev=1085408&view=rev
Log:
MAHOUT-588: partial commit, not including shell script yet


Added:
    
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
   (with props)
    
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
   (with props)
    
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
   (with props)
    
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
   (with props)

Added: 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java?rev=1085408&view=auto
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
 (added)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
 Fri Mar 25 14:39:25 2011
@@ -0,0 +1,171 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LengthFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
+
+/**
+ * Custom Lucene Analyzer designed for aggressive feature reduction
+ * for clustering the ASF Mail Archives using an extended set of
+ * stop words, excluding non-alpha-numeric tokens, and porter stemming.
+ */
+public class MailArchivesClusteringAnalyzer extends Analyzer {
+  
+  // extended set of stop words composed of common mail terms like "hi",
+  // HTML tags, and Java keywords, as many of the messages in the archives
+  // are subversion check-in notifications
+       private static final String[] STOP_WORDS = new String[] {
+         
"3d","7bit","a0","about","above","abstract","across","additional","after",
+         "afterwards","again","against","align","all","almost","alone","along",
+         
"already","also","although","always","am","among","amongst","amoungst",
+         
"amount","an","and","another","any","anybody","anyhow","anyone","anything",
+         "anyway","anywhere","are","arial","around","as","ascii","assert","at",
+         
"back","background","base64","bcc","be","became","because","become","becomes",
+         
"becoming","been","before","beforehand","behind","being","below","beside",
+         
"besides","between","beyond","bgcolor","blank","blockquote","body","boolean",
+         
"border","both","br","break","but","by","can","cannot","cant","case","catch",
+         
"cc","cellpadding","cellspacing","center","char","charset","cheers","class",
+         
"co","color","colspan","com","con","const","continue","could","couldnt",
+         
"cry","css","de","dear","default","did","didnt","different","div","do",
+         
"does","doesnt","done","dont","double","down","due","during","each","eg",
+         
"eight","either","else","elsewhere","empty","encoding","enough","enum",
+         "etc","eu","even","ever","every","everyone","everything","everywhere",
+         
"except","extends","face","family","few","ffffff","final","finally","float",
+         
"font","for","former","formerly","fri","from","further","get","give","go",
+         
"good","got","goto","gt","h1","ha","had","has","hasnt","have","he","head",
+         
"height","hello","helvetica","hence","her","here","hereafter","hereby",
+         "herein","hereupon","hers","herself","hi","him","himself","his","how",
+         "however","hr","href","html","http","https","id","ie","if","ill","im",
+         
"image","img","implements","import","in","inc","instanceof","int","interface",
+         
"into","is","isnt","iso-8859-1","it","its","itself","ive","just","keep",
+         
"last","latter","latterly","least","left","less","li","like","long","look",
+         
"lt","ltd","mail","mailto","many","margin","may","me","meanwhile","message",
+         
"meta","might","mill","mine","mon","more","moreover","most","mostly","mshtml",
+         
"mso","much","must","my","myself","name","namely","native","nbsp","need",
+         
"neither","never","nevertheless","new","next","nine","no","nobody","none",
+         
"noone","nor","not","nothing","now","nowhere","null","of","off","often",
+         
"ok","on","once","only","onto","or","org","other","others","otherwise",
+         
"our","ours","ourselves","out","over","own","package","pad","per","perhaps",
+         
"plain","please","pm","printable","private","protected","public","put",
+         
"quot","quote","r1","r2","rather","re","really","regards","reply","return",
+         
"right","said","same","sans","sat","say","saying","see","seem","seemed",
+         
"seeming","seems","serif","serious","several","she","short","should","show",
+         
"side","since","sincere","six","sixty","size","so","solid","some","somehow",
+         "someone","something","sometime","sometimes","somewhere","span","src",
+         
"static","still","strictfp","string","strong","style","stylesheet","subject",
+         
"such","sun","super","sure","switch","synchronized","table","take","target",
+         
"td","text","th","than","thanks","that","the","their","them","themselves",
+         
"then","thence","there","thereafter","thereby","therefore","therein","thereupon",
+         "these","they","thick","thin","think","third","this","those","though",
+         
"three","through","throughout","throw","throws","thru","thu","thus","tm",
+         
"to","together","too","top","toward","towards","tr","transfer","transient",
+         
"try","tue","type","ul","un","under","unsubscribe","until","up","upon",
+         
"us","use","used","uses","using","valign","verdana","very","via","void",
+         
"volatile","want","was","we","wed","weight","well","were","what","whatever",
+         
"when","whence","whenever","where","whereafter","whereas","whereby","wherein",
+         
"whereupon","wherever","whether","which","while","whither","who","whoever",
+         "whole","whom","whose","why","width","will","with","within","without",
+         
"wont","would","wrote","www","yes","yet","you","your","yours","yourself",
+         "yourselves"
+       };
+
+       // Regex used to exclude non-alpha-numeric tokens
+  private static final Pattern alphaNumeric = 
Pattern.compile("^[a-z][a-z0-9_]+$");
+  private final CharArraySet stopSet;
+
+       /**
+        * Creates an analyzer that removes the built-in extended stop word
+        * list ({@code STOP_WORDS}).
+        */
+       public MailArchivesClusteringAnalyzer() {
+               stopSet = (CharArraySet)StopFilter.makeStopSet(Arrays.asList(STOP_WORDS));
+       }
+
+       /**
+        * Creates an analyzer that removes a caller-supplied set of stop words.
+        *
+        * @param stopSet the stop words handed to the {@link StopFilter} stage
+        */
+       public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
+               this.stopSet = stopSet;
+       }
+
+       /**
+        * Builds the analysis chain: standard tokenization and filtering,
+        * lower-casing, ASCII folding, the alpha-numeric/length filter,
+        * stop word removal and finally Porter stemming.
+        */
+       @Override
+       public TokenStream tokenStream(String fieldName, java.io.Reader reader) {
+               @SuppressWarnings("deprecation")
+               TokenStream tokenized = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
+               TokenStream normalized =
+                   new ASCIIFoldingFilter(new LowerCaseFilter(new StandardFilter(tokenized)));
+               TokenStream withoutStopWords =
+                   new StopFilter(false, new AlphaNumericMaxLengthFilter(normalized), stopSet);
+               return new PorterStemFilter(withoutStopWords);
+       }
+
+  /**
+   * Passes through only tokens that are between 2 and 28 chars long and that,
+   * once any apostrophes have been removed, match the alpha-numeric pattern
+   * (a lower-case letter followed by one or more lower-case letters, digits
+   * or underscores). Tokens starting with "a0" are dropped as well.
+   */
+       class AlphaNumericMaxLengthFilter extends TokenFilter {
+         // inclusive bounds on the raw token length, checked before apostrophes are stripped
+         private static final int MIN_TOKEN_LENGTH = 2;
+         private static final int MAX_TOKEN_LENGTH = 28;
+
+         private final TermAttribute termAtt;
+         // scratch buffer reused for every token; sized for the longest accepted token
+         private final char[] output = new char[MAX_TOKEN_LENGTH];
+         private final Matcher matcher;
+
+         public AlphaNumericMaxLengthFilter(TokenStream in) {
+           super(in);
+           termAtt = addAttribute(TermAttribute.class);
+           // placeholder input only; the matcher is reset with the real term before each use
+           matcher = alphaNumeric.matcher("foo");
+         }
+
+         /**
+          * Advances to the next upstream token that passes the length and
+          * pattern checks and replaces its term text with the apostrophe-free form.
+          *
+          * @return true if such a token was found, false once the input is exhausted
+          */
+         @Override
+         public final boolean incrementToken() throws IOException {
+           while (input.incrementToken()) {
+             final int length = termAtt.termLength();
+             if (length >= MIN_TOKEN_LENGTH && length <= MAX_TOKEN_LENGTH) {
+               final char[] buf = termAtt.termBuffer();
+               // copy the term into the scratch buffer, skipping apostrophes
+               int at = 0;
+               for (int c = 0; c < length; c++) {
+                 final char ch = buf[c];
+                 if (ch != '\'') {
+                   output[at++] = ch;
+                 }
+               }
+               final String term = new String(output, 0, at);
+               matcher.reset(term);
+               if (matcher.matches() && !term.startsWith("a0")) {
+                 termAtt.setTermBuffer(term);
+                 return true;
+               }
+             }
+           }
+           return false;
+         }
+       }
+}

Propchange: 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java?rev=1085408&view=auto
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
 (added)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
 Fri Mar 25 14:39:25 2011
@@ -0,0 +1,295 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.FileLineIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+
+/**
+ * Converts a directory of gzipped mail archives into SequenceFiles of 
specified chunkSize.
+ * This class is similar to {@link SequenceFilesFromDirectory} except it uses 
block-compressed
+ * {@link SequenceFile}s and parses out the subject and body text of each mail 
message into
+ * a separate key/value pair.
+ */
+public final class SequenceFilesFromMailArchives {
+
+  private static final Logger log = 
LoggerFactory.getLogger(SequenceFilesFromMailArchives.class);
+  
+  /**
+   * Opens a {@link ChunkedWriter} for the given output directory.
+   *
+   * @param chunkSizeInMB maximum chunk size in megabytes
+   * @param outputDir directory the chunk files are written to
+   */
+  private static ChunkedWriter createNewChunkedWriter(int chunkSizeInMB, String outputDir) throws IOException {
+    ChunkedWriter chunkedWriter = new ChunkedWriter(chunkSizeInMB, outputDir);
+    return chunkedWriter;
+  }
+  
+  /**
+   * Parses every file found under {@code parentDir} (recursing into
+   * sub-directories) and writes the extracted messages to chunked
+   * SequenceFiles in {@code outputDir}.
+   *
+   * @param parentDir root directory of the mail archives
+   * @param outputDir directory the chunk files are written to
+   * @param prefix prefix prepended to every key
+   * @param chunkSizeInMB maximum chunk size in megabytes
+   * @param charset character encoding used to read the input files
+   * @throws IOException if the writer cannot be created or closed
+   */
+  public void createSequenceFiles(File parentDir,
+                                  String outputDir,
+                                  String prefix,
+                                  int chunkSizeInMB,
+                                  Charset charset) throws IOException {
+    ChunkedWriter writer = createNewChunkedWriter(chunkSizeInMB, outputDir);
+    PrefixAdditionFilter filter = new PrefixAdditionFilter(prefix, writer, charset);
+    try {
+      // the filter parses each entry as a side effect of being asked about it
+      parentDir.listFiles(filter);
+    } finally {
+      // release the underlying SequenceFile writer even when parsing fails
+      writer.close();
+    }
+    
+    log.info("Parsed "+filter.getMessageCount()+" messages from "+parentDir.getAbsolutePath());
+  }
+  
+  /**
+   * Writes key/value pairs to a series of block-compressed
+   * {@link SequenceFile}s named {@code chunk-N} under the output directory,
+   * rolling over to the next chunk once the bytes appended to the current
+   * one exceed the configured maximum.
+   */
+  public static class ChunkedWriter implements Closeable {
+    private final int maxChunkSizeInBytes;
+    private final String outputDir;
+    private SequenceFile.Writer writer;
+    private int currentChunkID;
+    // uncompressed key + value bytes appended to the current chunk so far
+    private int currentChunkSize;
+    private final Configuration conf = new Configuration();
+    private final FileSystem fs;
+    
+    /**
+     * @param chunkSizeInMB maximum chunk size in megabytes; values above 1984 are capped
+     * @param outputDir directory the chunk files are written to
+     * @throws IOException if the file system or the first chunk cannot be opened
+     */
+    public ChunkedWriter(int chunkSizeInMB, String outputDir) throws IOException {
+      // presumably capped so that the byte count below still fits in an int
+      if (chunkSizeInMB > 1984) {
+        chunkSizeInMB = 1984;
+      }
+      maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
+      this.outputDir = outputDir;
+      fs = FileSystem.get(conf);
+      currentChunkID = 0;
+      
+      writer = openWriter(currentChunkID);
+    }
+    
+    private Path getPath(int chunkID) {
+      return new Path(outputDir + "/chunk-" + chunkID);
+    }
+    
+    // opens a block-compressed Text/Text SequenceFile for the given chunk
+    private SequenceFile.Writer openWriter(int chunkID) throws IOException {
+      return SequenceFile.createWriter(fs, conf, getPath(chunkID),
+          Text.class, Text.class, SequenceFile.CompressionType.BLOCK);
+    }
+    
+    /**
+     * Appends one key/value pair, first rolling over to a new chunk file if
+     * the current one has already grown past the maximum size.
+     */
+    public void write(String key, String value) throws IOException {
+      if (currentChunkSize > maxChunkSizeInBytes) {
+        writer.close();
+        // advance the ID before opening, so the new chunk does not overwrite
+        // the file that was just closed
+        currentChunkID++;
+        log.info("Chunk size ("+currentChunkSize+") reached MAX; creating new chunk "+currentChunkID);
+        writer = openWriter(currentChunkID);
+        currentChunkSize = 0;
+      }
+      
+      Text keyT = new Text(key);
+      Text valueT = new Text(value);
+      // getBytes() exposes the backing array, which may be longer than the
+      // encoded data; getLength() is the number of valid bytes
+      currentChunkSize += keyT.getLength() + valueT.getLength();
+      writer.append(keyT, valueT);
+    }
+    
+    @Override
+    public void close() throws IOException {
+      writer.close();
+    }
+  }
+  
+  // regular expressions used to parse individual messages
+  private static final Pattern MESSAGE_START = 
+    Pattern.compile("^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
+  private static final Pattern MESSAGE_ID_PREFIX = 
+    Pattern.compile("^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE);
+  private static final Pattern SUBJECT_PREFIX = 
+    Pattern.compile("^subject: (.*)$", Pattern.CASE_INSENSITIVE);  
+  
+  /**
+   * {@link FileFilter} used as a directory visitor. {@link #accept(File)}
+   * never accepts anything; it recurses into directories and parses regular
+   * files, writing one key/value pair per mail message to the shared
+   * {@link ChunkedWriter}.
+   */
+  public class PrefixAdditionFilter implements FileFilter {
+    private final String prefix;
+    private final ChunkedWriter writer;
+    private final Charset charset;
+    // accumulates the subject and body text of the message being parsed
+    private final StringBuilder file;
+    private int messageCount;
+    
+    public PrefixAdditionFilter(String prefix, ChunkedWriter writer, Charset charset) {
+      this.prefix = prefix;
+      this.writer = writer;
+      this.charset = charset;
+      this.file = new StringBuilder();
+      this.messageCount = 0;
+    }
+    
+    /** Returns the number of message IDs seen so far, including those in nested directories. */
+    public int getMessageCount() {
+      return messageCount;
+    }
+    
+    @Override
+    public boolean accept(File current) {
+      if (current.isDirectory()) {
+        log.info("At "+current.getAbsolutePath());
+        PrefixAdditionFilter nested = 
+          new PrefixAdditionFilter(prefix + File.separator + current.getName(), writer, charset);
+        current.listFiles(nested);
+        int dirCount = nested.getMessageCount();
+        log.info("Parsed "+dirCount+" messages from directory "+current.getAbsolutePath());
+        messageCount += dirCount;
+      } else {
+        parseFileLineByLine(current);
+      }
+      // always reject: listFiles() is only used to drive the traversal
+      return false;
+    }
+    
+    // extracts mail subject and body text from 0 or more mail messages
+    // embedded in the supplied file using simple pattern matching
+    private void parseFileLineByLine(File current) {
+      try {
+        file.setLength(0); // reset the buffer
+        
+        // tmps used during mail message parsing
+        String messageId = null;
+        boolean inBody = false;
+        Matcher subjectMatcher = SUBJECT_PREFIX.matcher("");
+        Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher("");
+        Matcher messageBoundaryMatcher = MESSAGE_START.matcher("");
+        
+        for (String nextLine : new FileLineIterable(current, charset, false)) {
+
+          // subject may come before message ID
+          // NOTE(review): this also matches "Subject: ..." lines inside a body,
+          // which are then appended twice - confirm whether that is intended
+          subjectMatcher.reset(nextLine);
+          if (subjectMatcher.matches()) {
+            file.append(subjectMatcher.group(1)).append('\n');
+          }
+          
+          // only start appending body content after we've seen a message ID
+          if (messageId != null) {
+            // first, see if we hit the end of the message
+            messageBoundaryMatcher.reset(nextLine);
+            if (messageBoundaryMatcher.matches()) {
+              // done parsing this message ... write it out
+              writeMessage(current, messageId);
+              messageId = null;
+              inBody = false;
+            } else if (inBody) {
+              if (nextLine.length() > 0) {
+                file.append(nextLine).append('\n');
+              }
+            } else {
+              // first empty line we see after reading the message Id
+              // indicates that we are in the body ...
+              inBody = (nextLine.length() == 0);
+            }
+          } else if (nextLine.length() > 14) {
+            // cheap length check before running the message-id regex
+            messageIdMatcher.reset(nextLine);
+            if (messageIdMatcher.matches()) {
+              messageId = messageIdMatcher.group(1);
+              ++messageCount;
+            }
+          }
+        }
+
+        // write the last message in the file if available
+        if (messageId != null) {
+          writeMessage(current, messageId);
+        }
+      } catch (FileNotFoundException e) {
+        // skip the file, but leave a trace instead of dropping it silently
+        log.warn("Skipping file that could not be opened: " + current.getAbsolutePath(), e);
+      } catch (IOException e) {
+        // TODO: report exceptions and continue;
+        throw new IllegalStateException(e);
+      }
+    }
+    
+    // writes the buffered text under <prefix>/<file name>/<message id> and clears the buffer
+    private void writeMessage(File current, String messageId) throws IOException {
+      String key = prefix + File.separator + current.getName() + File.separator + messageId;
+      writer.write(key, file.toString());
+      file.setLength(0); // reset the buffer
+    }
+  }
+  
+  /**
+   * Command-line entry point. Parses the options (input, output, chunkSize,
+   * keyPrefix, charset, help) and runs the conversion; on an option parsing
+   * error the error is logged and the help text is printed.
+   */
+  public static void main(String[] args) throws Exception {
+    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    GroupBuilder groupBuilder = new GroupBuilder();
+    
+    Option inputDirOpt = optionBuilder
+        .withLongName("input")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("input").withMinimum(1).withMaximum(1).create())
+        .withDescription("The input dir containing the documents")
+        .withShortName("i")
+        .create();
+    
+    Option outputOpt = optionBuilder
+        .withLongName("output")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("output").withMinimum(1).withMaximum(1).create())
+        .withDescription("The output directory")
+        .withShortName("o")
+        .create();
+    
+    Option chunkOpt = optionBuilder
+        .withLongName("chunkSize")
+        .withArgument(argumentBuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
+        .withDescription("The chunkSize in MegaBytes. Defaults to 64")
+        .withShortName("chunk")
+        .create();
+    
+    Option prefixOpt = optionBuilder
+        .withLongName("keyPrefix")
+        .withArgument(argumentBuilder.withName("keyPrefix").withMinimum(1).withMaximum(1).create())
+        .withDescription("The prefix to be prepended to the key")
+        .withShortName("prefix")
+        .create();
+    
+    Option encodingOpt = optionBuilder
+        .withLongName("charset")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("charset").withMinimum(1).withMaximum(1).create())
+        .withDescription("The name of the character encoding of the input files")
+        .withShortName("c")
+        .create();
+    
+    Option helpOpt = optionBuilder
+        .withLongName("help")
+        .withDescription("Print out help")
+        .withShortName("h")
+        .create();
+    
+    Group options = groupBuilder
+        .withName("Options")
+        .withOption(prefixOpt)
+        .withOption(chunkOpt)
+        .withOption(encodingOpt)
+        .withOption(outputOpt)
+        .withOption(helpOpt)
+        .withOption(inputDirOpt)
+        .create();
+    
+    try {
+      Parser cliParser = new Parser();
+      cliParser.setGroup(options);
+      cliParser.setHelpOption(helpOpt);
+      CommandLine parsed = cliParser.parse(args);
+      if (parsed.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(options);
+        return;
+      }
+      File inputDir = new File((String) parsed.getValue(inputDirOpt));
+      String outputDir = (String) parsed.getValue(outputOpt);
+      
+      // optional settings fall back to 64 MB chunks and an empty key prefix
+      int chunkSizeInMB = parsed.hasOption(chunkOpt)
+          ? Integer.parseInt((String) parsed.getValue(chunkOpt))
+          : 64;
+      String keyPrefix = parsed.hasOption(prefixOpt)
+          ? (String) parsed.getValue(prefixOpt)
+          : "";
+      Charset encoding = Charset.forName((String) parsed.getValue(encodingOpt));
+      
+      SequenceFilesFromMailArchives converter = new SequenceFilesFromMailArchives();
+      converter.createSequenceFiles(inputDir, outputDir, keyPrefix, chunkSizeInMB, encoding);
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(options);
+    }
+  }
+}

Propchange: 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java?rev=1085408&view=auto
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
 (added)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
 Fri Mar 25 14:39:25 2011
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.StringReader;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for the MailArchivesClusteringAnalyzer text analyzer.
+ */
+public class MailArchivesClusteringAnalyzerTest {
+  
+  /**
+   * Runs a small sample text through the analyzer and checks that exactly
+   * the expected stemmed tokens come out, in order.
+   */
+  @Test
+  public void testAnalysis() throws Exception {
+    MailArchivesClusteringAnalyzer analyzer = new MailArchivesClusteringAnalyzer();
+    
+    String text = "A test message\n";
+    text += "atokenthatistoolongtobeusefulforclustertextanalysis\n";
+    text += "Mahout is a scalable, machine-learning LIBRARY\n";
+    text += "we've added some additional stopwords such as html, mailto, regards\t";
+    text += "apache_hadoop provides the foundation for scalability\n";
+    // a host name and an e-mail address: neither appears in the expected
+    // tokens below (presumably each is kept as one token by the tokenizer
+    // and then rejected by the alpha-numeric pattern - TODO confirm)
+    text += "www.nabble.com user@example.com\n";
+    text += "public void int protected package";
+    StringReader reader = new StringReader(text);
+    
+    // if you change the text above, then you may need to change this as well
+    // order matters too
+    String[] expectedTokens = new String[] {
+        "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
+        "stopword", "apach", "hadoop", "provid", "foundat", "scalabl"
+    };
+        
+    TokenStream tokenStream = analyzer.tokenStream("test", reader);
+    assertNotNull(tokenStream);
+    TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
+    int tokenCount = 0;
+    while (tokenStream.incrementToken()) {
+      assertEquals(expectedTokens[tokenCount], termAtt.term());
+      tokenCount++;
+    }
+    // without this check an analyzer that emits too few tokens (or none) would pass
+    assertEquals(expectedTokens.length, tokenCount);
+  }
+}

Propchange: 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java?rev=1085408&view=auto
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
 (added)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
 Fri Mar 25 14:39:25 2011
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Reader;
+import org.apache.hadoop.io.Text;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test case for the SequenceFilesFromMailArchives command-line application.
+ */
+public class SequenceFilesFromMailArchivesTest {
+  
+  // TODO: Negative tests
+
+  // Temp directory that receives the gzipped test mail archive; assigned
+  // in setupBeforeTesting() and deleted in cleanupAfterTesting().
+  private File inputDir = null;
+  // Temp directory passed to the application as --output; assigned in
+  // setupBeforeTesting() and deleted in cleanupAfterTesting().
+  private File outputDir = null;
+
+  /**
+   * Create the input and output directories needed for testing
+   * the SequenceFilesFromMailArchives application, and write the
+   * test mail messages to a gzipped file in a nested input directory.
+   *
+   * @throws IOException if the gzipped test archive cannot be written
+   */
+  @Before
+  public void setupBeforeTesting() throws IOException {
+    // tread-lightly, create folder names using the timestamp
+    long now = System.currentTimeMillis();
+    inputDir = createTempDir("mail-archives-"+now+"-in");
+    outputDir = createTempDir("mail-archives-"+now+"-out");
+
+    // write test mail messages to a gzipped file in a nested directory
+    File subDir = new File(inputDir, "subdir");
+    if (!subDir.mkdir() && !subDir.isDirectory()) {
+      // same policy as createTempDir: report the directory that failed
+      fail("Failed to create input sub-directory "
+          +subDir.getAbsolutePath());
+    }
+    File gzFile = new File(subDir, "mail-messages.gz");
+
+    // open the raw file stream on its own so it can still be closed
+    // when the GZIPOutputStream constructor throws an IOException
+    FileOutputStream fileOut = new FileOutputStream(gzFile);
+    GZIPOutputStream gzOut = null;
+    try {
+      gzOut = new GZIPOutputStream(fileOut);
+      gzOut.write(testMailMessages.getBytes("UTF-8"));
+      gzOut.finish();
+    } finally {
+      try {
+        if (gzOut != null) {
+          // closing the gzip stream also closes the wrapped file stream
+          gzOut.close();
+        } else {
+          fileOut.close();
+        }
+      } catch (IOException ignored) {
+        // best-effort cleanup: on the success path finish() has already
+        // written all data, and on the failure path the exception from
+        // the try block must not be replaced by a close() failure
+      }
+    }
+  }
+
+  /**
+   * Test the main method of the SequenceFilesFromMailArchives
+   * command-line application.
+   */
+  @Test
+  public void testMain() throws Exception {
+    String[] args = new String[] {
+      "--input", inputDir.getAbsolutePath(),  
+      "--output", outputDir.getAbsolutePath(),
+      "--charset", "UTF-8",
+      "--keyPrefix", "TEST"
+    };
+    
+    // run the application's main method
+    SequenceFilesFromMailArchives.main(args);
+    
+    // app should create a single SequenceFile named "chunk-0"
+    // in the output dir
+    File expectedChunkFile = new File(outputDir, "chunk-0");
+    String expectedChunkPath = expectedChunkFile.getAbsolutePath();
+    assertTrue("Expected chunk file "+expectedChunkPath+" not found!", 
+        expectedChunkFile.isFile());
+
+    Text key = new Text();
+    Text value = new Text();
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+    SequenceFile.Reader seqFileReader = null;
+    try {
+      seqFileReader = new SequenceFile.Reader(fs, new Path(expectedChunkPath), 
conf);
+      assertTrue("First key/value pair not found!", seqFileReader.next(key, 
value));
+      
+      assertEquals("TEST/subdir/mail-messages.gz/"+testVars[0][0], 
key.toString());
+      assertEquals(testVars[0][1]+testVars[0][2], value.toString());
+  
+      assertTrue("Second key/value pair not found!", seqFileReader.next(key, 
value));
+      assertEquals("TEST/subdir/mail-messages.gz/"+testVars[1][0], 
key.toString());
+      assertEquals(testVars[1][1]+testVars[1][2], value.toString());
+  
+      assertFalse("Only two key/value pairs expected!", 
seqFileReader.next(key, value));
+    } finally {
+      if (seqFileReader != null) {
+        try {
+          seqFileReader.close();
+        } catch (Exception ignore) {}
+      }
+    }
+  }
+
+  @After
+  public void cleanupAfterTesting() {
+    if (inputDir != null)
+      rmdir(inputDir);
+    
+    if (outputDir != null)
+      rmdir(outputDir);
+  }
+
+  // returns the directory <java.io.tmpdir>/<dirName>, creating it (and
+  // any missing parents) when it does not exist yet; fails the test if
+  // the directory still does not exist afterwards
+  private File createTempDir(String dirName) {
+    String tmpRoot = System.getProperty("java.io.tmpdir");
+    File tempDir = new File(tmpRoot, dirName);
+    if (tempDir.isDirectory()) {
+      // already there, nothing to create
+      return tempDir;
+    }
+
+    tempDir.mkdirs();
+    if (!tempDir.isDirectory()) {
+      fail("Failed to create temp directory "+tempDir.getAbsolutePath());
+    }
+    return tempDir;
+  }
+
+  // recursively delete the temp directories created by this test;
+  // deletion is best-effort: a file or directory that cannot be
+  // deleted is silently left behind
+  private void rmdir(File dir) {
+    // listFiles() returns null for a plain file and also when an I/O
+    // error occurs while reading a directory, so test the result for
+    // null instead of relying on isDirectory() alone
+    File[] children = dir.listFiles();
+    if (children != null) {
+      for (File child : children) {
+        // works for plain files too: no children, so just a delete
+        rmdir(child);
+      }
+    }
+    dir.delete();
+  }
+  
+  // Messages extracted and anonymized from the ASF mail archives
+  // Each row is { message id, subject, body }: testMailMessages embeds
+  // element 0 in the Message-ID header, element 1 in the Subject header
+  // and appends element 2 after the headers; testMain expects element 0
+  // at the end of the key and elements 1 + 2 as the value.
+  // NOTE(review): the "[email protected]" values look like e-mail
+  // obfuscation applied by the archive this text was taken from, which
+  // would make both message ids identical - confirm against the
+  // original commit.
+  private static final String[][] testVars = new String[][] {
+    new String[] {
+      "[email protected]",
+      "Ant task for JDK1.1 collections build option", 
+      "\nThis is just a test message\n--\nTesty McTester\n"
+    },
+    new String[] {
+      "[email protected]",
+      "Problem with build files in several directories",
+      "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n"
+    }
+  };
+  
+  // Raw text of the mail archive that setupBeforeTesting() gzips into
+  // the input directory. It holds two messages built from testVars:
+  // each starts with a "From <address>  <date>" line followed by
+  // header lines, and ends with the body text from testVars[i][2]
+  // (which itself begins with "\n", giving the blank line after the
+  // headers). Presumably this is mbox-style formatting - confirm
+  // against the parser in SequenceFilesFromMailArchives.
+  private static final String testMailMessages =
+    "From [email protected]  Mon Jul 24 19:13:53 2000\n"+
+    "Return-Path: <[email protected]>\n"+
+    "Mailing-List: contact [email protected]; run by ezmlm\n"+
+    "Delivered-To: mailing list [email protected]\n"+
+    "Received: (qmail 49267 invoked from network); 24 Jul 2000 19:13:53 
-0000\n"+
+    "Message-ID: <"+testVars[0][0]+">\n"+
+    "From: \"Testy McTester\" <[email protected]>\n"+
+    "To: <[email protected]>\n"+
+    "Subject: "+testVars[0][1]+"\n"+
+    "Date: Mon, 24 Jul 2000 12:24:56 -0700\n"+
+    "MIME-Version: 1.0\n"+
+    "Content-Type: text/plain;\n"+
+    "  charset=\"Windows-1252\"\n"+
+    "Content-Transfer-Encoding: 7bit\n"+
+    "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"+
+    testVars[0][2]+
+    // extra newline between the first message and the second one
+    "\n"+
+    "From [email protected]  Wed Jul 26 11:32:16 2000\n"+
+    "Return-Path: <[email protected]>\n"+
+    "Mailing-List: contact [email protected]; run by ezmlm\n"+
+    "Delivered-To: mailing list [email protected]\n"+
+    "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16 
-0000\n"+
+    "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n"+
+    "Date: Wed, 26 Jul 2000 13:32:08 +0200\n"+
+    "Subject: "+testVars[1][1]+"\n"+
+    "From: Another Test <[email protected]>\n"+
+    "To: <[email protected]>\n"+
+    "Message-Id: <"+testVars[1][0]+">\n"+
+    "Mime-Version: 1.0\n"+
+    "Content-Type: text/plain; charset=\"US-ASCII\"\n"+
+    "Content-Transfer-Encoding: 7bit\n"+
+    "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"+
+    testVars[1][2];
+}

Propchange: 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

svn commit: r1085408 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/text/ test/java/org/apache/mahout/text/

Reply via email to