Author: gsingers
Date: Sat Nov  5 17:11:57 2011
New Revision: 1197992

URL: http://svn.apache.org/viewvc?rev=1197992&view=rev
Log:
MAHOUT-403: add in some regex transformation capabilities for converting raw 
content

Added:
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/
    
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
    
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
    mahout/trunk/src/conf/driver.classes.props

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java?rev=1197992&r1=1197991&r2=1197992&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java 
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java 
Sat Nov  5 17:11:57 2011
@@ -47,7 +47,9 @@ import org.apache.hadoop.mapreduce.lib.i
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.util.Tool;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.vectorizer.DefaultAnalyzer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -448,4 +450,15 @@ public abstract class AbstractJob extend
     FileInputFormat.setInputPaths(job, inputPathOne.makeQualified(fs), 
inputPathTwo.makeQualified(fs));
   }
 
+  /**
+   * Resolves the Lucene {@link Analyzer} class to use for this job.
+   * <p>
+   * Starts from {@link DefaultAnalyzer}. When the
+   * {@link DefaultOptionCreator#ANALYZER_NAME_OPTION} option is present, the named class is
+   * loaded, narrowed to {@link Analyzer}, and instantiated once as a sanity check before
+   * its class object is returned.
+   *
+   * @return the analyzer class named by the option, or {@code DefaultAnalyzer.class}
+   * @throws ClassNotFoundException if the class named by the option cannot be loaded
+   */
+  protected Class<? extends Analyzer> getAnalyzerClassFromOption() throws 
ClassNotFoundException {
+    Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
+    if (hasOption(DefaultOptionCreator.ANALYZER_NAME_OPTION)) {
+      String className = 
getOption(DefaultOptionCreator.ANALYZER_NAME_OPTION).toString();
+      analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
+      // try instantiating it, b/c there isn't any point in setting it if
+      // you can't instantiate it
+      ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
+    }
+    return analyzerClass;
+  }
 }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=1197992&r1=1197991&r2=1197992&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
 Sat Nov  5 17:11:57 2011
@@ -23,6 +23,7 @@ import org.apache.commons.cli2.builder.D
 import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
 import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
 import org.apache.mahout.common.kernel.TriangularKernelProfile;
+import org.apache.mahout.vectorizer.DefaultAnalyzer;
 
 public final class DefaultOptionCreator {
   
@@ -67,6 +68,8 @@ public final class DefaultOptionCreator 
   public static final String MAPREDUCE_METHOD = "mapreduce";
   
   public static final String KERNEL_PROFILE_OPTION = "kernelProfile";
+
+  public static final String ANALYZER_NAME_OPTION = "analyzerName";
   
   private DefaultOptionCreator() {}
   
@@ -321,6 +324,24 @@ public static DefaultOptionBuilder clust
             "If present, run clustering after the iterations have taken place")
         .withShortName("cl");
   }
+
+  /**
+   * Returns a default command line option for specifying a Lucene analyzer class.
+   * The option is not required, uses {@link #ANALYZER_NAME_OPTION} as its long name and
+   * {@code an} as its short name, and takes exactly one argument whose default is the
+   * class name of {@link DefaultAnalyzer}.
+   *
+   * @return {@link DefaultOptionBuilder}
+   */
+  public static DefaultOptionBuilder analyzerOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(ANALYZER_NAME_OPTION)
+        .withRequired(false)
+        .withDescription(
+            "If present, the name of a Lucene analyzer class to use")
+        .withArgument(
+                new 
ArgumentBuilder().withName(ANALYZER_NAME_OPTION).withDefault(DefaultAnalyzer.class.getName())
+                .withMinimum(1).withMaximum(1).create()
+        )
+        .withShortName("an");
+  }
+
   
   /**
    * Returns a default command line option for specifying the emitMostLikely

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java?rev=1197992&r1=1197991&r2=1197992&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
 Sat Nov  5 17:11:57 2011
@@ -45,7 +45,7 @@ public final class EncodedVectorsFromSeq
   public int run(String[] args) throws Exception {
     addInputOption();
     addOutputOption();
-    addOption("analyzerName", "an", "The class name of the analyzer", 
DefaultAnalyzer.class.getName());
+    addOption(DefaultOptionCreator.analyzerOption().create());
     addOption(buildOption("sequentialAccessVector", "seq", "(Optional) Whether 
output vectors should be SequentialAccessVectors. If set true else false", 
false, false, null));
     addOption(buildOption("namedVector", "nv", "Create named vectors using the 
key.  False by default", false, false, null));
     addOption("cardinality", "c", "The cardinality to use for creating the 
vectors.  Default is 5000", String.valueOf(5000));
@@ -63,14 +63,7 @@ public final class EncodedVectorsFromSeq
       HadoopUtil.delete(getConf(), output);
     }
 
-    Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
-    if (hasOption("analyzerName")) {
-      String className = getOption("analyzerName").toString();
-      analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
-      // try instantiating it, b/c there isn't any point in setting it if
-      // you can't instantiate it
-      ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
-    }
+    Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption();
 
 
     Configuration conf = getConf();

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,58 @@
+package org.apache.mahout.utils.regex;
+
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+import org.apache.mahout.common.lucene.TokenStreamIterator;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * A {@link RegexTransformer} that runs each match through a Lucene {@link Analyzer}
+ * and concatenates the tokens it produces, each followed by a single space.
+ **/
+public class AnalyzerTransformer implements RegexTransformer {
+  // Analyzer used to tokenize matches; can be swapped via setAnalyzer
+  protected Analyzer analyzer;
+  // Field name passed to the analyzer when a token stream is requested
+  protected String fieldName = "text";
+
+  /** Creates a transformer using a {@code StandardAnalyzer} for Lucene 3.4 and field name "text". */
+  public AnalyzerTransformer() {
+    this(new StandardAnalyzer(Version.LUCENE_34), "text");
+  }
+
+  /** Creates a transformer using the given analyzer and field name "text". */
+  public AnalyzerTransformer(Analyzer analyzer) {
+    this(analyzer, "text");
+  }
+
+  /** Creates a transformer using the given analyzer and field name. */
+  public AnalyzerTransformer(Analyzer analyzer, String fieldName) {
+    this.analyzer = analyzer;
+    this.fieldName = fieldName;
+  }
+
+  /**
+   * Tokenizes {@code match} with the configured analyzer.
+   *
+   * @param match the text to analyze
+   * @return every token from the stream, in order, each followed by one space (so a
+   *         non-empty result ends with a space); empty if the stream yields no tokens
+   * @throws RuntimeException wrapping any {@link IOException} raised while obtaining the stream
+   */
+  @Override
+  public String transformMatch(String match) {
+    StringBuilder result = new StringBuilder();
+    try {
+      TokenStream ts = analyzer.reusableTokenStream(fieldName, new 
StringReader(match));
+      ts.addAttribute(CharTermAttribute.class);
+      // NOTE(review): the stream is not reset, ended or closed in this method;
+      // presumably TokenStreamIterator takes care of that - confirm
+      TokenStreamIterator iter = new TokenStreamIterator(ts);
+      while (iter.hasNext()) {
+        result.append(iter.next()).append(" ");
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+    return result.toString();
+  }
+
+  public Analyzer getAnalyzer() {
+    return analyzer;
+  }
+
+  public void setAnalyzer(Analyzer analyzer) {
+    this.analyzer = analyzer;
+  }
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,39 @@
+package org.apache.mahout.utils.regex;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A {@link RegexTransformer} that pipes a match through an ordered list of other
+ * transformers, handing each one the output of the one before it.
+ **/
+public class ChainTransformer implements RegexTransformer {
+
+  private List<RegexTransformer> chain;
+
+  /** Creates a chain with no transformers; such a chain returns every match unchanged. */
+  public ChainTransformer() {
+    this(new ArrayList<RegexTransformer>());
+  }
+
+  /** Creates a chain that applies the given transformers in list order. */
+  public ChainTransformer(List<RegexTransformer> chain) {
+    this.chain = chain;
+  }
+
+  /**
+   * Applies every transformer in the chain, in order, to {@code match}.
+   *
+   * @param match the text to transform
+   * @return the output of the last transformer, or {@code match} itself for an empty chain
+   */
+  @Override
+  public String transformMatch(String match) {
+    String current = match;
+    for (RegexTransformer step : chain) {
+      current = step.transformMatch(current);
+    }
+    return current;
+  }
+
+  public List<RegexTransformer> getChain() {
+    return chain;
+  }
+
+  public void setChain(List<RegexTransformer> chain) {
+    this.chain = chain;
+  }
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,32 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.regex.Pattern;
+
+/**
+ * Replaces every run of non-word characters (regex {@code \W+}) with a single
+ * '|' and prefixes the result with a tab.
+ **/
+public class FPGFormatter implements RegexFormatter {
+  // Matches one or more consecutive non-word characters, not just whitespace
+  private static final Pattern NON_WORD_RUN = Pattern.compile("\\W+");
+
+  @Override
+  public String format(String toFormat) {
+    String piped = NON_WORD_RUN.matcher(toFormat).replaceAll("|");
+    return "\t" + piped;
+  }
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,14 @@
+package org.apache.mahout.utils.regex;
+
+
+/**
+ * A {@link RegexFormatter} that performs no formatting:
+ * {@link #format(String)} returns its argument unchanged.
+ **/
+public class IdentityFormatter implements RegexFormatter {
+
+  @Override
+  public String format(String toFormat) {
+    return toFormat;
+  }
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,28 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * No-op {@link RegexTransformer}: returns every match unchanged.
+ */
+public class IdentityTransformer implements RegexTransformer {
+  @Override
+  public String transformMatch(String match) {
+    return match;
+  }
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,103 @@
+package org.apache.mahout.utils.regex;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.vectorizer.DefaultAnalyzer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Experimental. Command line driver that configures and runs a map-only
+ * {@link RegexMapper} job over text input.
+ */
+public class RegexConverterDriver extends AbstractJob {
+
+  /**
+   * Parses the command line, copies the regex settings into the job configuration and
+   * runs the job.
+   * <p>
+   * The shorthand {@code url} for the transformer class selects
+   * {@link URLDecodeTransformer}; the shorthand {@code fpg} for the formatter class
+   * selects {@link FPGFormatter}. Both shorthands are matched case-insensitively.
+   *
+   * @param args the command line arguments
+   * @return 0 if the job completed successfully, -1 if argument parsing failed or the job failed
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addOption("regex", "regex",
+            "The regular expression to use", true);
+    addOption("groupsToKeep", "g",
+            "The number of the capturing groups to keep", false);
+    addOption("transformerClass", "t",
+            "The optional class specifying the Regex Transformer", false);
+    // short name is "f": "t" is already taken by transformerClass
+    addOption("formatterClass", "f",
+            "The optional class specifying the Regex Formatter", false);
+    addOption(DefaultOptionCreator.analyzerOption().create());
+
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    Configuration conf = getConf();
+    //TODO: How to deal with command line escaping?
+    conf.set(RegexMapper.REGEX, getOption("regex"));
+    String gtk = getOption("groupsToKeep");
+    if (gtk != null) {
+      conf.set(RegexMapper.GROUP_MATCHERS, gtk);
+    }
+    String trans = getOption("transformerClass");
+    if (trans != null) {
+      if (trans.equalsIgnoreCase("url")) {
+        trans = URLDecodeTransformer.class.getName();
+      }
+      conf.set(RegexMapper.TRANSFORMER_CLASS, trans);
+    }
+    String formatter = getOption("formatterClass");
+    if (formatter != null) {
+      if (formatter.equalsIgnoreCase("fpg")) {
+        formatter = FPGFormatter.class.getName();
+      }
+      conf.set(RegexMapper.FORMATTER_CLASS, formatter);
+    }
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(conf, output);
+    }
+    Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption();
+    if (analyzerClass != null) {
+      conf.set(RegexMapper.ANALYZER_NAME, analyzerClass.getName());
+    }
+    Job job = prepareJob(input, output,
+            TextInputFormat.class,
+            RegexMapper.class,
+            LongWritable.class,
+            Text.class,
+            TextOutputFormat.class);
+    // report job failure to the caller instead of always returning success
+    boolean succeeded = job.waitForCompletion(true);
+    return succeeded ? 0 : -1;
+  }
+
+  /** Runs this driver through Hadoop's {@link ToolRunner}. */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new RegexConverterDriver(), args);
+  }
+
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,26 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Formats the text extracted by a regular expression, for example before
+ * {@link RegexMapper} writes it out.
+ **/
+public interface RegexFormatter {
+  /** Returns the formatted form of {@code toFormat}. */
+  public String format(String toFormat);
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,82 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.common.ClassUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+
+/**
+ * Mapper that applies a regular expression to each input line, optionally transforms
+ * and formats what it extracts, and writes non-empty results under the input key.
+ * All settings are read from the job {@link Configuration} using the keys below.
+ **/
+public class RegexMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
+  public static final String REGEX = "regex";
+  public static final String GROUP_MATCHERS = "regex.groups";
+  public static final String TRANSFORMER_CLASS = "transformer.class";
+  public static final String FORMATTER_CLASS = "formatter.class";
+  public static final String ANALYZER_NAME = "analyzerName";
+
+  private Pattern regex;
+  private List<Integer> groupsToKeep;
+  private RegexTransformer transformer = RegexUtils.IDENTITY_TRANSFORMER;
+  private RegexFormatter formatter = RegexUtils.IDENTITY_FORMATTER;
+
+  /**
+   * Reads the regex, the capturing groups to keep, and the transformer and formatter
+   * classes from the configuration. If an analyzer name is configured and the
+   * transformer is an {@link AnalyzerTransformer}, that analyzer is instantiated and
+   * installed on it.
+   *
+   * @throws IllegalStateException if the {@link #REGEX} property is not set
+   * @throws NumberFormatException if a configured group is not an integer
+   */
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    groupsToKeep = new ArrayList<Integer>();
+    Configuration config = context.getConfiguration();
+    String regexStr = config.get(REGEX);
+    if (regexStr == null) {
+      // fail with a readable message instead of an NPE from Pattern.compile(null)
+      throw new IllegalStateException("Missing required configuration property: " + REGEX);
+    }
+    regex = Pattern.compile(regexStr);
+    String[] groups = config.getStrings(GROUP_MATCHERS);
+    if (groups != null) {
+      for (String group : groups) {
+        // comma-delimited values may carry surrounding whitespace, e.g. "1, 2"
+        groupsToKeep.add(Integer.parseInt(group.trim()));
+      }
+    }
+
+    transformer = ClassUtils.instantiateAs(
+        config.get(TRANSFORMER_CLASS, IdentityTransformer.class.getName()), RegexTransformer.class);
+    String analyzerName = config.get(ANALYZER_NAME);
+    if (analyzerName != null && transformer instanceof AnalyzerTransformer) {
+      Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class);
+      ((AnalyzerTransformer) transformer).setAnalyzer(analyzer);
+    }
+
+    formatter = ClassUtils.instantiateAs(
+        config.get(FORMATTER_CLASS, IdentityFormatter.class.getName()), RegexFormatter.class);
+  }
+
+  /**
+   * Extracts the configured matches from {@code text}, joined by a single space, and
+   * writes the formatted result under {@code key}. Lines that yield an empty result
+   * produce no output.
+   */
+  @Override
+  protected void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
+    String result = RegexUtils.extract(text.toString(), regex, groupsToKeep, " ", transformer);
+    if (result != null && result.length() > 0) {
+      String format = formatter.format(result);
+      context.write(key, new Text(format));
+    }
+  }
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,26 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Transforms the match of a regular expression.
+ */
+public interface RegexTransformer {
+  /** Returns the transformed form of {@code match}. */
+  public String transformMatch(String match);
+
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,69 @@
+package org.apache.mahout.utils.regex;
+
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Static helpers for pulling regular-expression matches out of a line of text.
+ **/
+public class RegexUtils {
+  public static final RegexTransformer IDENTITY_TRANSFORMER = new IdentityTransformer();
+  public static final RegexFormatter IDENTITY_FORMATTER = new IdentityFormatter();
+
+  /**
+   * Extracts all matches of {@code pattern} in {@code line} into a new string.
+   *
+   * @param line         the text to search
+   * @param pattern      the pattern to look for
+   * @param groupsToKeep capturing group numbers to emit per match; if empty, the whole match is emitted
+   * @param separator    text placed between emitted values
+   * @param transformer  applied to every emitted value; {@code null} means no transformation
+   * @return the transformed values joined by {@code separator}; empty if nothing matched
+   */
+  public static String extract(String line, Pattern pattern, List<Integer> groupsToKeep,
+                               String separator, RegexTransformer transformer) {
+    StringBuilder bldr = new StringBuilder();
+    extract(line, bldr, pattern, groupsToKeep, separator, transformer);
+    return bldr.toString();
+  }
+
+  /**
+   * Same as {@link #extract(String, Pattern, List, String, RegexTransformer)} but appends
+   * to {@code outputBuffer}. Content already in the buffer is left untouched; only the
+   * separator following the last value appended by this call is removed.
+   *
+   * @param line         the text to search
+   * @param outputBuffer receives the transformed values joined by {@code separator}
+   * @param pattern      the pattern to look for
+   * @param groupsToKeep capturing group numbers to emit per match; if empty, the whole match is emitted
+   * @param separator    text placed between emitted values
+   * @param transformer  applied to every emitted value; {@code null} means no transformation
+   */
+  public static void extract(String line, StringBuilder outputBuffer,
+                             Pattern pattern, List<Integer> groupsToKeep, String separator,
+                             RegexTransformer transformer) {
+    if (transformer == null) {
+      transformer = IDENTITY_TRANSFORMER;
+    }
+    // remember where this call's output starts so pre-existing content is never trimmed
+    int startLength = outputBuffer.length();
+    Matcher matcher = pattern.matcher(line);
+    if (!groupsToKeep.isEmpty()) {
+      while (matcher.find()) {
+        for (Integer groupNum : groupsToKeep) {
+          String match = matcher.group(groupNum);
+          // a group that did not participate in the match yields null and is skipped
+          if (match != null) {
+            outputBuffer.append(transformer.transformMatch(match)).append(separator);
+          }
+        }
+      }
+    } else {
+      while (matcher.find()) {
+        String match = matcher.group();
+        if (match != null) {
+          outputBuffer.append(transformer.transformMatch(match)).append(separator);
+        }
+      }
+    }
+    // trim off the last separator, but only if this call actually appended something
+    if (outputBuffer.length() > startLength) {
+      outputBuffer.setLength(outputBuffer.length() - separator.length());
+    }
+  }
+}

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,46 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+
+
+/**
+ *
+ *
+ **/
+public class URLDecodeTransformer implements RegexTransformer {
+  private String enc;
+
+  public URLDecodeTransformer() {
+    enc = "UTF-8";
+  }
+
+  public URLDecodeTransformer(String encoding) {
+    this.enc = encoding;
+  }
+
+  @Override
+  public String transformMatch(String match) {
+    try {
+      return URLDecoder.decode(match, enc);
+    } catch (UnsupportedEncodingException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}

Added: 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
 (added)
+++ 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,114 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.List;
+
+
+/**
+ *
+ *
+ **/
+
+public class RegexMapperTest extends MahoutTestCase {
+
+
+  @Test
+  public void testRegex() throws Exception {
+    RegexMapper mapper = new RegexMapper();
+    Configuration conf = new Configuration();
+    conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
+    conf.set(RegexMapper.TRANSFORMER_CLASS, 
URLDecodeTransformer.class.getName());
+    //conf.set(RegexMapper.);
+    DummyRecordWriter<LongWritable, Text> mapWriter = new 
DummyRecordWriter<LongWritable, Text>();
+    Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = 
DummyRecordWriter
+            .build(mapper, conf, mapWriter);
+
+    mapper.setup(mapContext);
+    for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
+      String testStr = RegexUtilsTest.TEST_STRS[i];
+
+      LongWritable key = new LongWritable(i);
+      mapper.map(key, new Text(testStr), mapContext);
+      List<Text> value = mapWriter.getValue(key);
+      if (RegexUtilsTest.GOLD[i].equals("") == false) {
+        assertEquals(1, value.size());
+        assertEquals(RegexUtilsTest.GOLD[i], value.get(0).toString());
+      }
+    }
+  }
+
+  @Test
+  public void testGroups() throws Exception {
+    RegexMapper mapper = new RegexMapper();
+    Configuration conf = new Configuration();
+    conf.set(RegexMapper.REGEX, "(\\d+)\\.(\\d+)\\.(\\d+)");
+    conf.set(RegexMapper.TRANSFORMER_CLASS, 
URLDecodeTransformer.class.getName());
+    conf.setStrings(RegexMapper.GROUP_MATCHERS, "1", "3");
+    //conf.set(RegexMapper.);
+    DummyRecordWriter<LongWritable, Text> mapWriter = new 
DummyRecordWriter<LongWritable, Text>();
+    Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = 
DummyRecordWriter
+            .build(mapper, conf, mapWriter);
+
+    mapper.setup(mapContext);
+    for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
+      String testStr = RegexUtilsTest.TEST_STRS[i];
+
+      LongWritable key = new LongWritable(i);
+      mapper.map(key, new Text(testStr), mapContext);
+      List<Text> value = mapWriter.getValue(key);
+      String gold = "127 0";
+      assertEquals(1, value.size());
+      assertEquals(gold, value.get(0).toString());
+    }
+  }
+
+  @Test
+  public void testFPGFormatter() throws Exception {
+    RegexMapper mapper = new RegexMapper();
+    Configuration conf = new Configuration();
+    conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
+    conf.set(RegexMapper.TRANSFORMER_CLASS, 
URLDecodeTransformer.class.getName());
+    conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName());
+    //conf.set(RegexMapper.);
+    DummyRecordWriter<LongWritable, Text> mapWriter = new 
DummyRecordWriter<LongWritable, Text>();
+    Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = 
DummyRecordWriter
+            .build(mapper, conf, mapWriter);
+
+    mapper.setup(mapContext);
+    FPGFormatter formatter = new FPGFormatter();
+    for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
+      String testStr = RegexUtilsTest.TEST_STRS[i];
+
+      LongWritable key = new LongWritable(i);
+      mapper.map(key, new Text(testStr), mapContext);
+      List<Text> value = mapWriter.getValue(key);
+      if (RegexUtilsTest.GOLD[i].equals("") == false) {
+        assertEquals(1, value.size());
+        assertEquals(formatter.format(RegexUtilsTest.GOLD[i]), 
value.get(0).toString());
+      }
+    }
+  }
+}

Added: 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java?rev=1197992&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
 (added)
+++ 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
 Sat Nov  5 17:11:57 2011
@@ -0,0 +1,67 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Pattern;
+
+
+/**
+ * Tests {@link RegexUtils#extract} on web-server style log lines. The
+ * {@link #TEST_STRS} and {@link #GOLD} fixtures are public so that
+ * RegexMapperTest can reuse them.
+ **/
+public class RegexUtilsTest extends MahoutTestCase {
+  // Sample log lines; each contains a "q" request parameter (empty in the
+  // last entry).
+  public static final String[] TEST_STRS = new String[]{
+          "127.0.0.1 -  -  [01/10/2011:00:01:51 +0000] \"GET 
/solr/collection1/browse?q=foo&rows=10&wt=json&hl=true&hl.fl=body&hl.fl=content",
+          "127.0.0.1 -  -  [01/10/2011:00:20:58 +0000] \"GET 
/solr/collection1/browse?q=Using+Solr+Search+RDBMS&fq=%7B%21tag%3Dsource%7D%28%28source%3Alucid+AND+lucid_facet%3A%28site%29%29%29&rows=10",
+          "127.0.0.1 -  -  [01/10/2011:00:21:21 +0000] \"GET 
/solr/collection1/browse?q=language+detection&start=560&rows=10 HTTP/1.1\" 200 
45071",
+          "127.0.0.1 -  -  [01/10/2011:00:21:21 +0000] \"GET 
/solr/collection1/browse?q=&start=560&rows=10 HTTP/1.1\" 200 45071"
+  };
+  // Expected URL-decoded "q" value for the TEST_STRS entry at the same index.
+  public static final String[] GOLD = new String[]{"foo", "Using Solr Search 
RDBMS", "language detection", ""};
+
+  @Test
+  public void testExtract() throws Exception {
+    String line = "127.0.0.1 -  -  [24/05/2010:01:19:22 +0000] \"GET 
/solr/select?q=import statement&start=1 HTTP/1.1\" 200 37571";
+    String res;
+    Pattern pattern;
+    // Whole-match extraction of the "q" parameter value, with no
+    // transformation applied.
+    pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)");
+    res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), 
" ", RegexUtils.IDENTITY_TRANSFORMER);
+    assertTrue(res, res.equals("import statement"));
+
+    // Same pattern over the shared fixtures, URL-decoding each match.
+    for (int i = 0; i < TEST_STRS.length; i++) {
+      String testStr = TEST_STRS[i];
+      res = RegexUtils.extract(testStr, pattern, 
Collections.<Integer>emptyList(), " ", new URLDecodeTransformer());
+      assertEquals(GOLD[i], res);
+    }
+
+    // Alternation matching either the "q" value or the digits of "start";
+    // the two matches are expected to be joined with the separator.
+    pattern = 
Pattern.compile("((?<=(\\?|&)q=)(.*?)(?=(&|$))|(?<=((\\?|&)start=))(\\d+))");
+    res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), 
" ", RegexUtils.IDENTITY_TRANSFORMER);
+    assertTrue(res, res.equals("import statement 1"));
+
+    // Keeping only group 1 should drop the " HTTP" part of the match.
+    pattern = Pattern.compile("(start=1) HTTP");
+    List<Integer> groupsToKeep = new ArrayList<Integer>();
+    groupsToKeep.add(1);
+    res = RegexUtils.extract(line, pattern, groupsToKeep, " ", 
RegexUtils.IDENTITY_TRANSFORMER);
+    assertTrue(res, res.equals("start=1"));
+  }
+}

Modified: mahout/trunk/src/conf/driver.classes.props
URL: 
http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.props?rev=1197992&r1=1197991&r2=1197992&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.props (original)
+++ mahout/trunk/src/conf/driver.classes.props Sat Nov  5 17:11:57 2011
@@ -7,6 +7,7 @@ org.apache.mahout.utils.vectors.arff.Dri
 org.apache.mahout.utils.vectors.RowIdJob = rowid : Map 
SequenceFile<Text,VectorWritable> to {SequenceFile<IntWritable,VectorWritable>, 
SequenceFile<IntWritable,Text>}
 org.apache.mahout.utils.SplitInput = split : Split Input data into test and 
train sets
 org.apache.mahout.utils.MatrixDumper = matrixdump : Dump matrix in CSV format
+org.apache.mahout.utils.regex.RegexConverterDriver = regexconverter : Convert 
text files on a per line basis based on regular expressions
 org.apache.mahout.text.SequenceFilesFromDirectory = seqdirectory : Generate 
sequence files (of Text) from a directory
 org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles = seq2sparse: 
Sparse Vector generation from Text sequence files
 org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles = seq2encoded: 
Encoded Sparse Vector generation from Text sequence files
@@ -34,25 +35,30 @@ org.apache.mahout.clustering.spectral.km
 #Freq. Itemset Mining
 org.apache.mahout.fpm.pfpgrowth.FPGrowthDriver = fpg : Frequent Pattern Growth
 #Classification
+#old bayes
+org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups = 
prepare20newsgroups : Reformat 20 newsgroups data
+org.apache.mahout.classifier.bayes.WikipediaXmlSplitter = wikipediaXMLSplitter 
: Reads wikipedia data and creates ch
+org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver = 
wikipediaDataSetCreator : Splits data set of wikipedia wrt feature like country
 org.apache.mahout.classifier.bayes.TestClassifier = testclassifier : Test the 
text based Bayes Classifier
 org.apache.mahout.classifier.bayes.TrainClassifier = trainclassifier : Train 
the text based Bayes Classifier
-org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups = 
prepare20newsgroups : Reformat 20 newsgroups data
+#new bayes
+org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob = trainnb 
: Train the Vector-based Bayes classifier
+org.apache.mahout.classifier.naivebayes.test.TestNaiveBayesDriver = testnb : 
Test the Vector-based Bayes classifier
+#SGD
 org.apache.mahout.classifier.sgd.TrainLogistic = trainlogistic : Train a 
logistic regression using stochastic gradient descent
 org.apache.mahout.classifier.sgd.RunLogistic = runlogistic : Run a logistic 
regression model against CSV data
 org.apache.mahout.classifier.sgd.PrintResourceOrFile = cat : Print a file or 
resource as the logistic regression models would see it
 org.apache.mahout.classifier.sgd.TrainAdaptiveLogistic = trainAdaptiveLogistic 
: Train an AdaptivelogisticRegression model
 org.apache.mahout.classifier.sgd.ValidateAdaptiveLogistic = 
validateAdaptiveLogistic : Validate an AdaptivelogisticRegression model against 
hold-out data set
 org.apache.mahout.classifier.sgd.RunAdaptiveLogistic = runAdaptiveLogistic : 
Score new production data using a probably trained and validated 
AdaptivelogisticRegression model
-org.apache.mahout.classifier.bayes.WikipediaXmlSplitter = wikipediaXMLSplitter 
: Reads wikipedia data and creates ch
-org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver = 
wikipediaDataSetCreator : Splits data set of wikipedia wrt feature like country
+#HMM
 org.apache.mahout.classifier.sequencelearning.hmm.BaumWelchTrainer = baumwelch 
: Baum-Welch algorithm for unsupervised HMM training
 org.apache.mahout.classifier.sequencelearning.hmm.ViterbiEvaluator = viterbi : 
Viterbi decoding of hidden states from given output states sequence
 org.apache.mahout.classifier.sequencelearning.hmm.RandomSequenceGenerator = 
hmmpredict : Generate random sequence of observations by given HMM
-org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob = trainnb 
: Train the Vector-based Bayes classifier
-org.apache.mahout.classifier.naivebayes.test.TestNaiveBayesDriver = testnb : 
Test the Vector-based Bayes classifier
+#Classifier Utils
 org.apache.mahout.classifier.ConfusionMatrixDumper = cmdump : Dump confusion 
matrix in HTML or text formats
 
-
+#Recommenders
 org.apache.mahout.cf.taste.hadoop.als.DatasetSplitter = splitDataset : split a 
rating dataset into training and probe parts
 org.apache.mahout.cf.taste.hadoop.als.FactorizationEvaluator = 
evaluateFactorization : compute RMSE and MAE of a rating matrix factorization 
against probes
 org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob = 
itemsimilarity : Compute the item-item-similarities for item-based 
collaborative filtering


Reply via email to