Author: gsingers
Date: Thu Nov 3 23:02:00 2011
New Revision: 1197342
URL: http://svn.apache.org/viewvc?rev=1197342&view=rev
Log:
put in some comments
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java?rev=1197342&r1=1197341&r2=1197342&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
Thu Nov 3 23:02:00 2011
@@ -17,9 +17,6 @@
package org.apache.mahout.classifier.naivebayes.training;
-import java.io.IOException;
-import java.util.Map;
-
import com.google.common.base.Splitter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
@@ -29,8 +26,8 @@ import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -40,7 +37,12 @@ import org.apache.mahout.common.iterator
import org.apache.mahout.common.mapreduce.VectorSumReducer;
import org.apache.mahout.math.VectorWritable;
-/** This class trains a Naive Bayes Classifier (Parameters for both Naive
Bayes and Complementary Naive Bayes) */
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * This class trains a Naive Bayes Classifier (Parameters for both Naive Bayes
and Complementary Naive Bayes)
+ */
public final class TrainNaiveBayesJob extends AbstractJob {
public static final String WEIGHTS_PER_FEATURE = "__SPF";
@@ -67,7 +69,7 @@ public final class TrainNaiveBayesJob ex
addOption(buildOption("trainComplementary", "c", "train complementary?",
false, false, String.valueOf(false)));
addOption("labelIndex", "li", "The path to store the label index in",
false);
addOption(DefaultOptionCreator.overwriteOption().create());
- Map<String,String> parsedArgs = parseArguments(args);
+ Map<String, String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
@@ -77,7 +79,7 @@ public final class TrainNaiveBayesJob ex
}
Path labPath;
String labPathStr = parsedArgs.get("--labelIndex");
- if (labPathStr != null){
+ if (labPathStr != null) {
labPath = new Path(labPathStr);
} else {
labPath = getTempPath("labelIndex");
@@ -90,29 +92,30 @@ public final class TrainNaiveBayesJob ex
HadoopUtil.setSerializations(getConf());
HadoopUtil.cacheFiles(labPath, getConf());
+ //add up all the vectors with the same labels, while mapping the labels
into our index
Job indexInstances = prepareJob(getInputPath(),
getTempPath(SUMMED_OBSERVATIONS), SequenceFileInputFormat.class,
- IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
VectorSumReducer.class, IntWritable.class,
- VectorWritable.class, SequenceFileOutputFormat.class);
+ IndexInstancesMapper.class, IntWritable.class,
VectorWritable.class, VectorSumReducer.class, IntWritable.class,
+ VectorWritable.class, SequenceFileOutputFormat.class);
indexInstances.setCombinerClass(VectorSumReducer.class);
indexInstances.waitForCompletion(true);
-
+ //sum up all the weights from the previous step, per label and per feature
Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS),
getTempPath(WEIGHTS),
- SequenceFileInputFormat.class, WeightsMapper.class, Text.class,
VectorWritable.class, VectorSumReducer.class,
- Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+ SequenceFileInputFormat.class, WeightsMapper.class, Text.class,
VectorWritable.class, VectorSumReducer.class,
+ Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS,
String.valueOf(labelSize));
weightSummer.setCombinerClass(VectorSumReducer.class);
weightSummer.waitForCompletion(true);
-
+ //put the per label and per feature vectors into the cache
HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());
-
+ //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors --
TODO: add reference here to the part of the Rennie paper that discusses this
Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS),
getTempPath(THETAS),
- SequenceFileInputFormat.class, ThetaMapper.class, Text.class,
VectorWritable.class, VectorSumReducer.class,
- Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+ SequenceFileInputFormat.class, ThetaMapper.class, Text.class,
VectorWritable.class, VectorSumReducer.class,
+ Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
thetaSummer.setCombinerClass(VectorSumReducer.class);
thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY,
trainComplementary);
thetaSummer.waitForCompletion(true);
-
+ //validate our model and then write it out to the official output
NaiveBayesModel naiveBayesModel =
BayesUtils.readModelFromDir(getTempPath(), getConf());
naiveBayesModel.validate();
naiveBayesModel.serialize(getOutputPath(), getConf());
@@ -122,12 +125,12 @@ public final class TrainNaiveBayesJob ex
private long createLabelIndex(Map<String, String> parsedArgs, Path labPath)
throws IOException {
long labelSize = 0;
- if (parsedArgs.containsKey("--labels")){
+ if (parsedArgs.containsKey("--labels")) {
Iterable<String> labels =
Splitter.on(",").split(parsedArgs.get("--labels"));
labelSize = BayesUtils.writeLabelIndex(getConf(), labels, labPath);
- } else if (parsedArgs.containsKey("--extractLabels")){
+ } else if (parsedArgs.containsKey("--extractLabels")) {
SequenceFileDirIterable<Text, IntWritable> iterable =
- new SequenceFileDirIterable<Text, IntWritable>(getInputPath(),
PathType.LIST, PathFilters.logsCRCFilter(), getConf());
+ new SequenceFileDirIterable<Text, IntWritable>(getInputPath(),
PathType.LIST, PathFilters.logsCRCFilter(), getConf());
labelSize = BayesUtils.writeLabelIndex(getConf(), labPath, iterable);
}
return labelSize;