TrainNaiveBayesJob.java

gsingers Thu, 03 Nov 2011 16:02:27 -0700

Author: gsingers
Date: Thu Nov  3 23:02:00 2011
New Revision: 1197342

URL: http://svn.apache.org/viewvc?rev=1197342&view=rev
Log:
Put in some comments


Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java?rev=1197342&r1=1197341&r2=1197342&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
 Thu Nov  3 23:02:00 2011
@@ -17,9 +17,6 @@
 
 package org.apache.mahout.classifier.naivebayes.training;
 
-import java.io.IOException;
-import java.util.Map;
-
 import com.google.common.base.Splitter;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
@@ -29,8 +26,8 @@ import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
 import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -40,7 +37,12 @@ import org.apache.mahout.common.iterator
 import org.apache.mahout.common.mapreduce.VectorSumReducer;
 import org.apache.mahout.math.VectorWritable;
 
-/** This class trains a Naive Bayes Classifier (Parameters for both Naive Bayes and Complementary Naive Bayes) */
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * This class trains a Naive Bayes Classifier (Parameters for both Naive Bayes and Complementary Naive Bayes)
+ */
 public final class TrainNaiveBayesJob extends AbstractJob {
 
   public static final String WEIGHTS_PER_FEATURE = "__SPF";
@@ -67,7 +69,7 @@ public final class TrainNaiveBayesJob ex
     addOption(buildOption("trainComplementary", "c", "train complementary?", 
false, false, String.valueOf(false)));
     addOption("labelIndex", "li", "The path to store the label index in", 
false);
     addOption(DefaultOptionCreator.overwriteOption().create());
-    Map<String,String> parsedArgs = parseArguments(args);
+    Map<String, String> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
       return -1;
     }
@@ -77,7 +79,7 @@ public final class TrainNaiveBayesJob ex
     }
     Path labPath;
     String labPathStr = parsedArgs.get("--labelIndex");
-    if (labPathStr != null){
+    if (labPathStr != null) {
       labPath = new Path(labPathStr);
     } else {
       labPath = getTempPath("labelIndex");
@@ -90,29 +92,30 @@ public final class TrainNaiveBayesJob ex
     HadoopUtil.setSerializations(getConf());
     HadoopUtil.cacheFiles(labPath, getConf());
 
+    //add up all the vectors with the same labels, while mapping the labels into our index
     Job indexInstances = prepareJob(getInputPath(), 
getTempPath(SUMMED_OBSERVATIONS), SequenceFileInputFormat.class,
-        IndexInstancesMapper.class, IntWritable.class, VectorWritable.class, 
VectorSumReducer.class, IntWritable.class,
-        VectorWritable.class, SequenceFileOutputFormat.class);
+            IndexInstancesMapper.class, IntWritable.class, 
VectorWritable.class, VectorSumReducer.class, IntWritable.class,
+            VectorWritable.class, SequenceFileOutputFormat.class);
     indexInstances.setCombinerClass(VectorSumReducer.class);
     indexInstances.waitForCompletion(true);
-
+    //sum up all the weights from the previous step, per label and per feature
     Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), 
getTempPath(WEIGHTS),
-        SequenceFileInputFormat.class, WeightsMapper.class, Text.class, 
VectorWritable.class, VectorSumReducer.class,
-        Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+            SequenceFileInputFormat.class, WeightsMapper.class, Text.class, 
VectorWritable.class, VectorSumReducer.class,
+            Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
     weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, 
String.valueOf(labelSize));
     weightSummer.setCombinerClass(VectorSumReducer.class);
     weightSummer.waitForCompletion(true);
-
+    //put the per label and per feature vectors into the cache
     HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());
-
+    //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors -- TODO: add reference here to the part of the Rennie paper that discusses this
     Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), 
getTempPath(THETAS),
-        SequenceFileInputFormat.class, ThetaMapper.class, Text.class, 
VectorWritable.class, VectorSumReducer.class,
-        Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+            SequenceFileInputFormat.class, ThetaMapper.class, Text.class, 
VectorWritable.class, VectorSumReducer.class,
+            Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
     thetaSummer.setCombinerClass(VectorSumReducer.class);
     thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
     thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, 
trainComplementary);
     thetaSummer.waitForCompletion(true);
-
+    //validate our model and then write it out to the official output
     NaiveBayesModel naiveBayesModel = 
BayesUtils.readModelFromDir(getTempPath(), getConf());
     naiveBayesModel.validate();
     naiveBayesModel.serialize(getOutputPath(), getConf());
@@ -122,12 +125,12 @@ public final class TrainNaiveBayesJob ex
 
   private long createLabelIndex(Map<String, String> parsedArgs, Path labPath) 
throws IOException {
     long labelSize = 0;
-    if (parsedArgs.containsKey("--labels")){
+    if (parsedArgs.containsKey("--labels")) {
       Iterable<String> labels = 
Splitter.on(",").split(parsedArgs.get("--labels"));
       labelSize = BayesUtils.writeLabelIndex(getConf(), labels, labPath);
-    } else if (parsedArgs.containsKey("--extractLabels")){
+    } else if (parsedArgs.containsKey("--extractLabels")) {
       SequenceFileDirIterable<Text, IntWritable> iterable =
-          new SequenceFileDirIterable<Text, IntWritable>(getInputPath(), 
PathType.LIST, PathFilters.logsCRCFilter(), getConf());
+              new SequenceFileDirIterable<Text, IntWritable>(getInputPath(), 
PathType.LIST, PathFilters.logsCRCFilter(), getConf());
       labelSize = BayesUtils.writeLabelIndex(getConf(), labPath, iterable);
     }
     return labelSize;

svn commit: r1197342 - /mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java

Reply via email to