svn commit: r945777 - in /mahout/trunk: conf/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/lda/ core/src/main/java/org/apache/mahout/common/commandline/ examples/bin/

jeastman Tue, 18 May 2010 10:24:44 -0700

Author: jeastman
Date: Tue May 18 17:24:16 2010
New Revision: 945777

URL: http://svn.apache.org/viewvc?rev=945777&view=rev
Log:
MAHOUT-294: 
- cleaned up LDADriver by refactoring options to DefaultOptionCreator
- updated lda.props and fkmeans.props default numReducers
- added lda invocation comment to build-reuters.sh
MAHOUT-297: 
- removed redundant clone() from Canopy constructor


Modified:
    mahout/trunk/conf/fkmeans.props
    mahout/trunk/conf/lda.props
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
    mahout/trunk/examples/bin/build-reuters.sh

Modified: mahout/trunk/conf/fkmeans.props
URL: 
http://svn.apache.org/viewvc/mahout/trunk/conf/fkmeans.props?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
--- mahout/trunk/conf/fkmeans.props (original)
+++ mahout/trunk/conf/fkmeans.props Tue May 18 17:24:16 2010
@@ -10,7 +10,7 @@
 #dm|distance = <distance measure class name. Default: 
SquaredEuclideanDistanceMeasure>
 #cd|convergenceDelta = <the convergence threshold. Default: 0.5>
 #u|numMap <the number of mapper tasks to launch. Default: 10>
-#r|numReduce = <the number of reduce tasks to launch. Default: 1>
+#r|numReduce = <the number of reduce tasks to launch. Default: 2>
 #cl|clustering = <cluster points if present>
 #e|emitMostLikely = <emit most likely cluster if clustering. Default: true>
 #t|threshold = <threshold if clustering and not emitMostLikely. Default: 0.0>

Modified: mahout/trunk/conf/lda.props
URL: 
http://svn.apache.org/viewvc/mahout/trunk/conf/lda.props?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
--- mahout/trunk/conf/lda.props (original)
+++ mahout/trunk/conf/lda.props Tue May 18 17:24:16 2010
@@ -7,5 +7,5 @@
 # The following parameters all have default values if not specified
 #a|topicSmoothing = <topic smoothing. Default: 50/numTopics>
 #maxIter|maxIter = <maximum number of iterations. Default: -1 (until 
converged)>
-#numReducers|numReducers = <the number of reducers. Default: 10>
+#numReducers|numReducers = <the number of reducers. Default: 2>
 

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java 
(original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java 
Tue May 18 17:24:16 2010
@@ -49,7 +49,7 @@ public class Canopy extends ClusterBase 
    */
   public Canopy(Vector point, int canopyId) {
     this.setId(canopyId);
-    this.setCenter(new RandomAccessSparseVector(point.clone()));
+    this.setCenter(new RandomAccessSparseVector(point));
     this.setPointTotal(getCenter().clone());
     this.setNumPoints(1);
   }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java 
(original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java 
Tue May 18 17:24:16 2010
@@ -43,6 +43,7 @@ import org.apache.mahout.common.CommandL
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.IntPairWritable;
 import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.math.DenseMatrix;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -52,115 +53,71 @@ import org.slf4j.LoggerFactory;
  * it outputs a matrix of log probabilities of each topic.
  */
 public final class LDADriver {
-  
+
   static final String STATE_IN_KEY = 
"org.apache.mahout.clustering.lda.stateIn";
+
   static final String NUM_TOPICS_KEY = 
"org.apache.mahout.clustering.lda.numTopics";
+
   static final String NUM_WORDS_KEY = 
"org.apache.mahout.clustering.lda.numWords";
+
   static final String TOPIC_SMOOTHING_KEY = 
"org.apache.mahout.clustering.lda.topicSmoothing";
-  
+
   static final int LOG_LIKELIHOOD_KEY = -2;
+
   static final int TOPIC_SUM_KEY = -1;
+
   static final double OVERALL_CONVERGENCE = 1.0E-5;
-  
+
   private static final Logger log = LoggerFactory.getLogger(LDADriver.class);
-  
-  private LDADriver() {}
-  
+
+  private LDADriver() {
+  }
+
   public static void main(String[] args) throws ClassNotFoundException, 
IOException, InterruptedException {
-    
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-    
-    Option inputOpt = 
obuilder.withLongName("input").withRequired(true).withArgument(
-      
abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Path for input Vectors. Must be a SequenceFile of Writable, 
Vector").withShortName("i").create();
-    
-    Option outputOpt = 
obuilder.withLongName("output").withRequired(true).withArgument(
-      
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Output Working Directory").withShortName("o").create();
-    
-    Option overwriteOutput = 
obuilder.withLongName("overwrite").withRequired(false).withDescription(
-      "If set, overwrite the output directory").withShortName("w").create();
-    
-    Option topicsOpt = 
obuilder.withLongName("numTopics").withRequired(true).withArgument(
-      
abuilder.withName("numTopics").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The number of topics").withShortName("k").create();
-    
-    Option wordsOpt = 
obuilder.withLongName("numWords").withRequired(true).withArgument(
-      
abuilder.withName("numWords").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The total number of words in the corpus").withShortName("v").create();
-    
-    Option topicSmOpt = 
obuilder.withLongName("topicSmoothing").withRequired(false).withArgument(
-      
abuilder.withName("topicSmoothing").withDefault(-1.0).withMinimum(0).withMaximum(1).create())
-        .withDescription("Topic smoothing parameter. Default is 
50/numTopics.").withShortName("a").create();
-    
-    Option maxIterOpt = 
obuilder.withLongName("maxIter").withRequired(false).withArgument(
-      
abuilder.withName("maxIter").withDefault(-1).withMinimum(0).withMaximum(1).create()).withDescription(
-      "Max iterations to run (or until convergence). -1 (default) waits until 
convergence.").create();
-    
-    Option numReducOpt = 
obuilder.withLongName("numReducers").withRequired(false).withArgument(
-      
abuilder.withName("numReducers").withDefault(10).withMinimum(0).withMaximum(1).create())
-        .withDescription("Max iterations to run (or until convergence). 
Default 10").create();
-    
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
-        .create();
-    
-    Group group = 
gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
-      
topicsOpt).withOption(wordsOpt).withOption(topicSmOpt).withOption(maxIterOpt).withOption(numReducOpt)
-        .withOption(overwriteOutput).withOption(helpOpt).create();
+    Option inputOpt = DefaultOptionCreator.inputOption().create();
+    Option outputOpt = DefaultOptionCreator.outputOption().create();
+    Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
+    Option topicsOpt = DefaultOptionCreator.numTopicsOption().create();
+    Option wordsOpt = DefaultOptionCreator.numWordsOption().create();
+    Option topicSmOpt = DefaultOptionCreator.topicSmoothingOption().create();
+    Option maxIterOpt = 
DefaultOptionCreator.maxIterationsOption().withRequired(false).create();
+    Option numReducOpt = DefaultOptionCreator.numReducersOption().create();
+    Option helpOpt = DefaultOptionCreator.helpOption();
+
+    Group group = new 
GroupBuilder().withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(topicsOpt)
+        
.withOption(wordsOpt).withOption(topicSmOpt).withOption(maxIterOpt).withOption(numReducOpt).withOption(overwriteOutput)
+        .withOption(helpOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-      
+
       if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return;
       }
       Path input = new Path(cmdLine.getValue(inputOpt).toString());
       Path output = new Path(cmdLine.getValue(outputOpt).toString());
-      
-      int maxIterations = -1;
-      if (cmdLine.hasOption(maxIterOpt)) {
-        maxIterations = 
Integer.parseInt(cmdLine.getValue(maxIterOpt).toString());
-      }
-      
-      int numReduceTasks = 2;
-      if (cmdLine.hasOption(numReducOpt)) {
-        numReduceTasks = 
Integer.parseInt(cmdLine.getValue(numReducOpt).toString());
-      }
-      
-      int numTopics = 20;
-      if (cmdLine.hasOption(topicsOpt)) {
-        numTopics = Integer.parseInt(cmdLine.getValue(topicsOpt).toString());
-      }
-      
-      int numWords = 20;
-      if (cmdLine.hasOption(wordsOpt)) {
-        numWords = Integer.parseInt(cmdLine.getValue(wordsOpt).toString());
-      }
-      
       if (cmdLine.hasOption(overwriteOutput)) {
         HadoopUtil.overwriteOutput(output);
       }
-      
-      double topicSmoothing = -1.0;
-      if (cmdLine.hasOption(topicSmOpt)) {
-        topicSmoothing = 
Double.parseDouble(cmdLine.getValue(maxIterOpt).toString());
-      }
+      int maxIterations = 
Integer.parseInt(cmdLine.getValue(maxIterOpt).toString());
+      int numReduceTasks = 
Integer.parseInt(cmdLine.getValue(numReducOpt).toString());
+      int numTopics = Integer.parseInt(cmdLine.getValue(topicsOpt).toString());
+      int numWords = Integer.parseInt(cmdLine.getValue(wordsOpt).toString());
+      double topicSmoothing = 
Double.parseDouble(cmdLine.getValue(maxIterOpt).toString());
       if (topicSmoothing < 1) {
         topicSmoothing = 50.0 / numTopics;
       }
-      
+
       runJob(input, output, numTopics, numWords, topicSmoothing, 
maxIterations, numReduceTasks);
-      
+
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
   }
-  
+
   /**
    * Run the job using supplied arguments
    * 
@@ -180,50 +137,44 @@ public final class LDADriver {
    *          the number of Reducers desired
    * @throws IOException
    */
-  public static void runJob(Path input,
-                            Path output,
-                            int numTopics,
-                            int numWords,
-                            double topicSmoothing,
-                            int maxIterations,
-                            int numReducers) throws IOException, 
InterruptedException, ClassNotFoundException {
-    
+  public static void runJob(Path input, Path output, int numTopics, int 
numWords, double topicSmoothing, int maxIterations,
+      int numReducers) throws IOException, InterruptedException, 
ClassNotFoundException {
+
     Path stateIn = new Path(output, "state-0");
     writeInitialState(stateIn, numTopics, numWords);
     double oldLL = Double.NEGATIVE_INFINITY;
     boolean converged = false;
-    
+
     for (int iteration = 0; ((maxIterations < 1) || (iteration < 
maxIterations)) && !converged; iteration++) {
       log.info("Iteration {}", iteration);
       // point the output to a new directory per iteration
       Path stateOut = new Path(output, "state-" + (iteration + 1));
       double ll = runIteration(input, stateIn, stateOut, numTopics, numWords, 
topicSmoothing, numReducers);
       double relChange = (oldLL - ll) / oldLL;
-      
+
       // now point the input to the old output directory
       log.info("Iteration {} finished. Log Likelihood: {}", iteration, ll);
       log.info("(Old LL: {})", oldLL);
       log.info("(Rel Change: {})", relChange);
-      
+
       converged = (iteration > 2) && (relChange < OVERALL_CONVERGENCE);
       stateIn = stateOut;
       oldLL = ll;
     }
   }
-  
+
   private static void writeInitialState(Path statePath, int numTopics, int 
numWords) throws IOException {
     Configuration job = new Configuration();
     FileSystem fs = statePath.getFileSystem(job);
-    
+
     DoubleWritable v = new DoubleWritable();
-    
+
     Random random = RandomUtils.getRandom();
-    
+
     for (int k = 0; k < numTopics; ++k) {
       Path path = new Path(statePath, "part-" + k);
-      SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, 
IntPairWritable.class,
-          DoubleWritable.class);
-      
+      SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, 
IntPairWritable.class, DoubleWritable.class);
+
       double total = 0.0; // total number of pseudo counts we made
       for (int w = 0; w < numWords; ++w) {
         IntPairWritable kw = new IntPairWritable(k, w);
@@ -236,16 +187,16 @@ public final class LDADriver {
       IntPairWritable kTsk = new IntPairWritable(k, TOPIC_SUM_KEY);
       v.set(Math.log(total));
       writer.append(kTsk, v);
-      
+
       writer.close();
     }
   }
-  
+
   private static double findLL(Path statePath, Configuration job) throws 
IOException {
     FileSystem fs = statePath.getFileSystem(job);
-    
+
     double ll = 0.0;
-    
+
     IntPairWritable key = new IntPairWritable();
     DoubleWritable value = new DoubleWritable();
     for (FileStatus status : fs.globStatus(new Path(statePath, "part-*"))) {
@@ -259,10 +210,10 @@ public final class LDADriver {
       }
       reader.close();
     }
-    
+
     return ll;
   }
-  
+
   /**
    * Run the job using supplied arguments
    * 
@@ -277,28 +228,21 @@ public final class LDADriver {
    * @param numReducers
    *          the number of Reducers desired
    */
-  public static double runIteration(Path input,
-                                    Path stateIn,
-                                    Path stateOut,
-                                    int numTopics,
-                                    int numWords,
-                                    double topicSmoothing,
-                                    int numReducers) throws IOException,
-                                                    InterruptedException,
-                                                    ClassNotFoundException {
+  public static double runIteration(Path input, Path stateIn, Path stateOut, 
int numTopics, int numWords, double topicSmoothing,
+      int numReducers) throws IOException, InterruptedException, 
ClassNotFoundException {
     Configuration conf = new Configuration();
     conf.set(STATE_IN_KEY, stateIn.toString());
     conf.set(NUM_TOPICS_KEY, Integer.toString(numTopics));
     conf.set(NUM_WORDS_KEY, Integer.toString(numWords));
     conf.set(TOPIC_SMOOTHING_KEY, Double.toString(topicSmoothing));
-    
+
     Job job = new Job(conf);
-    
+
     job.setOutputKeyClass(IntPairWritable.class);
     job.setOutputValueClass(DoubleWritable.class);
     FileInputFormat.addInputPaths(job, input.toString());
     FileOutputFormat.setOutputPath(job, stateOut);
-    
+
     job.setMapperClass(LDAMapper.class);
     job.setReducerClass(LDAReducer.class);
     job.setCombinerClass(LDAReducer.class);
@@ -306,24 +250,24 @@ public final class LDADriver {
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
     job.setInputFormatClass(SequenceFileInputFormat.class);
     job.setJarByClass(LDADriver.class);
-    
+
     job.waitForCompletion(true);
     return findLL(stateOut, conf);
   }
-  
+
   static LDAState createState(Configuration job) throws IOException {
     String statePath = job.get(STATE_IN_KEY);
     int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY));
     int numWords = Integer.parseInt(job.get(NUM_WORDS_KEY));
     double topicSmoothing = Double.parseDouble(job.get(TOPIC_SMOOTHING_KEY));
-    
+
     Path dir = new Path(statePath);
     FileSystem fs = dir.getFileSystem(job);
-    
+
     DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
     double[] logTotals = new double[numTopics];
     double ll = 0.0;
-    
+
     IntPairWritable key = new IntPairWritable();
     DoubleWritable value = new DoubleWritable();
     for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
@@ -354,7 +298,7 @@ public final class LDADriver {
       }
       reader.close();
     }
-    
+
     return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals, 
ll);
   }
 }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
 Tue May 18 17:24:16 2010
@@ -30,14 +30,14 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for help.
+   * Returns a default command line option for help. Used by all clustering 
jobs and many others
    * */
   public static Option helpOption() {
     return new 
DefaultOptionBuilder().withLongName("help").withDescription("Print out 
help").withShortName("h").create();
   }
 
   /**
-   * Returns a default command line option for input directory specification.
+   * Returns a default command line option for input directory specification. 
Used by all clustering jobs plus others
    */
   public static DefaultOptionBuilder inputOption() {
     return new 
DefaultOptionBuilder().withLongName("input").withRequired(true).withShortName("i").withArgument(
@@ -45,6 +45,9 @@ public final class DefaultOptionCreator 
         "Path to job input directory. Must be a SequenceFile of 
VectorWritable");
   }
 
+  /**
+   * Returns a default command line option for clusters input directory 
specification. Used by FuzzyKmeans, Kmeans
+   */
   public static DefaultOptionBuilder clustersInOption() {
     return new 
DefaultOptionBuilder().withLongName("clusters").withRequired(true).withArgument(
         new 
ArgumentBuilder().withName("clusters").withMinimum(1).withMaximum(1).create()).withDescription(
@@ -52,7 +55,7 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for output directory specification.
+   * Returns a default command line option for output directory specification. 
Used by all clustering jobs plus others
    */
   public static DefaultOptionBuilder outputOption() {
     return new 
DefaultOptionBuilder().withLongName("output").withRequired(true).withShortName("o").withArgument(
@@ -61,7 +64,7 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for output directory overwriting
+   * Returns a default command line option for output directory overwriting. 
Used by all clustering jobs
    */
   public static DefaultOptionBuilder overwriteOption() {
     return new 
DefaultOptionBuilder().withLongName("overwrite").withRequired(false).withDescription(
@@ -69,15 +72,7 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for clustering specification
-   */
-  public static DefaultOptionBuilder clusteringOption() {
-    return new 
DefaultOptionBuilder().withLongName("clustering").withRequired(false).withDescription(
-        "If present, run clustering after the iterations have taken 
place").withShortName("cl");
-  }
-
-  /**
-   * Returns a default command line option for specification of distance 
measure class to use.
+   * Returns a default command line option for specification of distance 
measure class to use. Used by Canopy, FuzzyKmeans, Kmeans, MeanShift
    */
   public static DefaultOptionBuilder distanceMeasureOption() {
     return new 
DefaultOptionBuilder().withLongName("distanceMeasure").withRequired(false).withShortName("dm").withArgument(
@@ -85,12 +80,18 @@ public final class DefaultOptionCreator 
             1).withMaximum(1).create()).withDescription("The classname of the 
DistanceMeasure. Default is SquaredEuclidean");
   }
 
+  /**
+   * Returns a default command line option for specification of T1. Used by 
Canopy, MeanShift
+   */
   public static DefaultOptionBuilder t1Option() {
     return new 
DefaultOptionBuilder().withLongName("t1").withRequired(true).withArgument(
         new 
ArgumentBuilder().withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("T1
 threshold value")
         .withShortName("t1");
   }
 
+  /**
+   * Returns a default command line option for specification of T2. Used by 
Canopy, MeanShift
+   */
   public static DefaultOptionBuilder t2Option() {
     return new 
DefaultOptionBuilder().withLongName("t2").withRequired(true).withArgument(
         new 
ArgumentBuilder().withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("T2
 threshold value")
@@ -98,16 +99,17 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for specification of max number of 
iterations.
+   * Returns a default command line option for specification of max number of 
iterations. Used by Dirichlet, FuzzyKmeans, Kmeans, LDA
    */
   public static DefaultOptionBuilder maxIterationsOption() {
+    // default value used by LDA which overrides withRequired(false)
     return new 
DefaultOptionBuilder().withLongName("maxIter").withRequired(true).withShortName("x").withArgument(
-        new 
ArgumentBuilder().withName("maxIter").withMinimum(1).withMaximum(1).create()).withDescription(
+        new 
ArgumentBuilder().withName("maxIter").withDefault("-1").withMinimum(1).withMaximum(1).create()).withDescription(
         "The maximum number of iterations.");
   }
 
   /**
-   * Returns a default command line option for specification of numbers of 
clusters to create.
+   * Returns a default command line option for specification of numbers of 
clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
    */
   public static DefaultOptionBuilder kOption() {
     return new 
DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
@@ -116,7 +118,7 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for convergence delta specification.
+   * Returns a default command line option for convergence delta 
specification. Used by FuzzyKmeans, Kmeans, MeanShift
    */
   public static DefaultOptionBuilder convergenceOption() {
     return new 
DefaultOptionBuilder().withLongName("convergenceDelta").withRequired(false).withShortName("cd").withArgument(
@@ -125,7 +127,7 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for alpha specification
+   * Returns a default command line option for alpha specification. Used by 
Dirichlet
    */
   public static DefaultOptionBuilder alphaOption() {
     return new 
DefaultOptionBuilder().withLongName("alpha").withRequired(false).withShortName("m").withArgument(
@@ -134,7 +136,7 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for model distribution class 
specification
+   * Returns a default command line option for model distribution class 
specification. Used by Dirichlet
    */
   public static DefaultOptionBuilder modelDistributionOption() {
     return new 
DefaultOptionBuilder().withLongName("modelDistClass").withRequired(false).withShortName("md").withArgument(
@@ -143,7 +145,7 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for model prototype class 
specification
+   * Returns a default command line option for model prototype class 
specification. Used by Dirichlet
    */
   public static DefaultOptionBuilder modelPrototypeOption() {
     return new 
DefaultOptionBuilder().withLongName("modelPrototypeClass").withRequired(false).withShortName("mp").withArgument(
@@ -153,7 +155,7 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for specifying the number of Mappers
+   * Returns a default command line option for specifying the number of 
Mappers. Used by FuzzyKmeans
    */
   public static DefaultOptionBuilder numMappersOption() {
     return new 
DefaultOptionBuilder().withLongName("numMap").withRequired(false).withArgument(
@@ -162,25 +164,34 @@ public final class DefaultOptionCreator 
   }
 
   /**
-   * Returns a default command line option for specifying the max number of 
reducers
+   * Returns a default command line option for specifying the max number of 
reducers. Used by Dirichlet, FuzzyKmeans, Kmeans and LDA
    */
   public static DefaultOptionBuilder numReducersOption() {
     return new 
DefaultOptionBuilder().withLongName("maxRed").withRequired(false).withShortName("r").withArgument(
-        new 
ArgumentBuilder().withName("maxRed").withDefault("1").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The number of reduce tasks. Defaults to 1");
+        new 
ArgumentBuilder().withName("maxRed").withDefault("2").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The number of reduce tasks. Defaults to 2");
+  }
+
+  /**
+   * Returns a default command line option for clustering specification. Used 
by all clustering except LDA
+   */
+  public static DefaultOptionBuilder clusteringOption() {
+    return new 
DefaultOptionBuilder().withLongName("clustering").withRequired(false).withDescription(
+        "If present, run clustering after the iterations have taken 
place").withShortName("cl");
   }
 
   /**
-   * Returns a default command line option for specifying the emitMostLikely 
+   * Returns a default command line option for specifying the emitMostLikely 
flag. Used by Dirichlet and FuzzyKmeans
    */
   public static DefaultOptionBuilder emitMostLikelyOption() {
     return new 
DefaultOptionBuilder().withLongName("emitMostLikely").withRequired(false).withShortName("e").withArgument(
-        new 
ArgumentBuilder().withName("emitMostLikely").withDefault("false").withMinimum(1).withMaximum(1).create())
-        .withDescription("True if clustering should emit the most likely point 
only, false for threshold clustering");
+        new 
ArgumentBuilder().withName("emitMostLikely").withDefault("true").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+            "True if clustering should emit the most likely point only, false 
for threshold clustering. Default is true");
   }
 
   /**
-   * Returns a default command line option for specifying the clustering 
threshold value
+   * Returns a default command line option for specifying the clustering 
threshold value. Used by Dirichlet and FuzzyKmeans
    */
   public static DefaultOptionBuilder thresholdOption() {
     return new 
DefaultOptionBuilder().withLongName("threshold").withRequired(false).withShortName("t").withArgument(
@@ -206,4 +217,31 @@ public final class DefaultOptionCreator 
         "If present, the input directory already contains MeanShiftCanopies");
   }
 
+  /**
+   * Returns a default command line option for specifying the LDA number of 
topics option
+   */
+  public static DefaultOptionBuilder numTopicsOption() {
+    return new 
DefaultOptionBuilder().withLongName("numTopics").withRequired(true).withArgument(
+        new 
ArgumentBuilder().withName("numTopics").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The total number of topics in the corpus").withShortName("k");
+  }
+
+  /**
+   * Returns a default command line option for specifying the LDA number of 
words option
+   */
+  public static DefaultOptionBuilder numWordsOption() {
+    return new 
DefaultOptionBuilder().withLongName("numWords").withRequired(true).withArgument(
+        new 
ArgumentBuilder().withName("numWords").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The total number of words in the corpus (can be approximate, needs to 
exceed the actual value)").withShortName("v");
+  }
+
+  /**
+   * Returns a default command line option for specifying the LDA topic 
smoothing option
+   */
+  public static DefaultOptionBuilder topicSmoothingOption() {
+    return new 
DefaultOptionBuilder().withLongName("topicSmoothing").withRequired(false).withArgument(
+        new 
ArgumentBuilder().withName("topicSmoothing").withDefault(-1.0).withMinimum(0).withMaximum(1).create()).withDescription(
+        "Topic smoothing parameter. Default is 
50/numTopics.").withShortName("a");
+  }
+
 }

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Tue May 18 17:24:16 2010
@@ -40,4 +40,6 @@ cd ../..
 ./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters 
./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
 ./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o 
./examples/bin/work/reuters-out-seqdir -c UTF-8
 ./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o 
./examples/bin/work/reuters-out-seqdir-sparse
-#./bin/mahout kmeans -i 
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c 
./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -k 20 -w
+#./bin/mahout kmeans -i 
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c 
./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -k 20 -ow
+#./bin/mahout lda -i 
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors -o 
./examples/bin/work/reuters-lda -k 20 -v 50000 -ow
+

svn commit: r945777 - in /mahout/trunk: conf/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/lda/ core/src/main/java/org/apache/mahout/common/commandline/ examples/bin/

Reply via email to