Author: jeastman
Date: Tue May 18 17:24:16 2010
New Revision: 945777
URL: http://svn.apache.org/viewvc?rev=945777&view=rev
Log:
MAHOUT-294:
- cleaned up LDADriver by refactoring options to DefaultOptionCreator
- updated lda.props and fkmeans.props default numReducers
- added lda invocation comment to build-reuters.sh
MAHOUT-297:
- removed redundant clone() from Canopy constructor
Modified:
mahout/trunk/conf/fkmeans.props
mahout/trunk/conf/lda.props
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
mahout/trunk/examples/bin/build-reuters.sh
Modified: mahout/trunk/conf/fkmeans.props
URL:
http://svn.apache.org/viewvc/mahout/trunk/conf/fkmeans.props?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
--- mahout/trunk/conf/fkmeans.props (original)
+++ mahout/trunk/conf/fkmeans.props Tue May 18 17:24:16 2010
@@ -10,7 +10,7 @@
#dm|distance = <distance measure class name. Default:
SquaredEuclideanDistanceMeasure>
#cd|convergenceDelta = <the convergence threshold. Default: 0.5>
#u|numMap <the number of mapper tasks to launch. Default: 10>
-#r|numReduce = <the number of reduce tasks to launch. Default: 1>
+#r|numReduce = <the number of reduce tasks to launch. Default: 2>
#cl|clustering = <cluster points if present>
#e|emitMostLikely = <emit most likely cluster if clustering. Default: true>
#t|threshold = <threshold if clustering and not emitMostLikely. Default: 0.0>
Modified: mahout/trunk/conf/lda.props
URL:
http://svn.apache.org/viewvc/mahout/trunk/conf/lda.props?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
--- mahout/trunk/conf/lda.props (original)
+++ mahout/trunk/conf/lda.props Tue May 18 17:24:16 2010
@@ -7,5 +7,5 @@
# The following parameters all have default values if not specified
#a|topicSmoothing = <topic smoothing. Default: 50/numTopics>
#maxIter|maxIter = <maximum number of iterations. Default: -1 (until
converged)>
-#numReducers|numReducers = <the number of reducers. Default: 10>
+#numReducers|numReducers = <the number of reducers. Default: 2>
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
Tue May 18 17:24:16 2010
@@ -49,7 +49,7 @@ public class Canopy extends ClusterBase
*/
public Canopy(Vector point, int canopyId) {
this.setId(canopyId);
- this.setCenter(new RandomAccessSparseVector(point.clone()));
+ this.setCenter(new RandomAccessSparseVector(point));
this.setPointTotal(getCenter().clone());
this.setNumPoints(1);
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
Tue May 18 17:24:16 2010
@@ -43,6 +43,7 @@ import org.apache.mahout.common.CommandL
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.IntPairWritable;
import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.DenseMatrix;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -52,115 +53,71 @@ import org.slf4j.LoggerFactory;
* it outputs a matrix of log probabilities of each topic.
*/
public final class LDADriver {
-
+
static final String STATE_IN_KEY =
"org.apache.mahout.clustering.lda.stateIn";
+
static final String NUM_TOPICS_KEY =
"org.apache.mahout.clustering.lda.numTopics";
+
static final String NUM_WORDS_KEY =
"org.apache.mahout.clustering.lda.numWords";
+
static final String TOPIC_SMOOTHING_KEY =
"org.apache.mahout.clustering.lda.topicSmoothing";
-
+
static final int LOG_LIKELIHOOD_KEY = -2;
+
static final int TOPIC_SUM_KEY = -1;
+
static final double OVERALL_CONVERGENCE = 1.0E-5;
-
+
private static final Logger log = LoggerFactory.getLogger(LDADriver.class);
-
- private LDADriver() {}
-
+
+ private LDADriver() {
+ }
+
public static void main(String[] args) throws ClassNotFoundException,
IOException, InterruptedException {
-
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option inputOpt =
obuilder.withLongName("input").withRequired(true).withArgument(
-
abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Path for input Vectors. Must be a SequenceFile of Writable,
Vector").withShortName("i").create();
-
- Option outputOpt =
obuilder.withLongName("output").withRequired(true).withArgument(
-
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Output Working Directory").withShortName("o").create();
-
- Option overwriteOutput =
obuilder.withLongName("overwrite").withRequired(false).withDescription(
- "If set, overwrite the output directory").withShortName("w").create();
-
- Option topicsOpt =
obuilder.withLongName("numTopics").withRequired(true).withArgument(
-
abuilder.withName("numTopics").withMinimum(1).withMaximum(1).create()).withDescription(
- "The number of topics").withShortName("k").create();
-
- Option wordsOpt =
obuilder.withLongName("numWords").withRequired(true).withArgument(
-
abuilder.withName("numWords").withMinimum(1).withMaximum(1).create()).withDescription(
- "The total number of words in the corpus").withShortName("v").create();
-
- Option topicSmOpt =
obuilder.withLongName("topicSmoothing").withRequired(false).withArgument(
-
abuilder.withName("topicSmoothing").withDefault(-1.0).withMinimum(0).withMaximum(1).create())
- .withDescription("Topic smoothing parameter. Default is
50/numTopics.").withShortName("a").create();
-
- Option maxIterOpt =
obuilder.withLongName("maxIter").withRequired(false).withArgument(
-
abuilder.withName("maxIter").withDefault(-1).withMinimum(0).withMaximum(1).create()).withDescription(
- "Max iterations to run (or until convergence). -1 (default) waits until
convergence.").create();
-
- Option numReducOpt =
obuilder.withLongName("numReducers").withRequired(false).withArgument(
-
abuilder.withName("numReducers").withDefault(10).withMinimum(0).withMaximum(1).create())
- .withDescription("Max iterations to run (or until convergence).
Default 10").create();
-
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out
help").withShortName("h")
- .create();
-
- Group group =
gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
-
topicsOpt).withOption(wordsOpt).withOption(topicSmOpt).withOption(maxIterOpt).withOption(numReducOpt)
- .withOption(overwriteOutput).withOption(helpOpt).create();
+ Option inputOpt = DefaultOptionCreator.inputOption().create();
+ Option outputOpt = DefaultOptionCreator.outputOption().create();
+ Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
+ Option topicsOpt = DefaultOptionCreator.numTopicsOption().create();
+ Option wordsOpt = DefaultOptionCreator.numWordsOption().create();
+ Option topicSmOpt = DefaultOptionCreator.topicSmoothingOption().create();
+ Option maxIterOpt =
DefaultOptionCreator.maxIterationsOption().withRequired(false).create();
+ Option numReducOpt = DefaultOptionCreator.numReducersOption().create();
+ Option helpOpt = DefaultOptionCreator.helpOption();
+
+ Group group = new
GroupBuilder().withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(topicsOpt)
+
.withOption(wordsOpt).withOption(topicSmOpt).withOption(maxIterOpt).withOption(numReducOpt).withOption(overwriteOutput)
+ .withOption(helpOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return;
}
Path input = new Path(cmdLine.getValue(inputOpt).toString());
Path output = new Path(cmdLine.getValue(outputOpt).toString());
-
- int maxIterations = -1;
- if (cmdLine.hasOption(maxIterOpt)) {
- maxIterations =
Integer.parseInt(cmdLine.getValue(maxIterOpt).toString());
- }
-
- int numReduceTasks = 2;
- if (cmdLine.hasOption(numReducOpt)) {
- numReduceTasks =
Integer.parseInt(cmdLine.getValue(numReducOpt).toString());
- }
-
- int numTopics = 20;
- if (cmdLine.hasOption(topicsOpt)) {
- numTopics = Integer.parseInt(cmdLine.getValue(topicsOpt).toString());
- }
-
- int numWords = 20;
- if (cmdLine.hasOption(wordsOpt)) {
- numWords = Integer.parseInt(cmdLine.getValue(wordsOpt).toString());
- }
-
if (cmdLine.hasOption(overwriteOutput)) {
HadoopUtil.overwriteOutput(output);
}
-
- double topicSmoothing = -1.0;
- if (cmdLine.hasOption(topicSmOpt)) {
- topicSmoothing =
Double.parseDouble(cmdLine.getValue(maxIterOpt).toString());
- }
+ int maxIterations =
Integer.parseInt(cmdLine.getValue(maxIterOpt).toString());
+ int numReduceTasks =
Integer.parseInt(cmdLine.getValue(numReducOpt).toString());
+ int numTopics = Integer.parseInt(cmdLine.getValue(topicsOpt).toString());
+ int numWords = Integer.parseInt(cmdLine.getValue(wordsOpt).toString());
+ double topicSmoothing =
Double.parseDouble(cmdLine.getValue(maxIterOpt).toString());
if (topicSmoothing < 1) {
topicSmoothing = 50.0 / numTopics;
}
-
+
runJob(input, output, numTopics, numWords, topicSmoothing,
maxIterations, numReduceTasks);
-
+
} catch (OptionException e) {
log.error("Exception", e);
CommandLineUtil.printHelp(group);
}
}
-
+
/**
* Run the job using supplied arguments
*
@@ -180,50 +137,44 @@ public final class LDADriver {
* the number of Reducers desired
* @throws IOException
*/
- public static void runJob(Path input,
- Path output,
- int numTopics,
- int numWords,
- double topicSmoothing,
- int maxIterations,
- int numReducers) throws IOException,
InterruptedException, ClassNotFoundException {
-
+ public static void runJob(Path input, Path output, int numTopics, int
numWords, double topicSmoothing, int maxIterations,
+ int numReducers) throws IOException, InterruptedException,
ClassNotFoundException {
+
Path stateIn = new Path(output, "state-0");
writeInitialState(stateIn, numTopics, numWords);
double oldLL = Double.NEGATIVE_INFINITY;
boolean converged = false;
-
+
for (int iteration = 0; ((maxIterations < 1) || (iteration <
maxIterations)) && !converged; iteration++) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
Path stateOut = new Path(output, "state-" + (iteration + 1));
double ll = runIteration(input, stateIn, stateOut, numTopics, numWords,
topicSmoothing, numReducers);
double relChange = (oldLL - ll) / oldLL;
-
+
// now point the input to the old output directory
log.info("Iteration {} finished. Log Likelihood: {}", iteration, ll);
log.info("(Old LL: {})", oldLL);
log.info("(Rel Change: {})", relChange);
-
+
converged = (iteration > 2) && (relChange < OVERALL_CONVERGENCE);
stateIn = stateOut;
oldLL = ll;
}
}
-
+
private static void writeInitialState(Path statePath, int numTopics, int
numWords) throws IOException {
Configuration job = new Configuration();
FileSystem fs = statePath.getFileSystem(job);
-
+
DoubleWritable v = new DoubleWritable();
-
+
Random random = RandomUtils.getRandom();
-
+
for (int k = 0; k < numTopics; ++k) {
Path path = new Path(statePath, "part-" + k);
- SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path,
IntPairWritable.class,
- DoubleWritable.class);
-
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path,
IntPairWritable.class, DoubleWritable.class);
+
double total = 0.0; // total number of pseudo counts we made
for (int w = 0; w < numWords; ++w) {
IntPairWritable kw = new IntPairWritable(k, w);
@@ -236,16 +187,16 @@ public final class LDADriver {
IntPairWritable kTsk = new IntPairWritable(k, TOPIC_SUM_KEY);
v.set(Math.log(total));
writer.append(kTsk, v);
-
+
writer.close();
}
}
-
+
private static double findLL(Path statePath, Configuration job) throws
IOException {
FileSystem fs = statePath.getFileSystem(job);
-
+
double ll = 0.0;
-
+
IntPairWritable key = new IntPairWritable();
DoubleWritable value = new DoubleWritable();
for (FileStatus status : fs.globStatus(new Path(statePath, "part-*"))) {
@@ -259,10 +210,10 @@ public final class LDADriver {
}
reader.close();
}
-
+
return ll;
}
-
+
/**
* Run the job using supplied arguments
*
@@ -277,28 +228,21 @@ public final class LDADriver {
* @param numReducers
* the number of Reducers desired
*/
- public static double runIteration(Path input,
- Path stateIn,
- Path stateOut,
- int numTopics,
- int numWords,
- double topicSmoothing,
- int numReducers) throws IOException,
- InterruptedException,
- ClassNotFoundException {
+ public static double runIteration(Path input, Path stateIn, Path stateOut,
int numTopics, int numWords, double topicSmoothing,
+ int numReducers) throws IOException, InterruptedException,
ClassNotFoundException {
Configuration conf = new Configuration();
conf.set(STATE_IN_KEY, stateIn.toString());
conf.set(NUM_TOPICS_KEY, Integer.toString(numTopics));
conf.set(NUM_WORDS_KEY, Integer.toString(numWords));
conf.set(TOPIC_SMOOTHING_KEY, Double.toString(topicSmoothing));
-
+
Job job = new Job(conf);
-
+
job.setOutputKeyClass(IntPairWritable.class);
job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPaths(job, input.toString());
FileOutputFormat.setOutputPath(job, stateOut);
-
+
job.setMapperClass(LDAMapper.class);
job.setReducerClass(LDAReducer.class);
job.setCombinerClass(LDAReducer.class);
@@ -306,24 +250,24 @@ public final class LDADriver {
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setJarByClass(LDADriver.class);
-
+
job.waitForCompletion(true);
return findLL(stateOut, conf);
}
-
+
static LDAState createState(Configuration job) throws IOException {
String statePath = job.get(STATE_IN_KEY);
int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY));
int numWords = Integer.parseInt(job.get(NUM_WORDS_KEY));
double topicSmoothing = Double.parseDouble(job.get(TOPIC_SMOOTHING_KEY));
-
+
Path dir = new Path(statePath);
FileSystem fs = dir.getFileSystem(job);
-
+
DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
double[] logTotals = new double[numTopics];
double ll = 0.0;
-
+
IntPairWritable key = new IntPairWritable();
DoubleWritable value = new DoubleWritable();
for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
@@ -354,7 +298,7 @@ public final class LDADriver {
}
reader.close();
}
-
+
return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals,
ll);
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
Tue May 18 17:24:16 2010
@@ -30,14 +30,14 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for help.
+ * Returns a default command line option for help. Used by all clustering
jobs and many others
* */
public static Option helpOption() {
return new
DefaultOptionBuilder().withLongName("help").withDescription("Print out
help").withShortName("h").create();
}
/**
- * Returns a default command line option for input directory specification.
+ * Returns a default command line option for input directory specification.
Used by all clustering jobs plus others
*/
public static DefaultOptionBuilder inputOption() {
return new
DefaultOptionBuilder().withLongName("input").withRequired(true).withShortName("i").withArgument(
@@ -45,6 +45,9 @@ public final class DefaultOptionCreator
"Path to job input directory. Must be a SequenceFile of
VectorWritable");
}
+ /**
+ * Returns a default command line option for clusters input directory
specification. Used by FuzzyKmeans, Kmeans
+ */
public static DefaultOptionBuilder clustersInOption() {
return new
DefaultOptionBuilder().withLongName("clusters").withRequired(true).withArgument(
new
ArgumentBuilder().withName("clusters").withMinimum(1).withMaximum(1).create()).withDescription(
@@ -52,7 +55,7 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for output directory specification.
+ * Returns a default command line option for output directory specification.
Used by all clustering jobs plus others
*/
public static DefaultOptionBuilder outputOption() {
return new
DefaultOptionBuilder().withLongName("output").withRequired(true).withShortName("o").withArgument(
@@ -61,7 +64,7 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for output directory overwriting
+ * Returns a default command line option for output directory overwriting.
Used by all clustering jobs
*/
public static DefaultOptionBuilder overwriteOption() {
return new
DefaultOptionBuilder().withLongName("overwrite").withRequired(false).withDescription(
@@ -69,15 +72,7 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for clustering specification
- */
- public static DefaultOptionBuilder clusteringOption() {
- return new
DefaultOptionBuilder().withLongName("clustering").withRequired(false).withDescription(
- "If present, run clustering after the iterations have taken
place").withShortName("cl");
- }
-
- /**
- * Returns a default command line option for specification of distance
measure class to use.
+ * Returns a default command line option for specification of distance
measure class to use. Used by Canopy, FuzzyKmeans, Kmeans, MeanShift
*/
public static DefaultOptionBuilder distanceMeasureOption() {
return new
DefaultOptionBuilder().withLongName("distanceMeasure").withRequired(false).withShortName("dm").withArgument(
@@ -85,12 +80,18 @@ public final class DefaultOptionCreator
1).withMaximum(1).create()).withDescription("The classname of the
DistanceMeasure. Default is SquaredEuclidean");
}
+ /**
+ * Returns a default command line option for specification of T1. Used by
Canopy, MeanShift
+ */
public static DefaultOptionBuilder t1Option() {
return new
DefaultOptionBuilder().withLongName("t1").withRequired(true).withArgument(
new
ArgumentBuilder().withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("T1
threshold value")
.withShortName("t1");
}
+ /**
+ * Returns a default command line option for specification of T2. Used by
Canopy, MeanShift
+ */
public static DefaultOptionBuilder t2Option() {
return new
DefaultOptionBuilder().withLongName("t2").withRequired(true).withArgument(
new
ArgumentBuilder().withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("T2
threshold value")
@@ -98,16 +99,17 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for specification of max number of
iterations.
+ * Returns a default command line option for specification of max number of
iterations. Used by Dirichlet, FuzzyKmeans, Kmeans, LDA
*/
public static DefaultOptionBuilder maxIterationsOption() {
+ // default value used by LDA which overrides withRequired(false)
return new
DefaultOptionBuilder().withLongName("maxIter").withRequired(true).withShortName("x").withArgument(
- new
ArgumentBuilder().withName("maxIter").withMinimum(1).withMaximum(1).create()).withDescription(
+ new
ArgumentBuilder().withName("maxIter").withDefault("-1").withMinimum(1).withMaximum(1).create()).withDescription(
"The maximum number of iterations.");
}
/**
- * Returns a default command line option for specification of numbers of
clusters to create.
+ * Returns a default command line option for specification of numbers of
clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
*/
public static DefaultOptionBuilder kOption() {
return new
DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
@@ -116,7 +118,7 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for convergence delta specification.
+ * Returns a default command line option for convergence delta
specification. Used by FuzzyKmeans, Kmeans, MeanShift
*/
public static DefaultOptionBuilder convergenceOption() {
return new
DefaultOptionBuilder().withLongName("convergenceDelta").withRequired(false).withShortName("cd").withArgument(
@@ -125,7 +127,7 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for alpha specification
+ * Returns a default command line option for alpha specification. Used by
Dirichlet
*/
public static DefaultOptionBuilder alphaOption() {
return new
DefaultOptionBuilder().withLongName("alpha").withRequired(false).withShortName("m").withArgument(
@@ -134,7 +136,7 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for model distribution class
specification
+ * Returns a default command line option for model distribution class
specification. Used by Dirichlet
*/
public static DefaultOptionBuilder modelDistributionOption() {
return new
DefaultOptionBuilder().withLongName("modelDistClass").withRequired(false).withShortName("md").withArgument(
@@ -143,7 +145,7 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for model prototype class
specification
+ * Returns a default command line option for model prototype class
specification. Used by Dirichlet
*/
public static DefaultOptionBuilder modelPrototypeOption() {
return new
DefaultOptionBuilder().withLongName("modelPrototypeClass").withRequired(false).withShortName("mp").withArgument(
@@ -153,7 +155,7 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for specifying the number of Mappers
+ * Returns a default command line option for specifying the number of
Mappers. Used by FuzzyKmeans
*/
public static DefaultOptionBuilder numMappersOption() {
return new
DefaultOptionBuilder().withLongName("numMap").withRequired(false).withArgument(
@@ -162,25 +164,34 @@ public final class DefaultOptionCreator
}
/**
- * Returns a default command line option for specifying the max number of
reducers
+ * Returns a default command line option for specifying the max number of
reducers. Used by Dirichlet, FuzzyKmeans, Kmeans and LDA
*/
public static DefaultOptionBuilder numReducersOption() {
return new
DefaultOptionBuilder().withLongName("maxRed").withRequired(false).withShortName("r").withArgument(
- new
ArgumentBuilder().withName("maxRed").withDefault("1").withMinimum(1).withMaximum(1).create()).withDescription(
- "The number of reduce tasks. Defaults to 1");
+ new
ArgumentBuilder().withName("maxRed").withDefault("2").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The number of reduce tasks. Defaults to 2");
+ }
+
+ /**
+ * Returns a default command line option for clustering specification. Used
by all clustering except LDA
+ */
+ public static DefaultOptionBuilder clusteringOption() {
+ return new
DefaultOptionBuilder().withLongName("clustering").withRequired(false).withDescription(
+ "If present, run clustering after the iterations have taken
place").withShortName("cl");
}
/**
- * Returns a default command line option for specifying the emitMostLikely
+ * Returns a default command line option for specifying the emitMostLikely
flag. Used by Dirichlet and FuzzyKmeans
*/
public static DefaultOptionBuilder emitMostLikelyOption() {
return new
DefaultOptionBuilder().withLongName("emitMostLikely").withRequired(false).withShortName("e").withArgument(
- new
ArgumentBuilder().withName("emitMostLikely").withDefault("false").withMinimum(1).withMaximum(1).create())
- .withDescription("True if clustering should emit the most likely point
only, false for threshold clustering");
+ new
ArgumentBuilder().withName("emitMostLikely").withDefault("true").withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "True if clustering should emit the most likely point only, false
for threshold clustering. Default is true");
}
/**
- * Returns a default command line option for specifying the clustering
threshold value
+ * Returns a default command line option for specifying the clustering
threshold value. Used by Dirichlet and FuzzyKmeans
*/
public static DefaultOptionBuilder thresholdOption() {
return new
DefaultOptionBuilder().withLongName("threshold").withRequired(false).withShortName("t").withArgument(
@@ -206,4 +217,31 @@ public final class DefaultOptionCreator
"If present, the input directory already contains MeanShiftCanopies");
}
+ /**
+ * Returns a default command line option for specifying the LDA number of
topics option
+ */
+ public static DefaultOptionBuilder numTopicsOption() {
+ return new
DefaultOptionBuilder().withLongName("numTopics").withRequired(true).withArgument(
+ new
ArgumentBuilder().withName("numTopics").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The total number of topics in the corpus").withShortName("k");
+ }
+
+ /**
+ * Returns a default command line option for specifying the LDA number of
words option
+ */
+ public static DefaultOptionBuilder numWordsOption() {
+ return new
DefaultOptionBuilder().withLongName("numWords").withRequired(true).withArgument(
+ new
ArgumentBuilder().withName("numWords").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The total number of words in the corpus (can be approximate, needs to
exceed the actual value)").withShortName("v");
+ }
+
+ /**
+ * Returns a default command line option for specifying the LDA topic
smoothing option
+ */
+ public static DefaultOptionBuilder topicSmoothingOption() {
+ return new
DefaultOptionBuilder().withLongName("topicSmoothing").withRequired(false).withArgument(
+ new
ArgumentBuilder().withName("topicSmoothing").withDefault(-1.0).withMinimum(0).withMaximum(1).create()).withDescription(
+ "Topic smoothing parameter. Default is
50/numTopics.").withShortName("a");
+ }
+
}
Modified: mahout/trunk/examples/bin/build-reuters.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=945777&r1=945776&r2=945777&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Tue May 18 17:24:16 2010
@@ -40,4 +40,6 @@ cd ../..
./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters
./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o
./examples/bin/work/reuters-out-seqdir -c UTF-8
./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o
./examples/bin/work/reuters-out-seqdir-sparse
-#./bin/mahout kmeans -i
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c
./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -k 20 -w
+#./bin/mahout kmeans -i
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c
./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -k 20 -ow
+#./bin/mahout lda -i
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors -o
./examples/bin/work/reuters-lda -k 20 -v 50000 -ow
+