Author: jeastman
Date: Sun May 23 15:22:28 2010
New Revision: 947427
URL: http://svn.apache.org/viewvc?rev=947427&view=rev
Log:
MAHOUT-294: made the -k option optional by default, but set required=true for Dirichlet
MAHOUT-398: added minimal vector renaming to improve clarity
MAHOUT-397: fixes to allow setting -nr in vector output stages
All tests ran before I installed the Java update. Will test on EC2 again today.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
mahout/trunk/examples/bin/build-reuters.sh
mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
Sun May 23 15:22:28 2010
@@ -83,7 +83,7 @@ public class DirichletDriver {
Option inputOpt = DefaultOptionCreator.inputOption().create();
Option outputOpt = DefaultOptionCreator.outputOption().create();
Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
- Option kOpt = DefaultOptionCreator.kOption().create();
+ Option kOpt = DefaultOptionCreator.kOption().withRequired(true).create();
Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
Option alphaOpt = DefaultOptionCreator.alphaOption().create();
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
Sun May 23 15:22:28 2010
@@ -145,10 +145,10 @@ public final class LDADriver {
double oldLL = Double.NEGATIVE_INFINITY;
boolean converged = false;
- for (int iteration = 0; ((maxIterations< 1) || (iteration<
maxIterations))&& !converged; iteration++) {
+ for (int iteration = 1; ((maxIterations< 1) || (iteration<=
maxIterations))&& !converged; iteration++) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
- Path stateOut = new Path(output, "state-" + (iteration + 1));
+ Path stateOut = new Path(output, "state-" + iteration);
double ll = runIteration(input, stateIn, stateOut, numTopics, numWords,
topicSmoothing, numReducers);
double relChange = (oldLL - ll) / oldLL;
@@ -157,7 +157,7 @@ public final class LDADriver {
log.info("(Old LL: {})", oldLL);
log.info("(Rel Change: {})", relChange);
- converged = (iteration> 2)&& (relChange< OVERALL_CONVERGENCE);
+ converged = (iteration> 3)&& (relChange< OVERALL_CONVERGENCE);
stateIn = stateOut;
oldLL = ll;
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
Sun May 23 15:22:28 2010
@@ -112,7 +112,7 @@ public final class DefaultOptionCreator
* Returns a default command line option for specification of numbers of
clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
*/
public static DefaultOptionBuilder kOption() {
- return new
DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
+ return new
DefaultOptionBuilder().withLongName("k").withRequired(false).withArgument(
new
ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
"The number of clusters to create").withShortName("k");
}
Modified: mahout/trunk/examples/bin/build-reuters.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Sun May 23 15:22:28 2010
@@ -38,14 +38,15 @@ fi
cd ../..
./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters
./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
-./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o
./examples/bin/work/reuters-out-seqdir -c UTF-8
-./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o
./examples/bin/work/reuters-out-seqdir-sparse
+./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o
./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5
-# to use k-Means clustering, uncomment the next two lines
-#./bin/mahout kmeans -i
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c
./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20
-ow
+# to use k-Means clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o
./examples/bin/work/reuters-out-seqdir-sparse
+#./bin/mahout kmeans -i
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/tfidf-vectors/ -c
./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20
-ow
#./bin/mahout clusterdump -s examples/bin/work/reuters-kmeans/clusters-10 -d
examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
-b 100 -n 20
-# to use LDA clustering, uncomment the next two lines
-#./bin/mahout lda -i
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors -o
./examples/bin/work/reuters-lda -k 20 -v 50000 -ow
-#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-9 -d
./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
+# to use LDA clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o
./examples/bin/work/reuters-out-seqdir-sparse -wt tf -seq -nr 3
+#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors
-o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20
+#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-20 -d
./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
Sun May 23 15:22:28 2010
@@ -79,9 +79,7 @@ public final class SequenceFilesFromDire
private final FileSystem fs;
public ChunkedWriter(int chunkSizeInMB, String outputDir) throws
IOException {
- if (chunkSizeInMB< 64) {
- chunkSizeInMB = 64;
- } else if (chunkSizeInMB> 1984) {
+ if (chunkSizeInMB> 1984) {
chunkSizeInMB = 1984;
}
maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
Sun May 23 15:22:28 2010
@@ -101,14 +101,14 @@ public final class SparseVectorsFromSequ
abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
.withDescription(
"(Optional) The maximum size of ngrams to create"
- + " (2 = bigrams, 3 = trigrams, etc) Default
Value:2").withShortName("ng").create();
+ + " (2 = bigrams, 3 = trigrams, etc) Default
Value:1").withShortName("ng").create();
Option sequentialAccessVectorOpt =
obuilder.withLongName("sequentialAccessVector").withRequired(false)
.withDescription(
- "(Optional) Whether output vectors should be SequentialAccessVectors If
set true else false")
+ "(Optional) Whether output vectors should be SequentialAccessVectors. If
set true else false")
.withShortName("seq").create();
Option overwriteOutput =
obuilder.withLongName("overwrite").withRequired(false).withDescription(
- "If set, overwrite the output directory").withShortName("w").create();
+ "If set, overwrite the output directory").withShortName("ow").create();
Option helpOpt = obuilder.withLongName("help").withDescription("Print out
help").withShortName("h")
.create();
@@ -165,7 +165,7 @@ public final class SparseVectorsFromSequ
if (cmdLine.hasOption(numReduceTasksOpt)) {
reduceTasks =
Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
}
- log.info("Pass1 reduce tasks: {}", reduceTasks);
+ log.info("Number of reduce tasks: {}", reduceTasks);
Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
if (cmdLine.hasOption(analyzerNameOpt)) {
@@ -224,7 +224,7 @@ public final class SparseVectorsFromSequ
TFIDFConverter.processTfIdf(
new Path(outputDir,
DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize,
minDf, maxDFPercent, norm,
- sequentialAccessOutput);
+ sequentialAccessOutput, reduceTasks);
}
} catch (OptionException e) {
log.error("Exception", e);
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
Sun May 23 15:22:28 2010
@@ -69,13 +69,16 @@ public final class PartialVectorMerger {
* output directory were the partial vectors have to be created
* @param normPower
* The normalization value. Must be greater than or equal to 0 or
equal to {@link #NO_NORMALIZING}
+ * @param numReducers
+ * The number of reducers to spawn
* @throws IOException
*/
public static void mergePartialVectors(List<Path> partialVectorPaths,
Path output,
float normPower,
int dimension,
- boolean sequentialAccess) throws
IOException {
+ boolean sequentialAccess,
+ int numReducers) throws IOException {
if (normPower != NO_NORMALIZING&& normPower< 0) {
throw new IllegalArgumentException("normPower must either be -1 or>=
0");
}
@@ -101,6 +104,7 @@ public final class PartialVectorMerger {
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setReducerClass(PartialVectorMergeReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
+ conf.setNumReduceTasks(numReducers);
HadoopUtil.overwriteOutput(output);
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
Sun May 23 15:22:28 2010
@@ -59,7 +59,7 @@ import org.apache.mahout.utils.vectors.t
*/
public final class DictionaryVectorizer {
- public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+ public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
public static final String MIN_SUPPORT = "min.support";
@@ -153,7 +153,7 @@ public final class DictionaryVectorizer
Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER +
partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
makePartialVectors(input, maxNGramSize, dictionaryChunk,
partialVectorOutputPath,
- maxTermDimension[0], sequentialAccess);
+ maxTermDimension[0], sequentialAccess, numReducers);
}
Configuration conf = new Configuration();
@@ -162,7 +162,7 @@ public final class DictionaryVectorizer
Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
if (dictionaryChunks.size()> 1) {
PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir,
-1, maxTermDimension[0],
- sequentialAccess);
+ sequentialAccess, numReducers);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -245,6 +245,8 @@ public final class DictionaryVectorizer
* location of the chunk of features and the id's
* @param output
* output directory were the partial vectors have to be created
+ * @param numReducers
+ * the desired number of reducer tasks
* @throws IOException
*/
private static void makePartialVectors(Path input,
@@ -252,7 +254,8 @@ public final class DictionaryVectorizer
Path dictionaryFilePath,
Path output,
int dimension,
- boolean sequentialAccess) throws
IOException {
+ boolean sequentialAccess,
+ int numReducers) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(DictionaryVectorizer.class);
@@ -279,6 +282,7 @@ public final class DictionaryVectorizer
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setReducerClass(TFPartialVectorReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
+ conf.setNumReduceTasks(numReducers);
HadoopUtil.overwriteOutput(output);
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
Sun May 23 15:22:28 2010
@@ -66,7 +66,7 @@ public final class TFIDFConverter {
public static final String TFIDF_OUTPUT_FOLDER = "tfidf";
- private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+ private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
private static final String FREQUENCY_FILE = "frequency.file-";
@@ -99,17 +99,21 @@ public final class TFIDFConverter {
* @param output
* output directory where {@link
org.apache.mahout.math.RandomAccessSparseVector}'s of the document
* are generated
- * @param minDf
- * The minimum document frequency. Default 1
- * @param maxDFPercent
- * The max percentage of vectors for the DF. Can be used to remove
really high frequency features.
- * Expressed as an integer between 0 and 100. Default 99
* @param chunkSizeInMegabytes
* the size in MB of the feature => id chunk to be kept in memory
at each node during Map/Reduce
* stage. Its recommended you calculated this based on the number
of cores and the free memory
* available to you per node. Say, you have 2 cores and around 1GB
extra memory to spare we
* recommend you use a split size of around 400-500MB so that two
simultaneous reducers can create
* partial vectors without thrashing the system due to increased
swapping
+ * @param minDf
+ * The minimum document frequency. Default 1
+ * @param maxDFPercent
+ * The max percentage of vectors for the DF. Can be used to remove
really high frequency features.
+ * Expressed as an integer between 0 and 100. Default 99
+ * @param numReducers
+ * The number of reducers to spawn. This also affects the possible
parallelism since each reducer
+ * will typically produce a single output file containing tf-idf
vectors for a subset of the
+ * documents in the corpus.
* @throws IOException
*/
public static void processTfIdf(Path input,
@@ -118,7 +122,8 @@ public final class TFIDFConverter {
int minDf,
int maxDFPercent,
float normPower,
- boolean sequentialAccessOutput) throws
IOException {
+ boolean sequentialAccessOutput,
+ int numReducers) throws IOException {
if (chunkSizeInMegabytes< MIN_CHUNKSIZE) {
chunkSizeInMegabytes = MIN_CHUNKSIZE;
} else if (chunkSizeInMegabytes> MAX_CHUNKSIZE) { // 10GB
@@ -158,7 +163,7 @@ public final class TFIDFConverter {
Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
if (dictionaryChunks.size()> 1) {
PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir,
normPower,
- datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput);
+ datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput,
numReducers);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Sun May 23 15:22:28 2010
@@ -119,7 +119,7 @@ public class DictionaryVectorizerTest ex
DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
getTestTempDirPath("output/wordcount"), 2, 1, 0.0f, 1, 100, false);
TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/vectors"),
- getTestTempDirPath("output/tfidf"), 100, 1,
99, 1.0f, false);
+ getTestTempDirPath("output/tfidf"), 100, 1,
99, 1.0f, false, 1);
}
}