ap...

Jeff Eastman Sun, 23 May 2010 08:46:03 -0700

Tests all run on Java 1.6.0_10

On 5/23/10 8:22 AM, [email protected] wrote:

Author: jeastman
Date: Sun May 23 15:22:28 2010
New Revision: 947427


URL: http://svn.apache.org/viewvc?rev=947427&view=rev
Log:
MAHOUT-294: fixed -k option as optional but added required=true for Dirichlet
MAHOUT-398: added minimal vector renaming to improve clarity
MAHOUT-397: fixes to allow setting -nr in vector output stages

Tests all ran before I installed Java update. Will test on EC2 again today.

Modified:
     
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
     
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
     
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
     mahout/trunk/examples/bin/build-reuters.sh
     
mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
     
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
     
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
     
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
     
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
     
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
 Sun May 23 15:22:28 2010
@@ -83,7 +83,7 @@ public class DirichletDriver {
      Option inputOpt = DefaultOptionCreator.inputOption().create();
      Option outputOpt = DefaultOptionCreator.outputOption().create();
      Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
-    Option kOpt = DefaultOptionCreator.kOption().create();
+    Option kOpt = DefaultOptionCreator.kOption().withRequired(true).create();
      Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
      Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
      Option alphaOpt = DefaultOptionCreator.alphaOption().create();

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java 
(original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java 
Sun May 23 15:22:28 2010
@@ -145,10 +145,10 @@ public final class LDADriver {
      double oldLL = Double.NEGATIVE_INFINITY;
      boolean converged = false;

-    for (int iteration = 0; ((maxIterations<  1) || (iteration<  
maxIterations))&&  !converged; iteration++) {
+    for (int iteration = 1; ((maxIterations<  1) || (iteration<= 
maxIterations))&&  !converged; iteration++) {
        log.info("Iteration {}", iteration);
        // point the output to a new directory per iteration
-      Path stateOut = new Path(output, "state-" + (iteration + 1));
+      Path stateOut = new Path(output, "state-" + iteration);
        double ll = runIteration(input, stateIn, stateOut, numTopics, numWords, 
topicSmoothing, numReducers);
        double relChange = (oldLL - ll) / oldLL;

@@ -157,7 +157,7 @@ public final class LDADriver {
        log.info("(Old LL: {})", oldLL);
        log.info("(Rel Change: {})", relChange);

-      converged = (iteration>  2)&&  (relChange<  OVERALL_CONVERGENCE);
+      converged = (iteration>  3)&&  (relChange<  OVERALL_CONVERGENCE);
        stateIn = stateOut;
        oldLL = ll;
      }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
 Sun May 23 15:22:28 2010
@@ -112,7 +112,7 @@ public final class DefaultOptionCreator
     * Returns a default command line option for specification of numbers of 
clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
     */
    public static DefaultOptionBuilder kOption() {
-    return new 
DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
+    return new 
DefaultOptionBuilder().withLongName("k").withRequired(false).withArgument(
          new 
ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
          "The number of clusters to create").withShortName("k");
    }

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Sun May 23 15:22:28 2010
@@ -38,14 +38,15 @@ fi

  cd ../..
  ./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters 
./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
-./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o 
./examples/bin/work/reuters-out-seqdir -c UTF-8
-./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o 
./examples/bin/work/reuters-out-seqdir-sparse
+./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o 
./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5

-# to use k-Means clustering, uncomment the next two lines
-#./bin/mahout kmeans -i 
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c 
./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 
-ow
+# to use k-Means clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o 
./examples/bin/work/reuters-out-seqdir-sparse
+#./bin/mahout kmeans -i 
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/tfidf-vectors/ -c 
./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 
-ow
  #./bin/mahout clusterdump -s examples/bin/work/reuters-kmeans/clusters-10 -d 
examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile 
-b 100 -n 20

-# to use LDA clustering, uncomment the next two lines
-#./bin/mahout lda -i 
./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors -o 
./examples/bin/work/reuters-lda -k 20 -v 50000 -ow
-#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-9 -d 
./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
+# to use LDA clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o 
./examples/bin/work/reuters-out-seqdir-sparse -wt tf -seq -nr 3
+#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors 
-o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20
+#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-20 -d 
./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile


Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
 Sun May 23 15:22:28 2010
@@ -79,9 +79,7 @@ public final class SequenceFilesFromDire
      private final FileSystem fs;

      public ChunkedWriter(int chunkSizeInMB, String outputDir) throws 
IOException {
-      if (chunkSizeInMB<  64) {
-        chunkSizeInMB = 64;
-      } else if (chunkSizeInMB>  1984) {
+      if (chunkSizeInMB>  1984) {
          chunkSizeInMB = 1984;
        }
        maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
 Sun May 23 15:22:28 2010
@@ -101,14 +101,14 @@ public final class SparseVectorsFromSequ
        abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
          .withDescription(
            "(Optional) The maximum size of ngrams to create"
-              + " (2 = bigrams, 3 = trigrams, etc) Default 
Value:2").withShortName("ng").create();
+              + " (2 = bigrams, 3 = trigrams, etc) Default 
Value:1").withShortName("ng").create();
      Option sequentialAccessVectorOpt = 
obuilder.withLongName("sequentialAccessVector").withRequired(false)
          .withDescription(
-          "(Optional) Whether output vectors should be SequentialAccessVectors If 
set true else false")
+          "(Optional) Whether output vectors should be SequentialAccessVectors. If 
set true else false")
          .withShortName("seq").create();

      Option overwriteOutput = 
obuilder.withLongName("overwrite").withRequired(false).withDescription(
-      "If set, overwrite the output directory").withShortName("w").create();
+      "If set, overwrite the output directory").withShortName("ow").create();
      Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
          .create();

@@ -165,7 +165,7 @@ public final class SparseVectorsFromSequ
        if (cmdLine.hasOption(numReduceTasksOpt)) {
          reduceTasks = 
Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
-      log.info("Pass1 reduce tasks: {}", reduceTasks);
+      log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer>  analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
@@ -224,7 +224,7 @@ public final class SparseVectorsFromSequ
          TFIDFConverter.processTfIdf(
            new Path(outputDir, 
DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
            new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, 
minDf, maxDFPercent, norm,
-          sequentialAccessOutput);
+          sequentialAccessOutput, reduceTasks);
        }
      } catch (OptionException e) {
        log.error("Exception", e);

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
 Sun May 23 15:22:28 2010
@@ -69,13 +69,16 @@ public final class PartialVectorMerger {
     *          output directory were the partial vectors have to be created
     * @param normPower
     *          The normalization value. Must be greater than or equal to 0 or 
equal to {@link #NO_NORMALIZING}
+   * @param numReducers
+   *          The number of reducers to spawn
     * @throws IOException
     */
    public static void mergePartialVectors(List<Path>  partialVectorPaths,
                                           Path output,
                                           float normPower,
                                           int dimension,
-                                         boolean sequentialAccess) throws 
IOException {
+                                         boolean sequentialAccess,
+                                         int numReducers) throws IOException {
      if (normPower != NO_NORMALIZING&&  normPower<  0) {
        throw new IllegalArgumentException("normPower must either be -1 or>= 
0");
      }
@@ -101,6 +104,7 @@ public final class PartialVectorMerger {
      conf.setInputFormat(SequenceFileInputFormat.class);
      conf.setReducerClass(PartialVectorMergeReducer.class);
      conf.setOutputFormat(SequenceFileOutputFormat.class);
+    conf.setNumReduceTasks(numReducers);

      HadoopUtil.overwriteOutput(output);


Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
 Sun May 23 15:22:28 2010
@@ -59,7 +59,7 @@ import org.apache.mahout.utils.vectors.t
   */
  public final class DictionaryVectorizer {

-  public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+  public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";

    public static final String MIN_SUPPORT = "min.support";

@@ -153,7 +153,7 @@ public final class DictionaryVectorizer
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + 
partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, maxNGramSize, dictionaryChunk, 
partialVectorOutputPath,
-        maxTermDimension[0], sequentialAccess);
+        maxTermDimension[0], sequentialAccess, numReducers);
      }

      Configuration conf = new Configuration();
@@ -162,7 +162,7 @@ public final class DictionaryVectorizer
      Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
      if (dictionaryChunks.size()>  1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, 
-1, maxTermDimension[0],
-        sequentialAccess);
+        sequentialAccess, numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
      } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -245,6 +245,8 @@ public final class DictionaryVectorizer
     *          location of the chunk of features and the id's
     * @param output
     *          output directory were the partial vectors have to be created
+   * @param numReducers
+   *          the desired number of reducer tasks
     * @throws IOException
     */
    private static void makePartialVectors(Path input,
@@ -252,7 +254,8 @@ public final class DictionaryVectorizer
                                           Path dictionaryFilePath,
                                           Path output,
                                           int dimension,
-                                         boolean sequentialAccess) throws 
IOException {
+                                         boolean sequentialAccess,
+                                         int numReducers) throws IOException {

      Configurable client = new JobClient();
      JobConf conf = new JobConf(DictionaryVectorizer.class);
@@ -279,6 +282,7 @@ public final class DictionaryVectorizer
      conf.setInputFormat(SequenceFileInputFormat.class);
      conf.setReducerClass(TFPartialVectorReducer.class);
      conf.setOutputFormat(SequenceFileOutputFormat.class);
+    conf.setNumReduceTasks(numReducers);

      HadoopUtil.overwriteOutput(output);


Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
 Sun May 23 15:22:28 2010
@@ -66,7 +66,7 @@ public final class TFIDFConverter {

    public static final String TFIDF_OUTPUT_FOLDER = "tfidf";

-  private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+  private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";

    private static final String FREQUENCY_FILE = "frequency.file-";

@@ -99,17 +99,21 @@ public final class TFIDFConverter {
     * @param output
     *          output directory where {@link 
org.apache.mahout.math.RandomAccessSparseVector}'s of the document
     *          are generated
-   * @param minDf
-   *          The minimum document frequency. Default 1
-   * @param maxDFPercent
-   *          The max percentage of vectors for the DF. Can be used to remove 
really high frequency features.
-   *          Expressed as an integer between 0 and 100. Default 99
     * @param chunkSizeInMegabytes
     *          the size in MB of the feature =>  id chunk to be kept in memory 
at each node during Map/Reduce
     *          stage. Its recommended you calculated this based on the number 
of cores and the free memory
     *          available to you per node. Say, you have 2 cores and around 1GB 
extra memory to spare we
     *          recommend you use a split size of around 400-500MB so that two 
simultaneous reducers can create
     *          partial vectors without thrashing the system due to increased 
swapping
+   * @param minDf
+   *          The minimum document frequency. Default 1
+   * @param maxDFPercent
+   *          The max percentage of vectors for the DF. Can be used to remove 
really high frequency features.
+   *          Expressed as an integer between 0 and 100. Default 99
+   * @param numReducers
+   *          The number of reducers to spawn. This also affects the possible 
parallelism since each reducer
+   *          will typically produce a single output file containing tf-idf 
vectors for a subset of the
+   *          documents in the corpus.
     * @throws IOException
     */
    public static void processTfIdf(Path input,
@@ -118,7 +122,8 @@ public final class TFIDFConverter {
                                    int minDf,
                                    int maxDFPercent,
                                    float normPower,
-                                  boolean sequentialAccessOutput) throws 
IOException {
+                                  boolean sequentialAccessOutput,
+                                  int numReducers) throws IOException {
      if (chunkSizeInMegabytes<  MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
      } else if (chunkSizeInMegabytes>  MAX_CHUNKSIZE) { // 10GB
@@ -158,7 +163,7 @@ public final class TFIDFConverter {
      Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
      if (dictionaryChunks.size()>  1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, 
normPower,
-        datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput);
+        datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput, 
numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
      } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
 Sun May 23 15:22:28 2010
@@ -119,7 +119,7 @@ public class DictionaryVectorizerTest ex
      
DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
        getTestTempDirPath("output/wordcount"), 2, 1, 0.0f, 1, 100, false);
      
TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/vectors"),
-                                getTestTempDirPath("output/tfidf"), 100, 1, 
99, 1.0f, false);
+                                getTestTempDirPath("output/tfidf"), 100, 1, 
99, 1.0f, false, 1);

    }
  }

Re: svn commit: r947427 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/dirichlet/ core/src/main/java/org/apache/mahout/clustering/lda/ core/src/main/java/org/apache/mahout/common/commandline/ examples/bin/ examples/src/main/java/org/ap...

Reply via email to