Author: gsingers
Date: Mon Jul  5 21:47:03 2010
New Revision: 960714

URL: http://svn.apache.org/viewvc?rev=960714&view=rev
Log:
Added an option to print out just the size of the vectors, which should be 
useful in conjunction with SVD.

Modified:
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=960714&r1=960713&r2=960714&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
 Mon Jul  5 21:47:03 2010
@@ -17,12 +17,6 @@
 
 package org.apache.mahout.utils.vectors;
 
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -38,79 +32,89 @@ import org.apache.hadoop.io.SequenceFile
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.jobcontrol.Job;
+import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.Vector;
 import 
org.apache.mahout.utils.vectors.SequenceFileVectorIterable.SeqFileIterator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
 /**
  * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link 
org.apache.mahout.math.Vector}s and dump
  * out the results using {@link 
org.apache.mahout.math.Vector#asFormatString()} to either the console or to a
  * file.
  */
 public final class VectorDumper {
-  
+
   private static final Logger log = 
LoggerFactory.getLogger(VectorDumper.class);
-  
-  private VectorDumper() { }
-  
+
+  private VectorDumper() {
+  }
+
   public static void main(String[] args) throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-    
+
     Option seqOpt = 
obuilder.withLongName("seqFile").withRequired(false).withArgument(
-      
abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Sequence File containing the Vectors").withShortName("s").create();
+            
abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
+            "The Sequence File containing the 
Vectors").withShortName("s").create();
     Option vectorAsKeyOpt = 
obuilder.withLongName("useKey").withRequired(false).withDescription(
-      "If the Key is a vector, then dump that 
instead").withShortName("u").create();
+            "If the Key is a vector, then dump that 
instead").withShortName("u").create();
     Option printKeyOpt = 
obuilder.withLongName("printKey").withRequired(false).withDescription(
-      "Print out the key as well, delimited by a tab (or the value if useKey 
is true)").withShortName("p")
-        .create();
+            "Print out the key as well, delimited by a tab (or the value if 
useKey is true)").withShortName("p")
+            .create();
     Option outputOpt = 
obuilder.withLongName("output").withRequired(false).withArgument(
-      
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The output file.  If not specified, dumps to the 
console").withShortName("o").create();
+            
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+            "The output file.  If not specified, dumps to the 
console").withShortName("o").create();
     Option dictOpt = 
obuilder.withLongName("dictionary").withRequired(false).withArgument(
-      
abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The dictionary file. ").withShortName("d").create();
+            
abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
+            "The dictionary file. ").withShortName("d").create();
     Option dictTypeOpt = 
obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
-      
abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The dictionary file type 
(text|sequencefile)").withShortName("dt").create();
+            
abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
+            "The dictionary file type 
(text|sequencefile)").withShortName("dt").create();
     Option centroidJSonOpt = 
obuilder.withLongName("json").withRequired(false).withDescription(
-      "Output the centroid as JSON.  Otherwise it substitutes in the terms for 
vector cell entries")
-        .withShortName("j").create();
+            "Output the centroid as JSON.  Otherwise it substitutes in the 
terms for vector cell entries")
+            .withShortName("j").create();
+    Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(true).
+            withDescription("Dump only the size of the 
vector").withShortName("sz").create();
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
-        .create();
-    
+            .create();
+
     Group group = 
gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
-      
dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
-      printKeyOpt).create();
-    
+            
dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
+            printKeyOpt).withOption(sizeOpt).create();
+
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-      
+
       if (cmdLine.hasOption(helpOpt)) {
-        
+
         printHelp(group);
         return;
       }
-      
+
       if (cmdLine.hasOption(seqOpt)) {
         Path path = new Path(cmdLine.getValue(seqOpt).toString());
         //System.out.println("Input Path: " + path); interferes with output?
         JobClient client = new JobClient();
         JobConf conf = new JobConf(Job.class);
         client.setConf(conf);
-        
+
         FileSystem fs = FileSystem.get(path.toUri(), conf);
-        
+
         String dictionaryType = "text";
         if (cmdLine.hasOption(dictTypeOpt)) {
           dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
         }
-        
+
         String[] dictionary = null;
         if (cmdLine.hasOption(dictOpt)) {
           if ("text".equals(dictionaryType)) {
@@ -122,26 +126,38 @@ public final class VectorDumper {
           }
         }
         boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
-        
+        boolean sizeOnly = cmdLine.hasOption(sizeOpt);
         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
         try {
           Iterable<Vector> vectorIterable = new 
SequenceFileVectorIterable(reader, cmdLine.hasOption(vectorAsKeyOpt));
           Writer writer = cmdLine.hasOption(outputOpt)
-              ? new FileWriter(cmdLine.getValue(outputOpt).toString())
-              : new OutputStreamWriter(System.out);
+                  ? new FileWriter(cmdLine.getValue(outputOpt).toString())
+                  : new OutputStreamWriter(System.out);
           try {
             boolean printKey = cmdLine.hasOption(printKeyOpt);
             SeqFileIterator iterator = (SeqFileIterator) 
vectorIterable.iterator();
-            //int i = 0;
+            long i = 0;
             while (iterator.hasNext()) {
               Vector vector = iterator.next();
               if (printKey) {
                 writer.write(iterator.key().toString());
                 writer.write("\t");
               }
-              String fmtStr = useJSON ? vector.asFormatString() : 
VectorHelper.vectorToString(vector, dictionary);
-              writer.write(fmtStr);
-              writer.write('\n');
+              if (sizeOnly == false) {
+                String fmtStr = useJSON ? vector.asFormatString() : 
VectorHelper.vectorToString(vector, dictionary);
+                writer.write(fmtStr);
+                writer.write('\n');
+              } else {
+                if (vector instanceof NamedVector){
+                  writer.write(((NamedVector)vector).getName());
+                  writer.write(":");
+                } else {
+                  writer.write(String.valueOf(i++));
+                  writer.write(":");
+                }
+                writer.write(String.valueOf(vector.size()));
+                writer.write('\n');
+              }
               //i++;
             }
             //System.out.println("Dumped " + i + " Vectors");
@@ -152,14 +168,14 @@ public final class VectorDumper {
           reader.close();
         }
       }
-      
+
     } catch (OptionException e) {
       log.error("Exception", e);
       printHelp(group);
     }
-    
+
   }
-  
+
   private static void printHelp(Group group) {
     HelpFormatter formatter = new HelpFormatter();
     formatter.setGroup(group);


Reply via email to