Author: gsingers
Date: Mon Jul 5 21:47:03 2010
New Revision: 960714
URL: http://svn.apache.org/viewvc?rev=960714&view=rev
Log:
Added an option to print out just the size of the vectors, which should be
useful in conjunction with SVD.
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=960714&r1=960713&r2=960714&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
Mon Jul 5 21:47:03 2010
@@ -17,12 +17,6 @@
package org.apache.mahout.utils.vectors;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
@@ -38,79 +32,89 @@ import org.apache.hadoop.io.SequenceFile
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.jobcontrol.Job;
+import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import
org.apache.mahout.utils.vectors.SequenceFileVectorIterable.SeqFileIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
/**
* Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link org.apache.mahout.math.Vector}s and dump
* out the results using {@link org.apache.mahout.math.Vector#asFormatString()} to either the console or to a
* file.
*/
public final class VectorDumper {
-
+
private static final Logger log =
LoggerFactory.getLogger(VectorDumper.class);
-
- private VectorDumper() { }
-
+
+ private VectorDumper() {
+ }
+
public static void main(String[] args) throws IOException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option seqOpt =
obuilder.withLongName("seqFile").withRequired(false).withArgument(
-
abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Sequence File containing the Vectors").withShortName("s").create();
+
abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Sequence File containing the
Vectors").withShortName("s").create();
Option vectorAsKeyOpt =
obuilder.withLongName("useKey").withRequired(false).withDescription(
- "If the Key is a vector, then dump that
instead").withShortName("u").create();
+ "If the Key is a vector, then dump that
instead").withShortName("u").create();
Option printKeyOpt =
obuilder.withLongName("printKey").withRequired(false).withDescription(
- "Print out the key as well, delimited by a tab (or the value if useKey
is true)").withShortName("p")
- .create();
+ "Print out the key as well, delimited by a tab (or the value if
useKey is true)").withShortName("p")
+ .create();
Option outputOpt =
obuilder.withLongName("output").withRequired(false).withArgument(
-
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output file. If not specified, dumps to the
console").withShortName("o").create();
+
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The output file. If not specified, dumps to the
console").withShortName("o").create();
Option dictOpt =
obuilder.withLongName("dictionary").withRequired(false).withArgument(
-
abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
- "The dictionary file. ").withShortName("d").create();
+
abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The dictionary file. ").withShortName("d").create();
Option dictTypeOpt =
obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
-
abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
- "The dictionary file type
(text|sequencefile)").withShortName("dt").create();
+
abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The dictionary file type
(text|sequencefile)").withShortName("dt").create();
Option centroidJSonOpt =
obuilder.withLongName("json").withRequired(false).withDescription(
- "Output the centroid as JSON. Otherwise it substitutes in the terms for
vector cell entries")
- .withShortName("j").create();
+ "Output the centroid as JSON. Otherwise it substitutes in the
terms for vector cell entries")
+ .withShortName("j").create();
+ Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(true).
+ withDescription("Dump only the size of the
vector").withShortName("sz").create();
Option helpOpt = obuilder.withLongName("help").withDescription("Print out
help").withShortName("h")
- .create();
-
+ .create();
+
Group group =
gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
-
dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
- printKeyOpt).create();
-
+
dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
+ printKeyOpt).withOption(sizeOpt).create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
-
+
printHelp(group);
return;
}
-
+
if (cmdLine.hasOption(seqOpt)) {
Path path = new Path(cmdLine.getValue(seqOpt).toString());
//System.out.println("Input Path: " + path); interferes with output?
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
client.setConf(conf);
-
+
FileSystem fs = FileSystem.get(path.toUri(), conf);
-
+
String dictionaryType = "text";
if (cmdLine.hasOption(dictTypeOpt)) {
dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
}
-
+
String[] dictionary = null;
if (cmdLine.hasOption(dictOpt)) {
if ("text".equals(dictionaryType)) {
@@ -122,26 +126,38 @@ public final class VectorDumper {
}
}
boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
-
+ boolean sizeOnly = cmdLine.hasOption(sizeOpt);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
try {
Iterable<Vector> vectorIterable = new
SequenceFileVectorIterable(reader, cmdLine.hasOption(vectorAsKeyOpt));
Writer writer = cmdLine.hasOption(outputOpt)
- ? new FileWriter(cmdLine.getValue(outputOpt).toString())
- : new OutputStreamWriter(System.out);
+ ? new FileWriter(cmdLine.getValue(outputOpt).toString())
+ : new OutputStreamWriter(System.out);
try {
boolean printKey = cmdLine.hasOption(printKeyOpt);
SeqFileIterator iterator = (SeqFileIterator)
vectorIterable.iterator();
- //int i = 0;
+ long i = 0;
while (iterator.hasNext()) {
Vector vector = iterator.next();
if (printKey) {
writer.write(iterator.key().toString());
writer.write("\t");
}
- String fmtStr = useJSON ? vector.asFormatString() :
VectorHelper.vectorToString(vector, dictionary);
- writer.write(fmtStr);
- writer.write('\n');
+ if (sizeOnly == false) {
+ String fmtStr = useJSON ? vector.asFormatString() :
VectorHelper.vectorToString(vector, dictionary);
+ writer.write(fmtStr);
+ writer.write('\n');
+ } else {
+ if (vector instanceof NamedVector){
+ writer.write(((NamedVector)vector).getName());
+ writer.write(":");
+ } else {
+ writer.write(String.valueOf(i++));
+ writer.write(":");
+ }
+ writer.write(String.valueOf(vector.size()));
+ writer.write('\n');
+ }
//i++;
}
//System.out.println("Dumped " + i + " Vectors");
@@ -152,14 +168,14 @@ public final class VectorDumper {
reader.close();
}
}
-
+
} catch (OptionException e) {
log.error("Exception", e);
printHelp(group);
}
-
+
}
-
+
private static void printHelp(Group group) {
HelpFormatter formatter = new HelpFormatter();
formatter.setGroup(group);