MAHOUT-1690: CLONE - Some vector dumper flags are expecting arguments. This closes apache/mahout#122
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/a3f78bde Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/a3f78bde Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/a3f78bde Branch: refs/heads/mahout-0.10.x Commit: a3f78bde9bf87d3d37931f878015b490761e75ce Parents: 6a3f93e Author: Suneel Marthi <[email protected]> Authored: Wed Apr 22 23:04:56 2015 -0400 Committer: Suneel Marthi <[email protected]> Committed: Wed Apr 22 23:04:56 2015 -0400 ---------------------------------------------------------------------- CHANGELOG | 4 +- .../mahout/utils/vectors/VectorDumper.java | 114 ++++++++++--------- 2 files changed, 61 insertions(+), 57 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/a3f78bde/CHANGELOG ---------------------------------------------------------------------- diff --git a/CHANGELOG b/CHANGELOG index 5611588..52799ba 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,9 +2,11 @@ Mahout Change Log Release 0.11.0 - unreleased + MAHOUT-1690: CLONE - Some vector dumper flags are expecting arguments. 
(smarthi) + MAHOUT-1693: FunctionalMatrixView materializes row vectors in scala shell (apalumbo) - MAHOUT-1680: Renamed mahout-distribution to apache-mahout-distribution + MAHOUT-1680: Renamed mahout-distribution to apache-mahout-distribution (sslavic) Release 0.10.0 - 2015-04-11 http://git-wip-us.apache.org/repos/asf/mahout/blob/a3f78bde/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java index 93ad0d5..e1c3fbc 100644 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java +++ b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java @@ -5,9 +5,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -17,14 +17,7 @@ package org.apache.mahout.utils.vectors; -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Set; - +import com.google.common.collect.Sets; import com.google.common.io.Closeables; import com.google.common.io.Files; import org.apache.commons.io.Charsets; @@ -46,6 +39,13 @@ import org.apache.mahout.math.VectorWritable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.File; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.Iterator; +import java.util.Set; + /** * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s and dump * out the results using {@link Vector#asFormatString()} to either the console or to a @@ -55,7 +55,8 @@ public final class VectorDumper extends AbstractJob { private static final Logger log = LoggerFactory.getLogger(VectorDumper.class); - private VectorDumper() {} + private VectorDumper() { + } @Override public int run(String[] args) throws Exception { @@ -84,9 +85,9 @@ public final class VectorDumper extends AbstractJob { addOption("sizeOnly", "sz", "Dump only the size of the vector"); addOption("numItems", "ni", "Output at most <n> vecors", false); addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in" - + " conjunction with -sort", false); - addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter." - + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null)); + + " conjunction with -sort", false); + addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter." 
+ + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null)); if (parseArguments(args, false, true) == null) { return -1; @@ -120,19 +121,22 @@ public final class VectorDumper extends AbstractJob { String[] dictionary = null; if (hasOption("dictionary")) { String dictFile = getOption("dictionary"); - if ("text".equals(dictionaryType)) { - dictionary = VectorHelper.loadTermDictionary(new File(dictFile)); - } else if ("sequencefile".equals(dictionaryType)) { - dictionary = VectorHelper.loadTermDictionary(conf, dictFile); - } else { - //TODO: support Lucene's FST as a dictionary type - throw new IOException("Invalid dictionary type: " + dictionaryType); + switch (dictionaryType) { + case "text": + dictionary = VectorHelper.loadTermDictionary(new File(dictFile)); + break; + case "sequencefile": + dictionary = VectorHelper.loadTermDictionary(conf, dictFile); + break; + default: + //TODO: support Lucene's FST as a dictionary type + throw new IOException("Invalid dictionary type: " + dictionaryType); } } Set<String> filters; if (hasOption("filter")) { - filters = new HashSet<>(getOptions("filter")); + filters = Sets.newHashSet(getOptions("filter")); } else { filters = null; } @@ -175,8 +179,8 @@ public final class VectorDumper extends AbstractJob { } } int maxIndexesPerVector = hasOption("vectorSize") - ? Integer.parseInt(getOption("vectorSize")) - : Integer.MAX_VALUE; + ? Integer.parseInt(getOption("vectorSize")) + : Integer.MAX_VALUE; long itemCount = 0; int fileCount = 0; for (Path path : pathArr) { @@ -201,10 +205,10 @@ public final class VectorDumper extends AbstractJob { Vector vector; try { vector = ((VectorWritable) - (transposeKeyValue ? keyWritable : valueWritable)).get(); + (transposeKeyValue ? keyWritable : valueWritable)).get(); } catch (ClassCastException e) { if ((transposeKeyValue ? 
keyWritable : valueWritable) - instanceof WeightedPropertyVectorWritable) { + instanceof WeightedPropertyVectorWritable) { vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).getVector(); @@ -212,39 +216,37 @@ public final class VectorDumper extends AbstractJob { throw e; } } - if (filters != null - && vector instanceof NamedVector - && !filters.contains(((NamedVector) vector).getName())) { - //we are filtering out this item, skip - continue; - } - if (sizeOnly) { - if (vector instanceof NamedVector) { - writer.write(((NamedVector) vector).getName()); - writer.write(":"); - } else { - writer.write(String.valueOf(i++)); - writer.write(":"); - } - writer.write(String.valueOf(vector.size())); - writer.write('\n'); - } else if (nameOnly) { - if (vector instanceof NamedVector) { - writer.write(((NamedVector) vector).getName()); + if (filters == null + || !(vector instanceof NamedVector) + || filters.contains(((NamedVector) vector).getName())) { + if (sizeOnly) { + if (vector instanceof NamedVector) { + writer.write(((NamedVector) vector).getName()); + writer.write(":"); + } else { + writer.write(String.valueOf(i++)); + writer.write(":"); + } + writer.write(String.valueOf(vector.size())); writer.write('\n'); - } - } else { - String fmtStr; - if (useCSV) { - fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments); + } else if (nameOnly) { + if (vector instanceof NamedVector) { + writer.write(((NamedVector) vector).getName()); + writer.write('\n'); + } } else { - fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, - sortVectors); + String fmtStr; + if (useCSV) { + fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments); + } else { + fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, + sortVectors); + } + writer.write(fmtStr); + writer.write('\n'); } - writer.write(fmtStr); - writer.write('\n'); + itemCount++; } - itemCount++; } } writer.flush();
