Author: smarthi
Date: Tue Jun 4 03:40:34 2013
New Revision: 1489281
URL: http://svn.apache.org/r1489281
Log: MAHOUT-1052: Add an option to MinHashDriver that specifies the dimension of vector to hash (indexes or values)
Modified: mahout/trunk/CHANGELOG mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java Modified: mahout/trunk/CHANGELOG URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1489281&r1=1489280&r2=1489281&view=diff ============================================================================== --- mahout/trunk/CHANGELOG (original) +++ mahout/trunk/CHANGELOG Tue Jun 4 03:40:34 2013 @@ -2,6 +2,8 @@ Mahout Change Log Release 0.8 - unreleased + MAHOUT-1052: Add an option to MinHashDriver that specifies the dimension of vector to hash (indexes or values) (Elena Smirnova via smarthi) + __MAHOUT-1237: Total cluster cost isn't computed properly (dfilimon) MAHOUT-1196: LogisticModelParameters uses csv.getTargetCategories() even if csv is not used. 
(Vineet Krishnan via ssc) Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java?rev=1489281&r1=1489280&r2=1489281&view=diff ============================================================================== --- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java (original) +++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java Tue Jun 4 03:40:34 2013 @@ -18,14 +18,11 @@ package org.apache.mahout.clustering.minhash; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.ToolRunner; @@ -34,65 +31,19 @@ import org.apache.mahout.common.HadoopUt import org.apache.mahout.common.commandline.DefaultOptionCreator; import org.apache.mahout.math.VectorWritable; -import java.io.IOException; - public final class MinHashDriver extends AbstractJob { public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new MinHashDriver(), args); - } - - private void runJob(Path input, - Path output, - int minClusterSize, - int minVectorSize, - String hashType, - int numHashFunctions, - int keyGroups, - int numReduceTasks, - boolean debugOutput) throws IOException, ClassNotFoundException, InterruptedException { - Configuration conf = getConf(); - - 
conf.setInt(MinhashOptionCreator.MIN_CLUSTER_SIZE, minClusterSize); - conf.setInt(MinhashOptionCreator.MIN_VECTOR_SIZE, minVectorSize); - conf.set(MinhashOptionCreator.HASH_TYPE, hashType); - conf.setInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, numHashFunctions); - conf.setInt(MinhashOptionCreator.KEY_GROUPS, keyGroups); - conf.setBoolean(MinhashOptionCreator.DEBUG_OUTPUT, debugOutput); - - Class<? extends Writable> outputClass = debugOutput ? VectorWritable.class : Text.class; - Class<? extends OutputFormat> outputFormatClass = - debugOutput ? SequenceFileOutputFormat.class : TextOutputFormat.class; - - Job job = new Job(conf, "MinHash Clustering"); - job.setJarByClass(MinHashDriver.class); - - FileInputFormat.setInputPaths(job, input); - FileOutputFormat.setOutputPath(job, output); - - job.setMapperClass(MinHashMapper.class); - job.setReducerClass(MinHashReducer.class); - - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(outputFormatClass); - - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(outputClass); - - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(outputClass); - - job.setNumReduceTasks(numReduceTasks); - - job.waitForCompletion(true); + ToolRunner.run(new MinHashDriver(), args); } @Override - public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { + public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(MinhashOptionCreator.minClusterSizeOption().create()); addOption(MinhashOptionCreator.minVectorSizeOption().create()); + addOption(MinhashOptionCreator.vectorDimensionToHashOption().create()); addOption(MinhashOptionCreator.hashTypeOption().create()); addOption(MinhashOptionCreator.numHashFunctionsOption().create()); addOption(MinhashOptionCreator.keyGroupsOption().create()); @@ -104,28 +55,41 @@ public final class MinHashDriver extends return -1; } - Path input = getInputPath(); - Path output = 
getOutputPath(); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { - HadoopUtil.delete(getConf(), output); + HadoopUtil.delete(getConf(), getOutputPath()); } + int minClusterSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_CLUSTER_SIZE)); int minVectorSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_VECTOR_SIZE)); + String dimensionToHash = getOption(MinhashOptionCreator.VECTOR_DIMENSION_TO_HASH); String hashType = getOption(MinhashOptionCreator.HASH_TYPE); int numHashFunctions = Integer.valueOf(getOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS)); int keyGroups = Integer.valueOf(getOption(MinhashOptionCreator.KEY_GROUPS)); int numReduceTasks = Integer.parseInt(getOption(MinhashOptionCreator.NUM_REDUCERS)); boolean debugOutput = hasOption(MinhashOptionCreator.DEBUG_OUTPUT); - runJob(input, - output, - minClusterSize, - minVectorSize, - hashType, - numHashFunctions, - keyGroups, - numReduceTasks, - debugOutput); + Class<? extends Writable> outputClass = debugOutput ? VectorWritable.class : Text.class; + Class<? extends OutputFormat> outputFormatClass = + debugOutput ? 
SequenceFileOutputFormat.class : TextOutputFormat.class; + + Job minHash = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, MinHashMapper.class, + Text.class, outputClass, MinHashReducer.class, Text.class, VectorWritable.class, outputFormatClass); + + Configuration minHashConfiguration = minHash.getConfiguration(); + minHashConfiguration.setInt(MinhashOptionCreator.MIN_CLUSTER_SIZE, minClusterSize); + minHashConfiguration.setInt(MinhashOptionCreator.MIN_VECTOR_SIZE, minVectorSize); + minHashConfiguration.set(MinhashOptionCreator.VECTOR_DIMENSION_TO_HASH, dimensionToHash); + minHashConfiguration.set(MinhashOptionCreator.HASH_TYPE, hashType); + minHashConfiguration.setInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, numHashFunctions); + minHashConfiguration.setInt(MinhashOptionCreator.KEY_GROUPS, keyGroups); + minHashConfiguration.setBoolean(MinhashOptionCreator.DEBUG_OUTPUT, debugOutput); + minHash.setNumReduceTasks(numReduceTasks); + + boolean succeeded = minHash.waitForCompletion(true); + if (!succeeded) { + return -1; + } + return 0; } } Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java?rev=1489281&r1=1489280&r2=1489281&view=diff ============================================================================== --- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java (original) +++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java Tue Jun 4 03:40:34 2013 @@ -40,11 +40,13 @@ public class MinHashMapper extends Mappe private boolean debugOutput; private int[] minHashValues; private byte[] bytesToHash; + private String dimensionToHash; @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); + 
this.dimensionToHash = conf.get(MinhashOptionCreator.VECTOR_DIMENSION_TO_HASH, "value"); this.numHashFunctions = conf.getInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, 10); this.minHashValues = new int[numHashFunctions]; this.bytesToHash = new byte[4]; @@ -76,14 +78,14 @@ public class MinHashMapper extends Mappe if (featureVector.size() < minVectorSize) { return; } - // Initialize the minhash values to highest + // Initialize the MinHash values to highest for (int i = 0; i < numHashFunctions; i++) { minHashValues[i] = Integer.MAX_VALUE; } for (int i = 0; i < numHashFunctions; i++) { for (Vector.Element ele : featureVector.nonZeroes()) { - int value = (int) ele.get(); + int value = "value".equalsIgnoreCase(dimensionToHash) ? (int) ele.get() : ele.index(); bytesToHash[0] = (byte) (value >> 24); bytesToHash[1] = (byte) (value >> 16); bytesToHash[2] = (byte) (value >> 8); Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java?rev=1489281&r1=1489280&r2=1489281&view=diff ============================================================================== --- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java (original) +++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java Tue Jun 4 03:40:34 2013 @@ -28,6 +28,7 @@ public final class MinhashOptionCreator public static final String MIN_VECTOR_SIZE = "minVectorSize"; public static final String NUM_REDUCERS = "numReducers"; public static final String DEBUG_OUTPUT = "debugOutput"; + public static final String VECTOR_DIMENSION_TO_HASH = "vectorDimensionToHash"; private MinhashOptionCreator() { } @@ -43,7 +44,7 @@ public final class MinhashOptionCreator return new DefaultOptionBuilder() .withLongName(NUM_REDUCERS) .withRequired(false) - .withShortName("r") + 
.withShortName("nr") .withArgument( new ArgumentBuilder().withName(NUM_REDUCERS).withDefault("2") .withMinimum(1).withMaximum(1).create()) @@ -125,4 +126,20 @@ public final class MinhashOptionCreator .withMinimum(1).withMaximum(1).create()) .withDescription("Number of key groups to be used").withShortName("kg"); } + + /** + * Returns a default command line option for specifying the vector dimension to hash + * in MinHash clustering: Should be one of ("value","index") + */ + public static DefaultOptionBuilder vectorDimensionToHashOption() { + return new DefaultOptionBuilder() + .withLongName(VECTOR_DIMENSION_TO_HASH) + .withRequired(false) + .withArgument( + new ArgumentBuilder().withName(VECTOR_DIMENSION_TO_HASH).withDefault("value") + .withMinimum(1).withMaximum(1).create()) + .withDescription("Dimension of vector to hash. Available types: (value, index). Defaults to 'value' ") + .withShortName("vdh"); + } + } Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java?rev=1489281&r1=1489280&r2=1489281&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java Tue Jun 4 03:40:34 2013 @@ -43,10 +43,10 @@ import java.util.Set; public final class TestMinHashClustering extends MahoutTestCase { - private static final double[][] REFERENCE = { {1, 2, 3, 4, 5}, {2, 1, 3, 6, 7}, {3, 7, 6, 11, 8, 9}, - {4, 7, 8, 9, 6, 1}, {5, 8, 10, 4, 1}, {6, 17, 14, 15}, - {8, 9, 11, 6, 12, 1, 7}, {10, 13, 9, 7, 4, 6, 3}, - {3, 5, 7, 9, 2, 11}, {13, 7, 6, 8, 5}}; + private static final double[][] REFERENCE = { {0, 0, 3, 4, 5}, {0, 0, 3, 6, 7}, {0, 7, 6, 11, 8, 9}, + {0, 7, 8, 9, 
6, 0}, {5, 8, 10, 0, 0}, {6, 17, 14, 15}, + {8, 9, 11, 0, 12, 0, 7}, {10, 13, 9, 7, 0, 6, 0}, + {0, 0, 7, 9, 0, 11}, {13, 7, 6, 8, 0}}; private Path input; private Path output; @@ -77,17 +77,18 @@ public final class TestMinHashClustering writer.append(new Text("Id-" + id++), point); } } finally { - Closeables.closeQuietly(writer); + Closeables.close(writer, false); } } - private String[] makeArguments(int minClusterSize, + private String[] makeArguments(String dimensionToHash, int minClusterSize, int minVectorSize, int numHashFunctions, int keyGroups, String hashType) { return new String[] {optKey(DefaultOptionCreator.INPUT_OPTION), input.toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), + optKey(MinhashOptionCreator.VECTOR_DIMENSION_TO_HASH), dimensionToHash, optKey(MinhashOptionCreator.MIN_CLUSTER_SIZE), String.valueOf(minClusterSize), optKey(MinhashOptionCreator.MIN_VECTOR_SIZE), String.valueOf(minVectorSize), optKey(MinhashOptionCreator.HASH_TYPE), hashType, @@ -97,20 +98,25 @@ public final class TestMinHashClustering optKey(MinhashOptionCreator.DEBUG_OUTPUT)}; } - private static Set<Integer> getValues(Vector vector) { + private static Set<Integer> getValues(Vector vector, String dimensionToHash) { Set<Integer> values = Sets.newHashSet(); - for (Vector.Element e : vector.nonZeroes()) { - values.add((int) e.get()); + if ("value".equalsIgnoreCase(dimensionToHash)) { + for (Vector.Element e : vector.nonZeroes()) + values.add((int) e.get()); + } else { + for (Vector.Element e : vector.nonZeroes()) + values.add(e.index()); } return values; } - - private static void runPairwiseSimilarity(List<Vector> clusteredItems, double simThreshold, String msg) { + + private static void runPairwiseSimilarity(List<Vector> clusteredItems, double simThreshold, + String dimensionToHash, String msg) { if (clusteredItems.size() > 1) { for (int i = 0; i < clusteredItems.size(); i++) { - Set<Integer> itemSet1 = getValues(clusteredItems.get(i)); + Set<Integer> 
itemSet1 = getValues(clusteredItems.get(i), dimensionToHash); for (int j = i + 1; j < clusteredItems.size(); j++) { - Set<Integer> itemSet2 = getValues(clusteredItems.get(j)); + Set<Integer> itemSet2 = getValues(clusteredItems.get(j), dimensionToHash); Collection<Integer> union = Sets.newHashSet(); union.addAll(itemSet1); union.addAll(itemSet2); @@ -125,7 +131,7 @@ public final class TestMinHashClustering } } - private void verify(Path output, double simThreshold, String msg) throws IOException { + private void verify(Path output, double simThreshold, String dimensionToHash, String msg) throws IOException { Configuration conf = getConfiguration(); Path outputFile = new Path(output, "part-r-00000"); List<Vector> clusteredItems = Lists.newArrayList(); @@ -136,45 +142,77 @@ public final class TestMinHashClustering if (prevClusterId.equals(clusterId.toString())) { clusteredItems.add(point.get()); } else { - runPairwiseSimilarity(clusteredItems, simThreshold, msg); + runPairwiseSimilarity(clusteredItems, simThreshold, dimensionToHash, msg); clusteredItems.clear(); prevClusterId = clusterId.toString(); clusteredItems.add(point.get()); } } - runPairwiseSimilarity(clusteredItems, simThreshold, msg); + runPairwiseSimilarity(clusteredItems, simThreshold, dimensionToHash, msg); } @Test public void testLinearMinHashMRJob() throws Exception { - String[] args = makeArguments(2, 3, 20, 3, HashType.LINEAR.toString()); + String[] args = makeArguments("value", 2, 3, 20, 4, HashType.LINEAR.toString()); int ret = ToolRunner.run(getConfiguration(), new MinHashDriver(), args); - assertEquals("Minhash MR Job failed for " + HashType.LINEAR, 0, ret); - verify(output, 0.2, "Hash Type: LINEAR"); + assertEquals("MinHash MR Hash value Job failed for " + HashType.LINEAR, 0, ret); + verify(output, 0.2, "value", "Hash Type: LINEAR"); } @Test public void testPolynomialMinHashMRJob() throws Exception { - String[] args = makeArguments(2, 3, 20, 3, HashType.POLYNOMIAL.toString()); + String[] args = 
makeArguments("value", 2, 3, 20, 3, HashType.POLYNOMIAL.toString()); int ret = ToolRunner.run(getConfiguration(), new MinHashDriver(), args); - assertEquals("Minhash MR Job failed for " + HashType.POLYNOMIAL, 0, ret); - verify(output, 0.27, "Hash Type: POLYNOMIAL"); + assertEquals("MinHash MR Job Hash value failed for " + HashType.POLYNOMIAL, 0, ret); + verify(output, 0.27, "value", "Hash Type: POLYNOMIAL"); } @Test public void testMurmurMinHashMRJob() throws Exception { - String[] args = makeArguments(2, 3, 20, 4, HashType.MURMUR.toString()); + String[] args = makeArguments("value", 2, 3, 20, 4, HashType.MURMUR.toString()); int ret = ToolRunner.run(getConfiguration(), new MinHashDriver(), args); - assertEquals("Minhash MR Job failed for " + HashType.MURMUR, 0, ret); - verify(output, 0.2, "Hash Type: MURMUR"); + assertEquals("MinHash MR Job Hash value failed for " + HashType.MURMUR, 0, ret); + verify(output, 0.2, "value", "Hash Type: MURMUR"); } @Test public void testMurmur3MinHashMRJob() throws Exception { - String[] args = makeArguments(2, 3, 20, 4, HashType.MURMUR3.toString()); + String[] args = makeArguments("value", 2, 3, 20, 4, HashType.MURMUR3.toString()); int ret = ToolRunner.run(getConfiguration(), new MinHashDriver(), args); - assertEquals("Minhash MR Job failed for " + HashType.MURMUR3, 0, ret); - verify(output, 0.2, "Hash Type: MURMUR"); + assertEquals("MinHash MR Job Hash value failed for " + HashType.MURMUR3, 0, ret); + verify(output, 0.2, "value", "Hash Type: MURMUR"); } - + + @Test + public void testLinearMinHashMRJobHashIndex() throws Exception { + String[] args = makeArguments("index", 2, 3, 20, 3, HashType.LINEAR.toString()); + int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args); + assertEquals("MinHash MR Job Hash Index failed for " + HashType.LINEAR, 0, ret); + verify(output, 0.2, "index", "Hash Type: LINEAR"); + } + + @Test + public void testPolynomialMinHashMRJobHashIndex() throws Exception { + String[] args = 
makeArguments("index", 2, 3, 20, 3, HashType.POLYNOMIAL.toString()); + int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args); + assertEquals("MinHash MR Job Hash Index failed for " + HashType.POLYNOMIAL, 0, ret); + verify(output, 0.3, "index", "Hash Type: POLYNOMIAL"); + } + + @Test + public void testMurmurMinHashMRJobHashIndex() throws Exception { + String[] args = makeArguments("index", 2, 3, 20, 4, HashType.MURMUR.toString()); + int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args); + assertEquals("MinHash MR Job Hash Index failed for " + HashType.MURMUR, 0, ret); + verify(output, 0.3, "index", "Hash Type: MURMUR"); + } + + @Test + public void testMurmur3MinHashMRJobHashIndex() throws Exception { + String[] args = makeArguments("index", 2, 3, 20, 4, HashType.MURMUR3.toString()); + int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args); + assertEquals("MinHash MR Job Hash Index failed for " + HashType.MURMUR3, 0, ret); + verify(output, 0.3, "index", "Hash Type: MURMUR"); + } + } \ No newline at end of file