Author: smarthi
Date: Tue Jun  4 03:40:34 2013
New Revision: 1489281

URL: http://svn.apache.org/r1489281
Log:
MAHOUT-1052: Add an option to MinHashDriver that specifies the dimension of vector to hash (indexes or values)
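
For reference, a minimal usage sketch (not part of this commit) showing the new option being passed to the driver programmatically, the same way the tests below invoke it via ToolRunner. The input/output paths are placeholders, and the string values of the HASH_TYPE, NUM_HASH_FUNCTIONS and KEY_GROUPS options are assumed to be "hashType", "numHashFunctions" and "keyGroups":

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;
    import org.apache.mahout.clustering.minhash.MinHashDriver;

    public class MinHashDriverExample {
      public static void main(String[] args) throws Exception {
        String[] driverArgs = {
            "--input", "/path/to/sequencefile/vectors",   // placeholder path
            "--output", "/path/to/minhash/output",        // placeholder path
            "--vectorDimensionToHash", "index",           // new option: "value" (default) or "index"
            "--minClusterSize", "2",
            "--minVectorSize", "3",
            "--hashType", "MURMUR",
            "--numHashFunctions", "20",
            "--keyGroups", "4",
            "--numReducers", "1"
        };
        int exitCode = ToolRunner.run(new Configuration(), new MinHashDriver(), driverArgs);
        System.out.println("MinHash driver exit code: " + exitCode);
      }
    }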

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1489281&r1=1489280&r2=1489281&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Tue Jun  4 03:40:34 2013
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.8 - unreleased
 
+  MAHOUT-1052: Add an option to MinHashDriver that specifies the dimension of vector to hash (indexes or values) (Elena Smirnova via smarthi)
+
   MAHOUT-1237: Total cluster cost isn't computed properly (dfilimon)
 
   MAHOUT-1196: LogisticModelParameters uses csv.getTargetCategories() even if csv is not used. (Vineet Krishnan via ssc)

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java?rev=1489281&r1=1489280&r2=1489281&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashDriver.java Tue Jun  4 03:40:34 2013
@@ -18,14 +18,11 @@
 package org.apache.mahout.clustering.minhash;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.apache.hadoop.util.ToolRunner;
@@ -34,65 +31,19 @@ import org.apache.mahout.common.HadoopUt
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.math.VectorWritable;
 
-import java.io.IOException;
-
 public final class MinHashDriver extends AbstractJob {
 
   public static void main(String[] args) throws Exception {
-    ToolRunner.run(new Configuration(), new MinHashDriver(), args);
-  }
-
-  private void runJob(Path input, 
-                      Path output,
-                      int minClusterSize,
-                      int minVectorSize, 
-                      String hashType, 
-                      int numHashFunctions, 
-                      int keyGroups,
-                      int numReduceTasks, 
-                      boolean debugOutput) throws IOException, ClassNotFoundException, InterruptedException {
-    Configuration conf = getConf();
-
-    conf.setInt(MinhashOptionCreator.MIN_CLUSTER_SIZE, minClusterSize);
-    conf.setInt(MinhashOptionCreator.MIN_VECTOR_SIZE, minVectorSize);
-    conf.set(MinhashOptionCreator.HASH_TYPE, hashType);
-    conf.setInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, numHashFunctions);
-    conf.setInt(MinhashOptionCreator.KEY_GROUPS, keyGroups);
-    conf.setBoolean(MinhashOptionCreator.DEBUG_OUTPUT, debugOutput);
-
-    Class<? extends Writable> outputClass = debugOutput ? VectorWritable.class : Text.class;
-    Class<? extends OutputFormat> outputFormatClass =
-        debugOutput ? SequenceFileOutputFormat.class : TextOutputFormat.class;
-    
-    Job job = new Job(conf, "MinHash Clustering");
-    job.setJarByClass(MinHashDriver.class);
-
-    FileInputFormat.setInputPaths(job, input);
-    FileOutputFormat.setOutputPath(job, output);
-
-    job.setMapperClass(MinHashMapper.class);
-    job.setReducerClass(MinHashReducer.class);
-
-    job.setInputFormatClass(SequenceFileInputFormat.class);
-    job.setOutputFormatClass(outputFormatClass);
-
-    job.setMapOutputKeyClass(Text.class);
-    job.setMapOutputValueClass(outputClass);
-
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(outputClass);
-
-    job.setNumReduceTasks(numReduceTasks);
-
-    job.waitForCompletion(true);
+    ToolRunner.run(new MinHashDriver(), args);
   }
 
   @Override
-  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
+  public int run(String[] args) throws Exception {
     addInputOption();
     addOutputOption();
     addOption(MinhashOptionCreator.minClusterSizeOption().create());
     addOption(MinhashOptionCreator.minVectorSizeOption().create());
+    addOption(MinhashOptionCreator.vectorDimensionToHashOption().create());
     addOption(MinhashOptionCreator.hashTypeOption().create());
     addOption(MinhashOptionCreator.numHashFunctionsOption().create());
     addOption(MinhashOptionCreator.keyGroupsOption().create());
@@ -104,28 +55,41 @@ public final class MinHashDriver extends
       return -1;
     }
 
-    Path input = getInputPath();
-    Path output = getOutputPath();
     if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
-      HadoopUtil.delete(getConf(), output);
+      HadoopUtil.delete(getConf(), getOutputPath());
     }
+
     int minClusterSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_CLUSTER_SIZE));
     int minVectorSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_VECTOR_SIZE));
+    String dimensionToHash = getOption(MinhashOptionCreator.VECTOR_DIMENSION_TO_HASH);
     String hashType = getOption(MinhashOptionCreator.HASH_TYPE);
     int numHashFunctions = Integer.valueOf(getOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS));
     int keyGroups = Integer.valueOf(getOption(MinhashOptionCreator.KEY_GROUPS));
     int numReduceTasks = Integer.parseInt(getOption(MinhashOptionCreator.NUM_REDUCERS));
     boolean debugOutput = hasOption(MinhashOptionCreator.DEBUG_OUTPUT);
 
-    runJob(input,
-           output,
-           minClusterSize,
-           minVectorSize,
-           hashType,
-           numHashFunctions,
-           keyGroups,
-           numReduceTasks,
-           debugOutput);
+    Class<? extends Writable> outputClass = debugOutput ? VectorWritable.class : Text.class;
+    Class<? extends OutputFormat> outputFormatClass =
+        debugOutput ? SequenceFileOutputFormat.class : TextOutputFormat.class;
+
+    Job minHash = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, MinHashMapper.class,
+            Text.class, outputClass, MinHashReducer.class, Text.class, VectorWritable.class, outputFormatClass);
+
+    Configuration minHashConfiguration = minHash.getConfiguration();
+    minHashConfiguration.setInt(MinhashOptionCreator.MIN_CLUSTER_SIZE, minClusterSize);
+    minHashConfiguration.setInt(MinhashOptionCreator.MIN_VECTOR_SIZE, minVectorSize);
+    minHashConfiguration.set(MinhashOptionCreator.VECTOR_DIMENSION_TO_HASH, dimensionToHash);
+    minHashConfiguration.set(MinhashOptionCreator.HASH_TYPE, hashType);
+    minHashConfiguration.setInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, numHashFunctions);
+    minHashConfiguration.setInt(MinhashOptionCreator.KEY_GROUPS, keyGroups);
+    minHashConfiguration.setBoolean(MinhashOptionCreator.DEBUG_OUTPUT, debugOutput);
+    minHash.setNumReduceTasks(numReduceTasks);
+
+    boolean succeeded = minHash.waitForCompletion(true);
+    if (!succeeded) {
+     return -1;
+    }
+
     return 0;
   }
 }
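
The driver now delegates job wiring to AbstractJob.prepareJob(...). As a rough sketch (reconstructed from the runJob() code removed above, not new behaviour), the prepareJob call configures approximately the following inside run(), where outputClass and outputFormatClass are the local variables computed from debugOutput:

    // Approximate equivalent of the prepareJob(...) call above; the generated job name may differ.
    Job job = new Job(getConf(), "MinHash Clustering");
    job.setJarByClass(MinHashDriver.class);
    FileInputFormat.setInputPaths(job, getInputPath());
    FileOutputFormat.setOutputPath(job, getOutputPath());
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MinHashMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(outputClass);        // VectorWritable when --debugOutput is set, Text otherwise
    job.setReducerClass(MinHashReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(outputFormatClass);    // SequenceFileOutputFormat when --debugOutput is set, TextOutputFormat otherwise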

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java?rev=1489281&r1=1489280&r2=1489281&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java Tue Jun  4 03:40:34 2013
@@ -40,11 +40,13 @@ public class MinHashMapper extends Mappe
   private boolean debugOutput;
   private int[] minHashValues;
   private byte[] bytesToHash;
+  private String dimensionToHash;
 
   @Override
   protected void setup(Context context) throws IOException, InterruptedException {
     super.setup(context);
     Configuration conf = context.getConfiguration();
+    this.dimensionToHash = conf.get(MinhashOptionCreator.VECTOR_DIMENSION_TO_HASH, "value");
     this.numHashFunctions = conf.getInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, 10);
     this.minHashValues = new int[numHashFunctions];
     this.bytesToHash = new byte[4];
@@ -76,14 +78,14 @@ public class MinHashMapper extends Mappe
     if (featureVector.size() < minVectorSize) {
       return;
     }
-    // Initialize the minhash values to highest
+    // Initialize the MinHash values to highest
     for (int i = 0; i < numHashFunctions; i++) {
       minHashValues[i] = Integer.MAX_VALUE;
     }
 
     for (int i = 0; i < numHashFunctions; i++) {
       for (Vector.Element ele : featureVector.nonZeroes()) {
-        int value = (int) ele.get();
+        int value = "value".equalsIgnoreCase(dimensionToHash) ? (int) ele.get() : ele.index();
         bytesToHash[0] = (byte) (value >> 24);
         bytesToHash[1] = (byte) (value >> 16);
         bytesToHash[2] = (byte) (value >> 8);
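
As a standalone illustration of the new switch (a sketch, not code from this commit): with dimensionToHash set to "index" the mapper feeds the positions of the non-zero elements to the hash functions instead of their stored values, which is the natural choice when the indices of a sparse vector identify the items:

    import org.apache.mahout.math.RandomAccessSparseVector;
    import org.apache.mahout.math.Vector;

    public class DimensionToHashExample {
      public static void main(String[] args) {
        String dimensionToHash = "index";            // or "value", the default
        Vector v = new RandomAccessSparseVector(10);
        v.set(3, 7.0);                               // index 3, value 7
        v.set(8, 2.0);                               // index 8, value 2

        for (Vector.Element e : v.nonZeroes()) {
          // Same selection as MinHashMapper.map(): hash the value or the index of each non-zero element.
          int toHash = "value".equalsIgnoreCase(dimensionToHash) ? (int) e.get() : e.index();
          System.out.println(toHash);                // prints 3 and 8 (in some order); 7 and 2 with "value"
        }
      }
    }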

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java?rev=1489281&r1=1489280&r2=1489281&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinhashOptionCreator.java Tue Jun  4 03:40:34 2013
@@ -28,6 +28,7 @@ public final class MinhashOptionCreator 
   public static final String MIN_VECTOR_SIZE = "minVectorSize";
   public static final String NUM_REDUCERS = "numReducers";
   public static final String DEBUG_OUTPUT = "debugOutput";
+  public static final String VECTOR_DIMENSION_TO_HASH = "vectorDimensionToHash";
 
   private MinhashOptionCreator() {
   }
@@ -43,7 +44,7 @@ public final class MinhashOptionCreator 
     return new DefaultOptionBuilder()
         .withLongName(NUM_REDUCERS)
         .withRequired(false)
-        .withShortName("r")
+        .withShortName("nr")
         .withArgument(
             new ArgumentBuilder().withName(NUM_REDUCERS).withDefault("2")
                 .withMinimum(1).withMaximum(1).create())
@@ -125,4 +126,20 @@ public final class MinhashOptionCreator 
                 .withMinimum(1).withMaximum(1).create())
         .withDescription("Number of key groups to be 
used").withShortName("kg");
   }
+
+  /**
+   * Returns a default command line option for specifying the vector dimension to hash
+   * in MinHash clustering: Should be one of ("value","index")
+   */
+   public static DefaultOptionBuilder vectorDimensionToHashOption() {
+     return new DefaultOptionBuilder()
+        .withLongName(VECTOR_DIMENSION_TO_HASH)
+        .withRequired(false)
+        .withArgument(
+           new ArgumentBuilder().withName(VECTOR_DIMENSION_TO_HASH).withDefault("value")
+               .withMinimum(1).withMaximum(1).create())
+        .withDescription("Dimension of vector to hash. Available types: (value, index). Defaults to 'value'")
+        .withShortName("vdh");
+   }
+
 }
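
For context, these options are built with the commons-cli2 builders that Mahout wraps. A hedged sketch of the same pattern, using a purely hypothetical option name, shows the pieces involved (long name, optional single argument with a default, description, short name):

    // Illustration only: "exampleOption" is hypothetical and is not added by this commit.
    public static DefaultOptionBuilder exampleOption() {
      return new DefaultOptionBuilder()
          .withLongName("exampleOption")
          .withRequired(false)
          .withArgument(
              new ArgumentBuilder().withName("exampleOption").withDefault("someDefault")
                  .withMinimum(1).withMaximum(1).create())
          .withDescription("A single-argument option with a default value")
          .withShortName("eo");
    }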

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java?rev=1489281&r1=1489280&r2=1489281&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java Tue Jun  4 03:40:34 2013
@@ -43,10 +43,10 @@ import java.util.Set;
 
 public final class TestMinHashClustering extends MahoutTestCase {
   
-  private static final double[][] REFERENCE = { {1, 2, 3, 4, 5}, {2, 1, 3, 6, 7}, {3, 7, 6, 11, 8, 9},
-                                              {4, 7, 8, 9, 6, 1}, {5, 8, 10, 4, 1}, {6, 17, 14, 15},
-                                              {8, 9, 11, 6, 12, 1, 7}, {10, 13, 9, 7, 4, 6, 3},
-                                              {3, 5, 7, 9, 2, 11}, {13, 7, 6, 8, 5}};
+  private static final double[][] REFERENCE = { {0, 0, 3, 4, 5}, {0, 0, 3, 6, 7}, {0, 7, 6, 11, 8, 9},
+                                              {0, 7, 8, 9, 6, 0}, {5, 8, 10, 0, 0}, {6, 17, 14, 15},
+                                              {8, 9, 11, 0, 12, 0, 7}, {10, 13, 9, 7, 0, 6, 0},
+                                              {0, 0, 7, 9, 0, 11}, {13, 7, 6, 8, 0}};
 
   private Path input;
   private Path output;
@@ -77,17 +77,18 @@ public final class TestMinHashClustering
         writer.append(new Text("Id-" + id++), point);
       }
     } finally {
-      Closeables.closeQuietly(writer);
+      Closeables.close(writer, false);
     }
   }
   
-  private String[] makeArguments(int minClusterSize,
+  private String[] makeArguments(String dimensionToHash, int minClusterSize,
                                  int minVectorSize,
                                  int numHashFunctions,
                                  int keyGroups,
                                  String hashType) {
     return new String[] {optKey(DefaultOptionCreator.INPUT_OPTION), input.toString(),
                          optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
+                         optKey(MinhashOptionCreator.VECTOR_DIMENSION_TO_HASH), dimensionToHash,
                          optKey(MinhashOptionCreator.MIN_CLUSTER_SIZE), String.valueOf(minClusterSize),
                          optKey(MinhashOptionCreator.MIN_VECTOR_SIZE), String.valueOf(minVectorSize),
                          optKey(MinhashOptionCreator.HASH_TYPE), hashType,
@@ -97,20 +98,25 @@ public final class TestMinHashClustering
                          optKey(MinhashOptionCreator.DEBUG_OUTPUT)};
   }
   
-  private static Set<Integer> getValues(Vector vector) {
+  private static Set<Integer> getValues(Vector vector, String dimensionToHash) {
     Set<Integer> values = Sets.newHashSet();
-    for (Vector.Element e : vector.nonZeroes()) {
-      values.add((int) e.get());
+    if ("value".equalsIgnoreCase(dimensionToHash)) {
+      for (Vector.Element e : vector.nonZeroes())
+        values.add((int) e.get());
+    } else {
+      for (Vector.Element e : vector.nonZeroes())
+        values.add(e.index());
     }
     return values;
   }
-  
-  private static void runPairwiseSimilarity(List<Vector> clusteredItems, double simThreshold, String msg) {
+
+  private static void runPairwiseSimilarity(List<Vector> clusteredItems, double simThreshold,
+                                            String dimensionToHash, String msg) {
     if (clusteredItems.size() > 1) {
       for (int i = 0; i < clusteredItems.size(); i++) {
-        Set<Integer> itemSet1 = getValues(clusteredItems.get(i));
+        Set<Integer> itemSet1 = getValues(clusteredItems.get(i), dimensionToHash);
         for (int j = i + 1; j < clusteredItems.size(); j++) {
-          Set<Integer> itemSet2 = getValues(clusteredItems.get(j));
+          Set<Integer> itemSet2 = getValues(clusteredItems.get(j), dimensionToHash);
           Collection<Integer> union = Sets.newHashSet();
           union.addAll(itemSet1);
           union.addAll(itemSet2);
@@ -125,7 +131,7 @@ public final class TestMinHashClustering
     }
   }
   
-  private void verify(Path output, double simThreshold, String msg) throws IOException {
+  private void verify(Path output, double simThreshold, String dimensionToHash, String msg) throws IOException {
     Configuration conf = getConfiguration();
     Path outputFile = new Path(output, "part-r-00000");
     List<Vector> clusteredItems = Lists.newArrayList();
@@ -136,45 +142,77 @@ public final class TestMinHashClustering
       if (prevClusterId.equals(clusterId.toString())) {
         clusteredItems.add(point.get());
       } else {
-        runPairwiseSimilarity(clusteredItems, simThreshold, msg);
+        runPairwiseSimilarity(clusteredItems, simThreshold, dimensionToHash, msg);
         clusteredItems.clear();
         prevClusterId = clusterId.toString();
         clusteredItems.add(point.get());
       }
     }
-    runPairwiseSimilarity(clusteredItems, simThreshold, msg);
+    runPairwiseSimilarity(clusteredItems, simThreshold, dimensionToHash, msg);
   }
   
   @Test
   public void testLinearMinHashMRJob() throws Exception {
-    String[] args = makeArguments(2, 3, 20, 3, HashType.LINEAR.toString());
+    String[] args = makeArguments("value", 2, 3, 20, 4, 
HashType.LINEAR.toString());
     int ret = ToolRunner.run(getConfiguration(), new MinHashDriver(), args);
-    assertEquals("Minhash MR Job failed for " + HashType.LINEAR, 0, ret);
-    verify(output, 0.2, "Hash Type: LINEAR");
+    assertEquals("MinHash MR Hash value Job failed for " + HashType.LINEAR, 0, 
ret);
+    verify(output, 0.2, "value", "Hash Type: LINEAR");
   }
   
   @Test
   public void testPolynomialMinHashMRJob() throws Exception {
-    String[] args = makeArguments(2, 3, 20, 3, HashType.POLYNOMIAL.toString());
+    String[] args = makeArguments("value", 2, 3, 20, 3, 
HashType.POLYNOMIAL.toString());
     int ret = ToolRunner.run(getConfiguration(), new MinHashDriver(), args);
-    assertEquals("Minhash MR Job failed for " + HashType.POLYNOMIAL, 0, ret);
-    verify(output, 0.27, "Hash Type: POLYNOMIAL");
+    assertEquals("MinHash MR Job Hash value failed for " + 
HashType.POLYNOMIAL, 0, ret);
+    verify(output, 0.27, "value", "Hash Type: POLYNOMIAL");
   }
   
   @Test
   public void testMurmurMinHashMRJob() throws Exception {
-    String[] args = makeArguments(2, 3, 20, 4, HashType.MURMUR.toString());
+    String[] args = makeArguments("value", 2, 3, 20, 4, 
HashType.MURMUR.toString());
     int ret = ToolRunner.run(getConfiguration(), new MinHashDriver(), args);
-    assertEquals("Minhash MR Job failed for " + HashType.MURMUR, 0, ret);
-    verify(output, 0.2, "Hash Type: MURMUR");
+    assertEquals("MinHash MR Job Hash value failed for " + HashType.MURMUR, 0, 
ret);
+    verify(output, 0.2, "value", "Hash Type: MURMUR");
   }
 
   @Test
   public void testMurmur3MinHashMRJob() throws Exception {
-    String[] args = makeArguments(2, 3, 20, 4, HashType.MURMUR3.toString());
+    String[] args = makeArguments("value", 2, 3, 20, 4, 
HashType.MURMUR3.toString());
     int ret = ToolRunner.run(getConfiguration(), new MinHashDriver(), args);
-    assertEquals("Minhash MR Job failed for " + HashType.MURMUR3, 0, ret);
-    verify(output, 0.2, "Hash Type: MURMUR");
+    assertEquals("MinHash MR Job Hash value failed for " + HashType.MURMUR3, 
0, ret);
+    verify(output, 0.2, "value", "Hash Type: MURMUR");
   }
-  
+
+  @Test
+  public void testLinearMinHashMRJobHashIndex() throws Exception {
+    String[] args = makeArguments("index", 2, 3, 20, 3, 
HashType.LINEAR.toString());
+    int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
+    assertEquals("MinHash MR Job Hash Index failed for " + HashType.LINEAR, 0, 
ret);
+    verify(output, 0.2, "index", "Hash Type: LINEAR");
+  }
+
+  @Test
+  public void testPolynomialMinHashMRJobHashIndex() throws Exception {
+    String[] args = makeArguments("index", 2, 3, 20, 3, 
HashType.POLYNOMIAL.toString());
+    int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
+    assertEquals("MinHash MR Job Hash Index failed for " + 
HashType.POLYNOMIAL, 0, ret);
+    verify(output, 0.3, "index", "Hash Type: POLYNOMIAL");
+  }
+
+  @Test
+  public void testMurmurMinHashMRJobHashIndex() throws Exception {
+    String[] args = makeArguments("index", 2, 3, 20, 4, 
HashType.MURMUR.toString());
+    int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
+    assertEquals("MinHash MR Job Hash Index failed for " + HashType.MURMUR, 0, 
ret);
+    verify(output, 0.3, "index", "Hash Type: MURMUR");
+  }
+
+  @Test
+  public void testMurmur3MinHashMRJobHashIndex() throws Exception {
+    String[] args = makeArguments("index", 2, 3, 20, 4, 
HashType.MURMUR3.toString());
+    int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
+    assertEquals("MinHash MR Job Hash Index failed for " + HashType.MURMUR3, 
0, ret);
+    verify(output, 0.3, "index", "Hash Type: MURMUR");
+  }
+
 }
\ No newline at end of file

