Author: gsingers
Date: Wed Nov  2 14:57:44 2011
New Revision: 1196616

URL: http://svn.apache.org/viewvc?rev=1196616&view=rev
Log:
lop off some stop words so that we get better clusters

Added:
    mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java
Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
    
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
    mahout/trunk/examples/bin/build-reuters.sh

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java?rev=1196616&r1=1196615&r2=1196616&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
 Wed Nov  2 14:57:44 2011
@@ -18,6 +18,7 @@ package org.apache.mahout.clustering.min
 
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.math.MurmurHash;
+import org.apache.mahout.math.MurmurHash3;
 
 import java.util.Random;
 
@@ -27,7 +28,7 @@ public final class HashFactory {
   }
 
   public enum HashType {
-    LINEAR, POLYNOMIAL, MURMUR
+    LINEAR, POLYNOMIAL, MURMUR, MURMUR3
   }
 
   public static HashFunction[] createHashFunctions(HashType type, int 
numFunctions) {
@@ -49,6 +50,11 @@ public final class HashFactory {
           hashFunction[i] = new MurmurHashWrapper(seed.nextInt());
         }
         break;
+      case MURMUR3:
+        for (int i = 0; i < numFunctions; i++) {
+          hashFunction[i] = new MurmurHash3Wrapper(seed.nextInt());
+        }
+        break;
       default:
         throw new IllegalStateException("Unknown type: " + type);
     }
@@ -111,4 +117,18 @@ public final class HashFactory {
       return Math.abs((int) (hashValue % 
RandomUtils.MAX_INT_SMALLER_TWIN_PRIME));
     }
   }
+
+  /**
+   * {@link HashFunction} that delegates to {@link MurmurHash3#murmurhash3_x86_32(byte[], int, int, int)}
+   * with a seed fixed at construction time.
+   */
+  static class MurmurHash3Wrapper implements HashFunction {
+    private final int hashSeed;
+
+    MurmurHash3Wrapper(int seed) {
+      this.hashSeed = seed;
+    }
+
+    /**
+     * Hashes the whole array and returns the absolute value of the result taken modulo
+     * {@code RandomUtils.MAX_INT_SMALLER_TWIN_PRIME}.
+     */
+    @Override
+    public int hash(byte[] bytes) {
+      // Keep the raw hash in a long so the remainder is computed exactly as before.
+      long raw = MurmurHash3.murmurhash3_x86_32(bytes, 0, bytes.length, hashSeed);
+      int reduced = (int) (raw % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME);
+      return Math.abs(reduced);
+    }
+  }
 }

Modified: 
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java?rev=1196616&r1=1196615&r2=1196616&view=diff
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
 (original)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
 Wed Nov  2 14:57:44 2011
@@ -170,5 +170,13 @@ public class TestMinHashClustering exten
     assertEquals("Minhash MR Job failed for " + HashType.MURMUR, 0, ret);
     verify(output, 0.3, "Hash Type: MURMUR");
   }
+
+  /**
+   * Runs {@link MinHashDriver} through {@link ToolRunner} with the {@code MURMUR3} hash type,
+   * expects a zero exit code, and then passes the job output to {@code verify}.
+   */
+  @Test
+  public void testMurmur3MinHashMRJob() throws Exception {
+    String[] args = makeArguments(2, 3, 20, 4, HashType.MURMUR3.toString());
+    int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
+    assertEquals("Minhash MR Job failed for " + HashType.MURMUR3, 0, ret);
+    // Label the verification with the hash type actually under test; the label used to read
+    // "Hash Type: MURMUR", which would attribute a failure here to the wrong hash function.
+    verify(output, 0.3, "Hash Type: MURMUR3");
+  }
   
 }
\ No newline at end of file

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1196616&r1=1196615&r2=1196616&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Wed Nov  2 14:57:44 2011
@@ -93,7 +93,7 @@ fi
 if [ "x$clustertype" == "xkmeans" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 \
   && \
   $MAHOUT kmeans \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
@@ -109,7 +109,7 @@ if [ "x$clustertype" == "xkmeans" ]; the
 elif [ "x$clustertype" == "xfuzzykmeans" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 \
   && \
   $MAHOUT fkmeans \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
@@ -139,7 +139,7 @@ elif [ "x$clustertype" == "xlda" ]; then
 elif [ "x$clustertype" == "xdirichlet" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet  --maxDFPercent 85 \
   && \
   $MAHOUT dirichlet \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet/tfidf-vectors \
@@ -155,7 +155,7 @@ elif [ "x$clustertype" == "xdirichlet" ]
 elif [ "x$clustertype" == "xminhash" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-minhash \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-minhash --maxDFPercent 85 \
   && \
   $MAHOUT org.apache.mahout.clustering.minhash.MinHashDriver \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-minhash/tfidf-vectors \

Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java?rev=1196616&view=auto
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java 
(added)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java Wed 
Nov  2 14:57:44 2011
@@ -0,0 +1,80 @@
+package org.apache.mahout.math;
+/**
+ *  This code is public domain.
+ *
+ *  The MurmurHash3 algorithm was created by Austin Appleby and put into the
+ *  public domain.  See http://code.google.com/p/smhasher/
+ *
+ *  This Java port was authored by Yonik Seeley and was placed into the public domain per
+ *  https://github.com/yonik/java_util/blob/master/src/util/hash/MurmurHash3.java.
+ */
+
+//
+
+/**
+ *  Java port of the MurmurHash3 hash function; this class exposes the 32 bit x86 variant
+ *  as a static method.
+ *  <p>
+ *  This produces exactly the same hash values as the final C++
+ *  version of MurmurHash3 and is thus suitable for producing the same hash
+ *  values across platforms.
+ *  <p>
+ *  The 32 bit x86 version of this hash should be the fastest variant for
+ *  relatively short keys like ids.
+ *  <p>
+ *  Note - The x86 and x64 versions do _not_ produce the same results, as the
+ *  algorithms are optimized for their respective platforms.
+ *  <p>
+ *  See also http://github.com/yonik/java_util for future updates to this file.
+ */
+public class MurmurHash3 {
+
+  /**
+   * Returns the MurmurHash3_x86_32 hash of the {@code len} bytes of {@code data} that start
+   * at {@code offset}.
+   * <p>
+   * Whole 4-byte blocks are loaded in little-endian order and mixed into the running state;
+   * the 1 to 3 leftover tail bytes, if any, are mixed in next; finally the length is XORed in
+   * and the fmix avalanche steps are applied. All arithmetic is plain {@code int} arithmetic,
+   * so multiplications wrap around on overflow. No argument validation is performed here.
+   *
+   * @param data   array containing the bytes to hash
+   * @param offset index in {@code data} of the first byte to hash
+   * @param len    number of bytes to hash
+   * @param seed   initial value of the hash state
+   * @return the 32-bit hash value
+   */
+  public static int murmurhash3_x86_32(byte[] data, int offset, int len, int 
seed) {
+
+    final int c1 = 0xcc9e2d51;
+    final int c2 = 0x1b873593;
+
+    int h1 = seed;
+    int roundedEnd = offset + (len & 0xfffffffc);  // round down to 4 byte 
block
+
+    for (int i=offset; i<roundedEnd; i+=4) {
+      // little endian load order
+      int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 
0xff) << 16) | (data[i+3] << 24); // top byte needs no mask: << 24 shifts the sign extension out
+      k1 *= c1;
+      k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
+      k1 *= c2;
+
+      h1 ^= k1;
+      h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
+      h1 = h1*5+0xe6546b64;
+    }
+
+    // tail
+    int k1 = 0;
+
+    switch(len & 0x03) { // number of bytes left over after the last whole 4-byte block
+      case 3:
+        k1 = (data[roundedEnd + 2] & 0xff) << 16;
+        // fallthrough
+      case 2:
+        k1 |= (data[roundedEnd + 1] & 0xff) << 8;
+        // fallthrough
+      case 1:
+        k1 |= (data[roundedEnd] & 0xff);
+        k1 *= c1;
+        k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
+        k1 *= c2;
+        h1 ^= k1;
+    }
+
+    // finalization
+    h1 ^= len;
+
+    // fmix(h1);
+    h1 ^= h1 >>> 16;
+    h1 *= 0x85ebca6b;
+    h1 ^= h1 >>> 13;
+    h1 *= 0xc2b2ae35;
+    h1 ^= h1 >>> 16;
+
+    return h1;
+  }
+
+}
\ No newline at end of file


Reply via email to