Author: tdunning
Date: Mon Aug 30 19:34:13 2010
New Revision: 990914
URL: http://svn.apache.org/viewvc?rev=990914&view=rev
Log:
MAHOUT-492 - modified InteractionValueEncoder to allow interactions between
TextValueEncoder and other encoders
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java?rev=990914&r1=990913&r2=990914&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java Mon Aug 30 19:34:13 2010
@@ -21,10 +21,7 @@ import com.google.common.collect.Sets;
import org.apache.mahout.math.Vector;
import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
/**
* General interface for objects that record features into a feature vector.
@@ -76,13 +73,10 @@ public abstract class FeatureVectorEncod
protected abstract int hashForProbe(String originalForm, Vector data, String
name, int i);
protected Iterable<Integer> hashesForProbe(String originalForm, Vector data,
String name, int i){
- List<Integer> hashes = new ArrayList<Integer>();
- hashes.add(hashForProbe(originalForm,data,name,i));
- return hashes;
+ return Collections.singletonList(hashForProbe(originalForm,data,name,i));
}
-
- protected double getWeight(String originalFor, double w){
+ protected double getWeight(String originalForm, double w){
return 1.0;
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java?rev=990914&r1=990913&r2=990914&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java Mon Aug 30 19:34:13 2010
@@ -17,9 +17,10 @@
package org.apache.mahout.vectors;
+import org.apache.mahout.common.iterator.ArrayIterator;
import org.apache.mahout.math.Vector;
-import java.util.Locale;
+import java.util.ArrayList;
public class InteractionValueEncoder extends FeatureVectorEncoder {
@@ -52,24 +53,33 @@ public class InteractionValueEncoder ext
* @param data The vector to which the value should be added.
*/
public void addInteractionToVector(String originalForm1, String
originalForm2, double weight, Vector data) {
- int probes = getProbes();
String name = getName();
double w = getWeight(originalForm1, originalForm2, weight);
- for (int i = 0; i < probes; i++) {
- int h1 = firstEncoder.hashForProbe(originalForm1, data, name, i);
- int h2 = secondEncoder.hashForProbe(originalForm1, data, name, i);
- int j = firstEncoder.hashForProbe(originalForm2, data, name, i);
- int n = (h1 + (j+1)*h2) % data.size();
- if(n < 0){
- n = n+data.size();
+ for (int i = 0; i < probes(); i++) {
+ for(Integer k : firstEncoder.hashesForProbe(originalForm1, data, name,
i)){
+ for(Integer j : secondEncoder.hashesForProbe(originalForm2, data,
name, i)){
+ int n =
linearDoubleHash(hash1(k,name,i,data),hash2(k,name,i,data),j,data.size());
+ trace(String.format("%s:%s", originalForm1, originalForm2), n);
+ data.set(n, data.get(n) + w);
+ }
}
- trace(String.format("%s:%s", originalForm1, originalForm2), n);
- data.set(n, data.get(n) + w);
}
}
+ private int probes() {
+ return getProbes();
+ }
+
protected double getWeight(String originalForm1, String originalForm2,
double w) {
- return firstEncoder.getWeight(originalForm1, 1.0) *
secondEncoder.getWeight(originalForm2,1.0) * w;
+ return firstEncoder.getWeight(originalForm1, 1.0) *
secondEncoder.getWeight(originalForm2, 1.0) * w;
+ }
+
+ private int linearDoubleHash(int h1, int h2, int j, int modulus){
+ int n = (h1 + (j+1)*h2) % modulus;
+ if(n < 0){
+ n = n+modulus;
+ }
+ return n;
}
/**
@@ -90,12 +100,13 @@ public class InteractionValueEncoder ext
return hash(name, i, data.size());
}
- protected int hash1(String term1, String term2, int probe, int numFeatures) {
- return hash(term1, term2, probe+INTERACTION_VALUE_HASH_SEED_1,numFeatures);
+ protected int hash1(int value, String name, int i, Vector data){
+ return hash(name, i+value+INTERACTION_VALUE_HASH_SEED_1, data.size());
}
- protected int hash2(String term1, String term2, int probe, int numFeatures) {
- return hash(term1, term2, probe+INTERACTION_VALUE_HASH_SEED_2,numFeatures);
+ protected int hash2(int value, String name, int i, Vector data){
+ return hash(name, i+value+INTERACTION_VALUE_HASH_SEED_2, data.size());
}
}
+
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java?rev=990914&r1=990913&r2=990914&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java Mon Aug 30 19:34:13 2010
@@ -41,7 +41,7 @@ public class StaticWordValueEncoder exte
@Override
protected int hashForProbe(String originalForm, Vector data, String name,
int i) {
- return hash(name, i, data.size());
+ return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + i,
data.size());
}
/**
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java?rev=990914&r1=990913&r2=990914&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java Mon Aug 30 19:34:13 2010
@@ -52,21 +52,21 @@ public class TextValueEncoder extends Fe
}
}
- @Override
- protected int hashForProbe(String originalForm, Vector data, String name,
int i) {
- return 0;
- }
+ @Override
+ protected int hashForProbe(String originalForm, Vector data, String name,
int i) {
+ return 0;
+ }
- protected Iterable<Integer> hashesForProbe(String originalForm, Vector
data, String name, int i){
- List<Integer> hashes = new ArrayList<Integer>();
- for (String word : tokenize(originalForm)){
- hashes.add(hashForProbe(word,data,name,i));
- }
- return hashes;
+ protected Iterable<Integer> hashesForProbe(String originalForm, Vector data,
String name, int i){
+ List<Integer> hashes = new ArrayList<Integer>();
+ for (String word : tokenize(originalForm)){
+ hashes.add(hashForProbe(word,data,name,i));
}
+ return hashes;
+ }
- private Iterable<String> tokenize(CharSequence originalForm) {
+ private Iterable<String> tokenize(CharSequence originalForm) {
return onNonWord.split(originalForm);
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java?rev=990914&r1=990913&r2=990914&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java Mon Aug 30 19:34:13 2010
@@ -49,6 +49,7 @@ public abstract class WordValueEncoder e
}
}
+
@Override
protected double getWeight(String originalForm, double w) {
return w*weight(originalForm);
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java?rev=990914&r1=990913&r2=990914&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java Mon Aug 30 19:34:13 2010
@@ -55,7 +55,7 @@ public class InteractionValueEncoderTest
}
@Test
- public void testaddToVectorUsesProductOfWeights(){
+ public void testAddToVectorUsesProductOfWeights(){
WordValueEncoder wv = new StaticWordValueEncoder("word");
ContinuousValueEncoder cv = new ContinuousValueEncoder("cont");
InteractionValueEncoder enc = new InteractionValueEncoder("interactions",
wv, cv);
@@ -66,4 +66,17 @@ public class InteractionValueEncoderTest
Assert.assertEquals((float) k*0.5*0.9, v1.norm(1), 0);
Assert.assertEquals(0.5*0.9, v1.maxValue(), 0);
}
+
+ @Test
+ public void testAddToVectorWithTextValueEncoder(){
+ WordValueEncoder wv = new StaticWordValueEncoder("word");
+ TextValueEncoder tv = new TextValueEncoder("text");
+ InteractionValueEncoder enc = new InteractionValueEncoder("interactions",
wv, tv);
+ Vector v1 = new DenseVector(200);
+ enc.addInteractionToVector("a","some text here",1.0, v1);
+ int k = enc.getProbes();
+ // should interact "a" with each of "some","text" and "here"
+ Assert.assertEquals((float) k*3, v1.norm(1), 0);
+ }
+
}