Author: srowen
Date: Sat Jul 16 14:19:51 2011
New Revision: 1147431
URL: http://svn.apache.org/viewvc?rev=1147431&view=rev
Log:
Knock down a few small CPU hotspots. Parse files in FileDataModel with
Splitter. Optimize and even increase accuracy of entropy-like calculation for
log-likelihood calc to avoid 5 unnecessary array allocations
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java?rev=1147431&r1=1147430&r2=1147431&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
Sat Jul 16 14:19:51 2011
@@ -22,11 +22,12 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.locks.ReentrantLock;
-import java.util.regex.Pattern;
+import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.mahout.cf.taste.common.Refreshable;
@@ -127,7 +128,7 @@ public class FileDataModel extends Abstr
private long lastModified;
private long lastUpdateFileModified;
private final char delimiter;
- private final Pattern delimiterPattern;
+ private final Splitter delimiterPattern;
private final boolean hasPrefValues;
private DataModel delegate;
private final ReentrantLock reloadLock;
@@ -177,10 +178,13 @@ public class FileDataModel extends Abstr
Closeables.closeQuietly(iterator);
delimiter = determineDelimiter(firstLine);
- delimiterPattern = Pattern.compile(String.valueOf(delimiter));
- String[] firstLineSplit = delimiterPattern.split(firstLine);
+ delimiterPattern = Splitter.on(delimiter);
+ List<String> firstLineSplit = Lists.newArrayList();
+ for (String token : delimiterPattern.split(firstLine)) {
+ firstLineSplit.add(token);
+ }
// If preference value exists and isn't empty then the file is specifying
pref values
- hasPrefValues = firstLineSplit.length >= 3 && firstLineSplit[2].length() >
0;
+ hasPrefValues = firstLineSplit.size() >= 3 &&
firstLineSplit.get(2).length() > 0;
this.reloadLock = new ReentrantLock();
this.transpose = transpose;
@@ -368,15 +372,12 @@ public class FileDataModel extends Abstr
return;
}
- // Consume up to 4 tokens, and gather whatever is left in an unused 5th
token:
- String[] tokens = delimiterPattern.split(line, 5);
-
- Preconditions.checkArgument(tokens.length >= 3, "Bad line: %s", line);
-
- String userIDString = tokens[0];
- String itemIDString = tokens[1];
- String preferenceValueString = tokens[2];
- String timestampString = tokens.length >= 4 ? tokens[3] : null;
+ Iterator<String> tokens = delimiterPattern.split(line).iterator();
+ String userIDString = tokens.next();
+ String itemIDString = tokens.next();
+ String preferenceValueString = tokens.next();
+ boolean hasTimestamp = tokens.hasNext();
+ String timestampString = hasTimestamp ? tokens.next() : null;
long userID = readUserIDFromString(userIDString);
long itemID = readItemIDFromString(itemIDString);
@@ -393,7 +394,7 @@ public class FileDataModel extends Abstr
// Data are PreferenceArray
PreferenceArray prefs = (PreferenceArray) maybePrefs;
- if (tokens.length == 3 && preferenceValueString.length() == 0) {
+ if (!hasTimestamp && preferenceValueString.length() == 0) {
// Then line is of form "userID,itemID,", meaning remove
if (prefs != null) {
boolean exists = false;
@@ -461,7 +462,7 @@ public class FileDataModel extends Abstr
Collection<Preference> prefs = (Collection<Preference>) maybePrefs;
- if (tokens.length == 3 && preferenceValueString.length() == 0) {
+ if (!hasTimestamp && preferenceValueString.length() == 0) {
// Then line is of form "userID,itemID,", meaning remove
if (prefs != null) {
// remove pref
@@ -532,12 +533,13 @@ public class FileDataModel extends Abstr
return;
}
- String[] tokens = delimiterPattern.split(line);
- Preconditions.checkArgument(tokens.length >= 2, "Bad line: %s", line);
- String userIDString = tokens[0];
- String itemIDString = tokens[1];
- String preferenceValueString = tokens.length >= 3 ? tokens[2] : "";
- String timestampString = tokens.length >= 4 ? tokens[3] : null;
+ Iterator<String> tokens = delimiterPattern.split(line).iterator();
+ String userIDString = tokens.next();
+ String itemIDString = tokens.next();
+ boolean hasPreference = tokens.hasNext();
+ String preferenceValueString = hasPreference ? tokens.next() : "";
+ boolean hasTimestamp = tokens.hasNext();
+ String timestampString = hasTimestamp ? tokens.next() : null;
long userID = readUserIDFromString(userIDString);
long itemID = readItemIDFromString(itemIDString);
@@ -548,7 +550,7 @@ public class FileDataModel extends Abstr
itemID = tmp;
}
- if (tokens.length == 3 && preferenceValueString.length() == 0) {
+ if (hasPreference && !hasTimestamp && preferenceValueString.length() == 0)
{
// Then line is of form "userID,itemID,", meaning remove
FastIDSet itemIDs = data.get(userID);
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java?rev=1147431&r1=1147430&r2=1147431&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java
Sat Jul 16 14:19:51 2011
@@ -67,8 +67,8 @@ public final class LogLikelihoodSimilari
assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(1, 0));
assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(0, 1));
- assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(2, 3));
- assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(3, 2));
+ assertCorrelationEquals(0.0, similarity.itemSimilarity(2, 3));
+ assertCorrelationEquals(0.0, similarity.itemSimilarity(3, 2));
}
@Test
Modified:
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=1147431&r1=1147430&r2=1147431&view=diff
==============================================================================
---
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
(original)
+++
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
Sat Jul 16 14:19:51 2011
@@ -17,6 +17,7 @@
package org.apache.mahout.math.stats;
+import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.common.collect.Ordering;
@@ -48,19 +49,34 @@ public final class LogLikelihood {
* @return The entropy value for the elements
*/
public static double entropy(long... elements) {
- double sum = 0.0;
+ long sum = 0;
double result = 0.0;
for (long element : elements) {
- if (element < 0) {
- throw new IllegalArgumentException("Should not have negative count for
entropy computation: (" + element + ')');
- }
- if (element > 0) {
- result += element * Math.log(element);
- sum += element;
- }
+ Preconditions.checkArgument(element >= 0);
+ result += xLogX(element);
+ sum += element;
}
- result -= sum * Math.log(sum);
- return -result;
+ return xLogX(sum) - result;
+ }
+
+ private static double xLogX(long x) {
+ return x == 0 ? 0.0 : x * Math.log(x);
+ }
+
+ /**
+ * Merely an optimization for the common two argument case of {@link
#entropy(long...)}
+ * @see #logLikelihoodRatio(long, long, long, long)
+ */
+ private static double entropy(long a, long b) {
+ return xLogX(a + b) - xLogX(a) - xLogX(b);
+ }
+
+ /**
+ * Merely an optimization for the common four argument case of {@link
#entropy(long...)}
+ * @see #logLikelihoodRatio(long, long, long, long)
+ */
+ private static double entropy(long a, long b, long c, long d) {
+ return xLogX(a + b + c + d) - xLogX(a) - xLogX(b) - xLogX(c) - xLogX(d);
}
/**
@@ -82,6 +98,7 @@ public final class LogLikelihood {
* Credit to
http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the
table and the descriptions.
*/
public static double logLikelihoodRatio(long k11, long k12, long k21, long
k22) {
+ Preconditions.checkArgument(k11 >= 0 && k12 >= 0 && k21 >= 0 && k22 >= 0);
// note that we have counts here, not probabilities, and that the entropy
is not normalized.
double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
double columnEntropy = entropy(k11, k21) + entropy(k12, k22);
Modified:
mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java?rev=1147431&r1=1147430&r2=1147431&view=diff
==============================================================================
---
mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
(original)
+++
mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
Sat Jul 16 14:19:51 2011
@@ -68,7 +68,7 @@ public final class LogLikelihoodTest ext
@Test
public void testRootNegativeLLR() {
- assertEquals(0.0, LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924,
2426487), 0.00000001);
+ assertTrue(LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924, 2426487) >
0.0);
}
@Test