[ 
https://issues.apache.org/jira/browse/MAHOUT-1257?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13681427#comment-13681427
 ] 

Mike Sokolov edited comment on MAHOUT-1257 at 6/12/13 5:41 PM:
---------------------------------------------------------------

diff --git a/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java b/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
index 336449c..08fa775 100644
--- a/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
+++ b/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
@@ -95,11 +95,14 @@
    * @return The raw log-likelihood ratio
    *
    * <p/>
+   * <p>
    * Credit to http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the table and the descriptions.
    */
   public static double logLikelihoodRatio(long k11, long k12, long k21, long 
k22) {
     Preconditions.checkArgument(k11 >= 0 && k12 >= 0 && k21 >= 0 && k22 >= 0);
     // note that we have counts here, not probabilities, and that the entropy is not normalized.
+    
+    /*
     double rowEntropy = entropy(k11 + k12, k21 + k22);
     double columnEntropy = entropy(k11 + k21, k12 + k22);
     double matrixEntropy = entropy(k11, k12, k21, k22);
@@ -108,6 +111,22 @@
       return 0.0;
     }
     return 2.0 * (rowEntropy + columnEntropy - matrixEntropy);
+    */
+
+    // This uses 2 fewer calls to xLogX than the explicit formulation in terms
+    // of row and column entropy and fewer additions
+    long p1 = k11 + k21; // number of occurrences of event 1
+    long p2 = k11 + k12; // number of occurrences of event 2
+    long P1 = k12 + k22; // N - p1; number of non-occurrences of event 1
+    long P2 = k21 + k22; // N - p2;  number of non-occurrences of event 2
+    long N = p1 + P1; // total number of observations
+    double entropy = xLogX(N) - xLogX(p1) - xLogX(p2) - xLogX(P1) - xLogX(P2)
+               + xLogX(k11) + xLogX(k12) + xLogX(k21) + xLogX(k22);
+    if (entropy < 0) {
+        // round off error
+       return 0;
+    }
+    return 2.0 * entropy;
   }

                
      was (Author: sokolov):
    trying to attach patch ...
                  
> performance improvement to LogLikelihood
> ----------------------------------------
>
>                 Key: MAHOUT-1257
>                 URL: https://issues.apache.org/jira/browse/MAHOUT-1257
>             Project: Mahout
>          Issue Type: Improvement
>          Components: Math
>            Reporter: Mike Sokolov
>
> This patch reduces the amount of computation required for LLR. It simplifies 
> the math by canceling terms.  In a microbenchmark we saw an 18% run time 
> improvement.

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators.
For more information on JIRA, see: http://www.atlassian.com/software/jira

Reply via email to