[commons-statistics] branch master updated: STATISTICS-25: T-dist to use the complement of the regularized beta

aherbert Sat, 22 Jan 2022 06:34:35 -0800

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-statistics.git



The following commit(s) were added to refs/heads/master by this push:
     new 876b5ec  STATISTICS-25: T-dist to use the complement of the 
regularized beta
876b5ec is described below

commit 876b5ec9782a36887f08612bee5b0ee8917df6f2
Author: Alex Herbert <aherb...@apache.org>
AuthorDate: Sat Jan 22 14:32:20 2022 +0000

    STATISTICS-25: T-dist to use the complement of the regularized beta
    
    NUMBERS-181 improved the regularized beta function. This allows
    increasing the threshold for switching to the normal approximation.
---
 .../statistics/distribution/TDistribution.java     | 56 +++++++++++-----------
 .../statistics/distribution/TDistributionTest.java | 48 +++++++++++--------
 2 files changed, 56 insertions(+), 48 deletions(-)

diff --git 
a/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/TDistribution.java
 
b/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/TDistribution.java
index 7c93318..50fc554 100644
--- 
a/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/TDistribution.java
+++ 
b/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/TDistribution.java
@@ -108,8 +108,9 @@ public abstract class TDistribution extends 
AbstractContinuousDistribution {
         /** 2. */
         private static final double TWO = 2;
         /** Number of degrees of freedom above which to use the normal 
distribution.
-         * This is used to check the CDF when the degrees of freedom is large. 
*/
-        private static final double DOF_THRESHOLD_NORMAL = 2.99e6;
+         * This is used to check the CDF when the degrees of freedom is large.
+         * Set to 1 / machine epsilon, 2^52, or 4.50e15. */
+        private static final double DOF_THRESHOLD_NORMAL = 0x1.0p52;
 
         /** degreesOfFreedom / 2. */
         private final double dofOver2;
@@ -167,36 +168,33 @@ public abstract class TDistribution extends 
AbstractContinuousDistribution {
             if (x == 0) {
                 return 0.5;
             }
-            final double df = getDegreesOfFreedom();
-            if (df > DOF_THRESHOLD_NORMAL) {
+            final double v = getDegreesOfFreedom();
+
+            // This threshold may no longer be required.
+            // See STATISTICS-25.
+            if (v > DOF_THRESHOLD_NORMAL) {
                 return STANDARD_NORMAL.cumulativeProbability(x);
             }
-            final double x2 = x * x;
-            // z = 1 / (1 + x^2/df)
-            // Simplify by multiplication by df
-            final double z = df / (df + x2);
-
-            // The RegularizedBeta has the complement:
-            //   I(z, a, b) = 1 - I(1 - z, a, b)
-            // This is used when z > (a + 1) / (2 + b + a).
-            // Detect this condition and directly use the complement.
-            if (z > (dofOver2 + 1) / (2.5 + dofOver2)) {
-                // zc = 1 - z; pc = 1 - p
-                final double zc = x2 / (df + x2);
-                final double pc = RegularizedBeta.value(zc, 0.5, dofOver2);
-
-                return x < 0 ?
-                    // 0.5 * p == 0.5 * (1 - pc) = 0.5 - 0.5 * pc
-                    0.5 - 0.5 * pc :
-                    // 1 - 0.5 * p == 1 - 0.5 * (1 - pc) = 0.5 + 0.5 * pc
-                    0.5 + 0.5 * pc;
-            }
-
-            final double p = RegularizedBeta.value(z, dofOver2, 0.5);
 
-            return x < 0 ?
-                0.5 * p :
-                1 - 0.5 * p;
+            // cdf(t) = 1 - 0.5 * I_x(t)(v/2, 1/2)
+            // where x(t) = v / (v + t^2)
+            //
+            // When t^2 << v using the regularized beta results
+            // in loss of precision. Use the complement instead:
+            // I[x](a,b) = 1 - I[1-x](b,a)
+            // x   = v   / (v + t^2)
+            // 1-x = t^2 / (v + t^2)
+            // Use the threshold documented in the Boost t-distribution:
+            // 
https://www.boost.org/doc/libs/1_78_0/libs/math/doc/html/math_toolkit/dist_ref/dists/students_t_dist.html
+
+            final double t2 = x * x;
+            double z;
+            if (v < 2 * t2) {
+                z = RegularizedBeta.value(v / (v + t2), dofOver2, 0.5) / 2;
+            } else {
+                z = RegularizedBeta.complement(t2 / (v + t2), 0.5, dofOver2) / 
2;
+            }
+            return x > 0 ? 1 - z : z;
         }
 
         @Override
diff --git 
a/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/TDistributionTest.java
 
b/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/TDistributionTest.java
index ab27303..106a1f3 100644
--- 
a/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/TDistributionTest.java
+++ 
b/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/TDistributionTest.java
@@ -106,31 +106,41 @@ class TDistributionTest extends 
BaseContinuousDistributionTest {
     void testStatistics25() {
         final double[] df = {1, 10, 1e2, 1e3, 1e4, 1e5,
                              1e6, 2e6, 2.98e6, 2.99e6, 3e6, 4e6,
-                             1e7, 1e8, 1e9, 1e10};
+                             1e7, 1e8, 1e9, 1e10, 1e11,
+                             1e12, 1e13, 1e14, 1e15, 1e16,
+                             1e17, 1e18};
         // Generated from Python.
-        final double[] expected = {0.507956089912,
-                                   0.509726595102,
-                                   0.509947608093,
-                                   0.509970024339,
-                                   0.509972268782,
-                                   0.509972493254,
-                                   0.509972515701,
-                                   0.509972516948,
-                                   0.509972517358,
-                                   0.509972517361,
-                                   0.509972517364,
-                                   0.509972517572,
-                                   0.509972517946,
-                                   0.50997251817,
-                                   0.509972518193,
-                                   0.509972518195};
+        final double[] expected = {0.50795608991202579,
+                                   0.50972659510159002,
+                                   0.50994760809308259,
+                                   0.50997002433945715,
+                                   0.50997226878155033,
+                                   0.50997249325358851,
+                                   0.50997251570107027,
+                                   0.50997251694815415,
+                                   0.50997251735826887,
+                                   0.50997251736106808,
+                                   0.50997251736384874,
+                                   0.50997251757169604,
+                                   0.50997251794582121,
+                                   0.50997251817029643,
+                                   0.50997251819274392,
+                                   0.50997251819498857,
+                                   0.50997251819521316,
+                                   0.50997251819523559,
+                                   0.50997251819523781,
+                                   // 1e-14 follows
+                                   0.50997251819523803,
+                                   0.50997251819523803,
+                                   0.50997251819523814,
+                                   0.50997251819523814,
+                                   0.50997251819523803};
 
         final double c = 0.025;
-        final double tol = 1e-9;
         for (int i = 0; i < df.length; i++) {
             final TDistribution dist = TDistribution.of(df[i]);
             final double x = dist.cumulativeProbability(c);
-            Assertions.assertEquals(expected[i], x, tol);
+            Assertions.assertEquals(expected[i], x, 6 * Math.ulp(expected[i]));
         }
     }
 }

[commons-statistics] branch master updated: STATISTICS-25: T-dist to use the complement of the regularized beta

Reply via email to