tobrien 2003/06/10 20:33:05 Modified: math/src/java/org/apache/commons/math/stat BivariateRegression.java math/src/test/org/apache/commons/math/stat BivariateRegressionTest.java Log: * Fixed all checkstyle errors and eliminated redundant NaN checks. Now have 100% test path coverage. * Used distribution framework TDistribution to implement getSlopeConfidenceInterval and getSignificance methods. PR: Issue #20657 Obtained from: Bugzilla Submitted by: Phil Steitz Reviewed by: Tim O'Brien Revision Changes Path 1.2 +217 -120 jakarta-commons-sandbox/math/src/java/org/apache/commons/math/stat/BivariateRegression.java Index: BivariateRegression.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/math/src/java/org/apache/commons/math/stat/BivariateRegression.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- BivariateRegression.java 29 May 2003 20:35:45 -0000 1.1 +++ BivariateRegression.java 11 Jun 2003 03:33:05 -0000 1.2 @@ -50,30 +50,33 @@ * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. - * */ package org.apache.commons.math.stat; +import org.apache.commons.math.stat.distribution.DistributionFactory; +import org.apache.commons.math.stat.distribution.TDistribution; /** * Estimates an ordinary least squares regression model - * with one independent variable: <p> - * - * y = intercept + slope * x </code><p> - * + * with one independent variable. + * <p> + * <code> y = intercept + slope * x </code> + * <p> * Standard errors for <code>intercept</code> and <code>slope</code> are - * available as well as ANOVA, r-square and Pearson's r statistics.<p> - * + * available as well as ANOVA, r-square and Pearson's r statistics. + * <p> * Observations (x,y pairs) can be added to the model one at a time or they * can be provided in a 2-dimensional array. The observations are not stored * in memory, so there is no limit to the number of observations that can be - * added to the model. <p> - * + * added to the model. + * <p> * <strong>Usage Notes</strong>: <ul> * <li> When there are fewer than two observations in the model, or when * there is no variation in the x values (i.e. all x values are the same) * all statistics return <code>NaN</code>. At least two observations with - * different x coordinates are requred to estimate a bivariate regression model.</li> + * different x coordinates are requred to estimate a bivariate regression + * model. + * </li> * <li> getters for the statistics always compute values based on the current * set of observations -- i.e., you can get statistics, then add more data * and get updated statistics without using a new instance. There is no @@ -114,31 +117,34 @@ */ public void addData(double x, double y) { sumX += x; - sumSqX += x*x; + sumSqX += x * x; sumY += y; - sumSqY += y*y; - sumXY += x*y; + sumSqY += y * y; + sumXY += x * y; n++; } /** - * Adds the observations represented by the elements in <code>data.</code><p> + * Adds the observations represented by the elements in + * <code>data</code>. + * <p> * <code>(data[0][0],data[0][1])</code> will be the first observation, then * <code>(data[1][0],data[1][1])</code>, etc. <p> * * This method does not replace data that has already been added. - * To replace all data, use <code>clear()</code> before adding the new data. + * To replace all data, use <code>clear()</code> before adding the new + * data. * * @param data array of observations to be added */ public void addData(double[][] data) { for (int i = 0; i < data.length; i++) { - addData(data[i][0],data[i][1]); + addData(data[i][0], data[i][1]); } } - /* - * Clears all data from the model + /** + * Clears all data from the model. */ public void clear() { sumX = 0d; @@ -150,9 +156,9 @@ } /** - * Returns the number of observations that have been added to the model + * Returns the number of observations that have been added to the model. * - * @return n + * @return n number of observations that have been added. */ public long getN() { return n; @@ -160,37 +166,38 @@ /** * Returns the "predicted" <code>y</code> value associated with the - * supplied <code>x</code> value. Specifically, <p> - * - * <code> predict(x) = intercept + slope * x </code> <p> - * - * At least two observations (with at least two different x values) + * supplied <code>x</code> value. + * <p> + * <code> predict(x) = intercept + slope * x </code> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN</code> is * returned. + * </li></ul> * * @param x input <code>x</code> value * @return predicted <code>y</code> value */ public double predict(double x) { double b1 = getSlope(); - if (b1 == Double.NaN) { - return b1; - } - return getIntercept(b1) + b1*x; + return getIntercept(b1) + b1 * x; } /** * Returns the intercept of the estimated regression line. - * The least squares estimate of the intercept is computed using the normal - * equations, as described - * <a href=http://www.xycoon.com/estimation4.htm>here</a>. - * The intercept is sometimes denoted b0. <p> - * - * At least two distinct data pairs (with at least two different x values) + * <p> + * The least squares estimate of the intercept is computed using the + * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. + * The intercept is sometimes denoted b0. + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN</code> is * returned. + * </li></ul> * * @return the intercept of the regression line */ @@ -200,15 +207,17 @@ /** * Returns the slope of the estimated regression line. - * The least squares estimate of the slope is computed using the normal - * equations, as described - * <a href=http://www.xycoon.com/estimation4.htm>here</a>. - * The slope is sometimes denoted b1. <p> - * - * At least two observations (with at least two different x values) + * <p> + * The least squares estimate of the slope is computed using the + * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. + * The slope is sometimes denoted b1. + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN</code> is * returned. + * </li></ul> * * @return the slope of the regression line */ @@ -217,22 +226,24 @@ return Double.NaN; //not enough data } double dn = (double) n; - double denom = sumSqX - (sumX*sumX/dn); - if (Math.abs(denom)< 10*Double.MIN_VALUE) { + double denom = sumSqX - (sumX * sumX / dn); + if (Math.abs(denom) < 10 * Double.MIN_VALUE) { return Double.NaN; //not enough variation in x } - return (sumXY - (sumX*sumY/dn))/denom; + return (sumXY - (sumX * sumY / dn)) / denom; } /** - * Returns the sum of squared errors</a> associated with the regression - * model. This is defined as SSE - * <a href=http://www.xycoon.com/SumOfSquares.htm>here</a>. <p> - * - * At least two distinct data pairs (with at least two different x values) + * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> + * sum of squared errors</a> (SSE) associated with the regression + * model. + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN</code> is * returned. + * </li></ul> * * @return sum of squared errors associated with the regression model */ @@ -242,10 +253,11 @@ /** * Returns the sum of squared deviations of the y values about their mean. + * <p> * This is defined as SSTO - * <a href=http://www.xycoon.com/SumOfSquares.htm>here</a>. + * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>. * <p> - * If n < 2, this returns NaN. + * If <code>n < 2</code>, this returns <code>Double.NaN</code>. * * @return sum of squared deviations of y values */ @@ -253,36 +265,37 @@ if (n < 2) { return Double.NaN; } - return sumSqY - sumY*sumY/(double) n; + return sumSqY - sumY * sumY / (double) n; } /** * Returns the sum of squared deviations of the predicted y values about * their mean (which equals the mean of y). + * <p> * This is usually abbreviated SSR or SSM. It is defined as SSM - * <a href=http://www.xycoon.com/SumOfSquares.htm>here</a><p> - * - * At least two distinct data pairs (with at least two different x values) + * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a> + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN</code> is * returned. + * </li></ul> * * @return sum of squared deviations of y values */ public double getRegressionSumSquares() { double b1 = getSlope(); - if (b1 == Double.NaN) { - return b1; - } - return b1*(sumXY - sumX*sumY/(double) n); + return b1 * (sumXY - sumX * sumY / (double) n); } /** - * Returns the sum of squared errors divided by the degrees of freedom. - * This is usually abbreviated MSE. <p> - * + * Returns the sum of squared errors divided by the degrees of freedom, + * usually abbreviated MSE. + * <p> * If there are fewer than <strong>three</strong> data pairs in the model, - * or if there is no variation in x, this returns <code>NaN</code>. + * or if there is no variation in <code>x</code>, this returns + * <code>Double.NaN</code>. * * @return sum of squared deviations of y values */ @@ -291,29 +304,25 @@ return Double.NaN; } double sse = getSumSquaredErrors(); - if (sse == Double.NaN) { - return sse; - } - return sse/(double) (n - 2); + return sse / (double) (n - 2); } /** - * Returns <a href=http://www.stt.msu.edu/~xiaoyimi/STT200/Lecture5.pdf> - * Pearson's product moment correlation coefficient</a>. - * This is usually denoted r. <p> - * - * At least two observations (with at least two different x values) + * Returns <a href="http://www.stt.msu.edu/~xiaoyimi/STT200/Lecture5.pdf"> + * Pearson's product moment correlation coefficient</a>, + * usually denoted r. + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN</code> is * returned. + * </li></ul> * * @return Pearson's r */ public double getR() { double b1 = getSlope(); - if (b1 == Double.NaN) { - return b1; - } double result = Math.sqrt(getRSquare(b1)); if (b1 < 0) { result = -result; @@ -322,14 +331,16 @@ } /** - * Returns the <a href=http://www.xycoon.com/coefficient1.htm> coefficient - * of determination</a>. - * This is usually denoted r-square. <p> - * - * At least two observaions (with at least two different x values) + * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> + * coefficient of determination</a>, + * usually denoted r-square. + * <p> + * <strong>Preconditions</strong>: <ul> + * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN</code> is * returned. + * </li></ul> * * @return r-square */ @@ -339,70 +350,150 @@ /** - * Returns the <a href=http://www.xycoon.com/standarderrorb0.htm>standard - * error of the intercept estimate</a>. - * This is usually denoted s(b0). <p> - * - * If there are fewer that <strong>three</strong> observations in the model, - * or if there is no variation in x, this returns <code>NaN</code>. + * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> + * standard error of the intercept estimate</a>, + * usually denoted s(b0). + * <p> + * If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>. * * @return standard error associated with intercept estimate */ public double getInterceptStdErr() { double ssx = getSumSquaresX(); - if (ssx == Double.NaN) { - return ssx; - } - return Math.sqrt(getMeanSquareError()*sumSqX/(((double) n)*ssx)); + return Math.sqrt(getMeanSquareError() * sumSqX / (((double) n) * ssx)); } /** - * Returns the <a http://www.xycoon.com/standerrorb(1).htm>standard - * error of the slope estimate</a>. - * This is usually denoted s(b1). <p> - * + * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard + * error of the slope estimate</a>, + * usually denoted s(b1). + * <p> * If there are fewer that <strong>three</strong> data pairs in the model, - * or if there is no variation in x, this returns <code>NaN</code>. + * or if there is no variation in x, this returns <code>Double.NaN</code>. * * @return standard error associated with slope estimate */ public double getSlopeStdErr() { double ssx = getSumSquaresX(); - if (ssx == Double.NaN) { - return ssx; + return Math.sqrt(getMeanSquareError() / ssx); + } + + /** + * Returns the half-width of a 95% confidence interval for the slope + * estimate. + * <p> + * The 95% confidence interval is + * <p> + * <code>(getSlope() - getSlopeConfidenceInterval(), + * getSlope() + getSlopeConfidenceInterval())</code> + * <p> + * If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>. + * <p> + * <strong>Usage Note</strong>:<br> + * The validity of this statistic depends on the assumption that the + * observations included in the model are drawn from a + * <a href="http://mathworld.wolfram.com/ + * BivariateNormalDistribution.html">Bivariate Normal Distribution</a>. + * + * @return half-width of 95% confidence interval for the slope estimate + */ + public double getSlopeConfidenceInterval() { + return getSlopeConfidenceInterval(0.05d); + } + + /** + * Returns the half-width of a (100-100*alpha)% confidence interval for + * the slope estimate. + * <p> + * The (100-100*alpha)% confidence interval is + * <p> + * <code>(getSlope() - getSlopeConfidenceInterval(), + * getSlope() + getSlopeConfidenceInterval())</code> + * <p> + * To request, for example, a 99% confidence interval, use + * <code>alpha = .01</code> + * <p> + * <strong>Usage Note</strong>:<br> + * The validity of this statistic depends on the assumption that the + * observations included in the model are drawn from a + * <a href="http://mathworld.wolfram.com/ + * BivariateNormalDistribution.html">Bivariate Normal Distribution</a>. + * <p> + * <strong> Preconditions:</strong><ul> + * <li>If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>. + * </li> + * <li><code>(0 < alpha < 1)</code>; otherwise an + * <code>IllegalArgumentException</code> is thrown. + * </li></ul> + * + * @param alpha the desired significance level + * @return half-width of 95% confidence interval for the slope estimate + */ + public double getSlopeConfidenceInterval(double alpha) { + if (alpha >= 1 || alpha <= 0) { + throw new IllegalArgumentException(); } - return Math.sqrt(getMeanSquareError()/ssx); + return getSlopeStdErr() * + getTDistribution().inverseCummulativeProbability(1d - alpha / 2d); + } + + /** + * Returns the significance level of the slope (equiv) correlation. + * <p> + * Specifically, the returned value is the smallest <code>alpha</code> + * such that the slope confidence interval with significance level + * equal to <code>alpha</code> does not include <code>0</code>. + * On regression output, this is often denoted <code>Prob(|t| > 0)</code> + * <p> + * <strong>Usage Note</strong>:<br> + * The validity of this statistic depends on the assumption that the + * observations included in the model are drawn from a + * <a href="http://mathworld.wolfram.com/ + * BivariateNormalDistribution.html">Bivariate Normal Distribution</a>. + * <p> + * If there are fewer that <strong>three</strong> observations in the + * model, or if there is no variation in x, this returns + * <code>Double.NaN</code>. + * + * @return significance level for slope/correlation + */ + public double getSignificance() { + return (1d - getTDistribution().cummulativeProbability( + Math.abs(getSlope()) / getSlopeStdErr())); } // ---------------------Private methods----------------------------------- /** * Returns the intercept of the estimated regression line, given the slope. + * <p> * Will return <code>NaN</code> if slope is <code>NaN</code>. * * @param slope current slope * @return the intercept of the regression line */ private double getIntercept(double slope) { - if (slope == Double.NaN) { - return slope; - } - return (sumY - slope*sumX)/((double) n); + return (sumY - slope * sumX) / ((double) n); } /** - * Returns the sum of squared errors</a> associated with the regression - * model, using the slope of the regression line. Returns NaN if the slope - * is NaN. - * + * Returns the sum of squared errors associated with the regression + * model, using the slope of the regression line. + * <p> + * Returns NaN if the slope is NaN. + * + * @param b1 current slope * @return sum of squared errors associated with the regression model */ private double getSumSquaredErrors(double b1) { - if (b1 == Double.NaN) { - return b1; - } double b0 = getIntercept(b1); - return sumSqY - b0*sumY - b1*sumXY; + return sumSqY - b0 * sumY - b1 * sumXY; } /** @@ -416,24 +507,30 @@ if (n < 2) { return Double.NaN; } - return sumSqX - sumX*sumX/(double) n; + return sumSqX - sumX * sumX / (double) n; } /** * Computes r-square from the slope. - * will return NaN if slope is Nan + * <p> + * will return NaN if slope is Nan. * + * @param b1 current slope * @return r-square */ private double getRSquare(double b1) { - if (b1 == Double.NaN) { - return b1; - } double ssto = getTotalSumSquares(); - if (ssto == Double.NaN) { - return ssto; - } - return (ssto - getSumSquaredErrors(b1))/ssto; + return (ssto - getSumSquaredErrors(b1)) / ssto; + } + + /** + * Uses distribution framework to get a t distribution instance + * with df = n - 2 + * + * @return t distribution with df = n - 2 + */ + private TDistribution getTDistribution() { + return DistributionFactory.newInstance().createTDistribution(n - 2); } } 1.2 +40 -2 jakarta-commons-sandbox/math/src/test/org/apache/commons/math/stat/BivariateRegressionTest.java Index: BivariateRegressionTest.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/math/src/test/org/apache/commons/math/stat/BivariateRegressionTest.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- BivariateRegressionTest.java 29 May 2003 20:35:46 -0000 1.1 +++ BivariateRegressionTest.java 11 Jun 2003 03:33:05 -0000 1.2 @@ -87,6 +87,18 @@ {90.6,111.6},{86.5,122.2},{89.7,117.6},{90.6,121.1},{82.8,136.0}, {70.1,154.2},{65.4,153.6},{61.3,158.5},{62.5,140.6},{63.6,136.2}, {52.6,168.0},{59.7,154.3},{59.5,149.0},{61.3,165.5}}; + + /* + * From Moore and Mcabe, "Introduction to the Practice of Statistics" + * Example 10.3 + */ + private double[][] infData = {{15.6,5.2},{26.8,6.1},{37.8,8.7},{36.4,8.5}, + {35.5,8.8},{18.6,4.9},{15.3,4.5},{7.9,2.5},{0.0,1.1}}; + + /* + * From http://www.xycoon.com/simple_linear_regression.htm + */ + private double[][] infData2 = {{1,3},{2,5},{3,7},{4,14},{5,11}}; public BivariateRegressionTest(String name) { super(name); @@ -221,6 +233,32 @@ regression.addData(data); assertEquals("number of observations",53,regression.getN()); } - + + public void testInference() { + BivariateRegression regression = new BivariateRegression(); + regression.addData(infData); + assertEquals("slope confidence interval", 0.0271, + regression.getSlopeConfidenceInterval(),0.0001); + assertEquals("slope std err",0.01146, + regression.getSlopeStdErr(),0.0001); + + regression = new BivariateRegression(); + regression.addData(infData2); + assertEquals("significance", 0.023331, + regression.getSignificance(),0.0001); + + //FIXME: get a real example to test against with alpha = .01 + assertTrue("tighter means wider", + regression.getSlopeConfidenceInterval() < + regression.getSlopeConfidenceInterval(0.01)); + + try { + double x = regression.getSlopeConfidenceInterval(1); + fail("expecting IllegalArgumentException for alpha = 1"); + } catch (IllegalArgumentException ex) { + ; + } + + } }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]