1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.commons.math.stat.regression; 19 import java.io.Serializable; 20 21 import org.apache.commons.math.MathException; 22 import org.apache.commons.math.distribution.TDistribution; 23 import org.apache.commons.math.distribution.TDistributionImpl; 24 25 /** 26 * Estimates an ordinary least squares regression model 27 * with one independent variable. 28 * <p> 29 * <code> y = intercept + slope * x </code></p> 30 * <p> 31 * Standard errors for <code>intercept</code> and <code>slope</code> are 32 * available as well as ANOVA, r-square and Pearson's r statistics.</p> 33 * <p> 34 * Observations (x,y pairs) can be added to the model one at a time or they 35 * can be provided in a 2-dimensional array. The observations are not stored 36 * in memory, so there is no limit to the number of observations that can be 37 * added to the model.</p> 38 * <p> 39 * <strong>Usage Notes</strong>: <ul> 40 * <li> When there are fewer than two observations in the model, or when 41 * there is no variation in the x values (i.e. all x values are the same) 42 * all statistics return <code>NaN</code>. At least two observations with 43 * different x coordinates are requred to estimate a bivariate regression 44 * model. 45 * </li> 46 * <li> getters for the statistics always compute values based on the current 47 * set of observations -- i.e., you can get statistics, then add more data 48 * and get updated statistics without using a new instance. There is no 49 * "compute" method that updates all statistics. Each of the getters performs 50 * the necessary computations to return the requested statistic.</li> 51 * </ul></p> 52 * 53 * @version $Revision: 617953 $ $Date: 2008-02-02 22:54:00 -0700 (Sat, 02 Feb 2008) $ 54 */ 55 public class SimpleRegression implements Serializable { 56 57 /** Serializable version identifier */ 58 private static final long serialVersionUID = -3004689053607543335L; 59 60 /** the distribution used to compute inference statistics. */ 61 private TDistribution distribution; 62 63 /** sum of x values */ 64 private double sumX = 0d; 65 66 /** total variation in x (sum of squared deviations from xbar) */ 67 private double sumXX = 0d; 68 69 /** sum of y values */ 70 private double sumY = 0d; 71 72 /** total variation in y (sum of squared deviations from ybar) */ 73 private double sumYY = 0d; 74 75 /** sum of products */ 76 private double sumXY = 0d; 77 78 /** number of observations */ 79 private long n = 0; 80 81 /** mean of accumulated x values, used in updating formulas */ 82 private double xbar = 0; 83 84 /** mean of accumulated y values, used in updating formulas */ 85 private double ybar = 0; 86 87 // ---------------------Public methods-------------------------------------- 88 89 /** 90 * Create an empty SimpleRegression instance 91 */ 92 public SimpleRegression() { 93 this(new TDistributionImpl(1.0)); 94 } 95 96 /** 97 * Create an empty SimpleRegression using the given distribution object to 98 * compute inference statistics. 99 * @param t the distribution used to compute inference statistics. 100 * @since 1.2 101 */ 102 public SimpleRegression(TDistribution t) { 103 super(); 104 setDistribution(t); 105 } 106 107 /** 108 * Adds the observation (x,y) to the regression data set. 109 * <p> 110 * Uses updating formulas for means and sums of squares defined in 111 * "Algorithms for Computing the Sample Variance: Analysis and 112 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 113 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in 114 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p> 115 * 116 * 117 * @param x independent variable value 118 * @param y dependent variable value 119 */ 120 public void addData(double x, double y) { 121 if (n == 0) { 122 xbar = x; 123 ybar = y; 124 } else { 125 double dx = x - xbar; 126 double dy = y - ybar; 127 sumXX += dx * dx * (double) n / (double) (n + 1.0); 128 sumYY += dy * dy * (double) n / (double) (n + 1.0); 129 sumXY += dx * dy * (double) n / (double) (n + 1.0); 130 xbar += dx / (double) (n + 1.0); 131 ybar += dy / (double) (n + 1.0); 132 } 133 sumX += x; 134 sumY += y; 135 n++; 136 137 if (n > 2) { 138 distribution.setDegreesOfFreedom(n - 2); 139 } 140 } 141 142 /** 143 * Adds the observations represented by the elements in 144 * <code>data</code>. 145 * <p> 146 * <code>(data[0][0],data[0][1])</code> will be the first observation, then 147 * <code>(data[1][0],data[1][1])</code>, etc.</p> 148 * <p> 149 * This method does not replace data that has already been added. The 150 * observations represented by <code>data</code> are added to the existing 151 * dataset.</p> 152 * <p> 153 * To replace all data, use <code>clear()</code> before adding the new 154 * data.</p> 155 * 156 * @param data array of observations to be added 157 */ 158 public void addData(double[][] data) { 159 for (int i = 0; i < data.length; i++) { 160 addData(data[i][0], data[i][1]); 161 } 162 } 163 164 /** 165 * Clears all data from the model. 166 */ 167 public void clear() { 168 sumX = 0d; 169 sumXX = 0d; 170 sumY = 0d; 171 sumYY = 0d; 172 sumXY = 0d; 173 n = 0; 174 } 175 176 /** 177 * Returns the number of observations that have been added to the model. 178 * 179 * @return n number of observations that have been added. 180 */ 181 public long getN() { 182 return n; 183 } 184 185 /** 186 * Returns the "predicted" <code>y</code> value associated with the 187 * supplied <code>x</code> value, based on the data that has been 188 * added to the model when this method is activated. 189 * <p> 190 * <code> predict(x) = intercept + slope * x </code></p> 191 * <p> 192 * <strong>Preconditions</strong>: <ul> 193 * <li>At least two observations (with at least two different x values) 194 * must have been added before invoking this method. If this method is 195 * invoked before a model can be estimated, <code>Double,NaN</code> is 196 * returned. 197 * </li></ul></p> 198 * 199 * @param x input <code>x</code> value 200 * @return predicted <code>y</code> value 201 */ 202 public double predict(double x) { 203 double b1 = getSlope(); 204 return getIntercept(b1) + b1 * x; 205 } 206 207 /** 208 * Returns the intercept of the estimated regression line. 209 * <p> 210 * The least squares estimate of the intercept is computed using the 211 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 212 * The intercept is sometimes denoted b0.</p> 213 * <p> 214 * <strong>Preconditions</strong>: <ul> 215 * <li>At least two observations (with at least two different x values) 216 * must have been added before invoking this method. If this method is 217 * invoked before a model can be estimated, <code>Double,NaN</code> is 218 * returned. 219 * </li></ul></p> 220 * 221 * @return the intercept of the regression line 222 */ 223 public double getIntercept() { 224 return getIntercept(getSlope()); 225 } 226 227 /** 228 * Returns the slope of the estimated regression line. 229 * <p> 230 * The least squares estimate of the slope is computed using the 231 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 232 * The slope is sometimes denoted b1.</p> 233 * <p> 234 * <strong>Preconditions</strong>: <ul> 235 * <li>At least two observations (with at least two different x values) 236 * must have been added before invoking this method. If this method is 237 * invoked before a model can be estimated, <code>Double.NaN</code> is 238 * returned. 239 * </li></ul></p> 240 * 241 * @return the slope of the regression line 242 */ 243 public double getSlope() { 244 if (n < 2) { 245 return Double.NaN; //not enough data 246 } 247 if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) { 248 return Double.NaN; //not enough variation in x 249 } 250 return sumXY / sumXX; 251 } 252 253 /** 254 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> 255 * sum of squared errors</a> (SSE) associated with the regression 256 * model. 257 * <p> 258 * The sum is computed using the computational formula</p> 259 * <p> 260 * <code>SSE = SYY - (SXY * SXY / SXX)</code></p> 261 * <p> 262 * where <code>SYY</code> is the sum of the squared deviations of the y 263 * values about their mean, <code>SXX</code> is similarly defined and 264 * <code>SXY</code> is the sum of the products of x and y mean deviations. 265 * </p><p> 266 * The sums are accumulated using the updating algorithm referenced in 267 * {@link #addData}.</p> 268 * <p> 269 * The return value is constrained to be non-negative - i.e., if due to 270 * rounding errors the computational formula returns a negative result, 271 * 0 is returned.</p> 272 * <p> 273 * <strong>Preconditions</strong>: <ul> 274 * <li>At least two observations (with at least two different x values) 275 * must have been added before invoking this method. If this method is 276 * invoked before a model can be estimated, <code>Double,NaN</code> is 277 * returned. 278 * </li></ul></p> 279 * 280 * @return sum of squared errors associated with the regression model 281 */ 282 public double getSumSquaredErrors() { 283 return Math.max(0d, sumYY - sumXY * sumXY / sumXX); 284 } 285 286 /** 287 * Returns the sum of squared deviations of the y values about their mean. 288 * <p> 289 * This is defined as SSTO 290 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p> 291 * <p> 292 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p> 293 * 294 * @return sum of squared deviations of y values 295 */ 296 public double getTotalSumSquares() { 297 if (n < 2) { 298 return Double.NaN; 299 } 300 return sumYY; 301 } 302 303 /** 304 * Returns the sum of squared deviations of the predicted y values about 305 * their mean (which equals the mean of y). 306 * <p> 307 * This is usually abbreviated SSR or SSM. It is defined as SSM 308 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p> 309 * <p> 310 * <strong>Preconditions</strong>: <ul> 311 * <li>At least two observations (with at least two different x values) 312 * must have been added before invoking this method. If this method is 313 * invoked before a model can be estimated, <code>Double.NaN</code> is 314 * returned. 315 * </li></ul></p> 316 * 317 * @return sum of squared deviations of predicted y values 318 */ 319 public double getRegressionSumSquares() { 320 return getRegressionSumSquares(getSlope()); 321 } 322 323 /** 324 * Returns the sum of squared errors divided by the degrees of freedom, 325 * usually abbreviated MSE. 326 * <p> 327 * If there are fewer than <strong>three</strong> data pairs in the model, 328 * or if there is no variation in <code>x</code>, this returns 329 * <code>Double.NaN</code>.</p> 330 * 331 * @return sum of squared deviations of y values 332 */ 333 public double getMeanSquareError() { 334 if (n < 3) { 335 return Double.NaN; 336 } 337 return getSumSquaredErrors() / (double) (n - 2); 338 } 339 340 /** 341 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html"> 342 * Pearson's product moment correlation coefficient</a>, 343 * usually denoted r. 344 * <p> 345 * <strong>Preconditions</strong>: <ul> 346 * <li>At least two observations (with at least two different x values) 347 * must have been added before invoking this method. If this method is 348 * invoked before a model can be estimated, <code>Double,NaN</code> is 349 * returned. 350 * </li></ul></p> 351 * 352 * @return Pearson's r 353 */ 354 public double getR() { 355 double b1 = getSlope(); 356 double result = Math.sqrt(getRSquare()); 357 if (b1 < 0) { 358 result = -result; 359 } 360 return result; 361 } 362 363 /** 364 * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 365 * coefficient of determination</a>, 366 * usually denoted r-square. 367 * <p> 368 * <strong>Preconditions</strong>: <ul> 369 * <li>At least two observations (with at least two different x values) 370 * must have been added before invoking this method. If this method is 371 * invoked before a model can be estimated, <code>Double,NaN</code> is 372 * returned. 373 * </li></ul></p> 374 * 375 * @return r-square 376 */ 377 public double getRSquare() { 378 double ssto = getTotalSumSquares(); 379 return (ssto - getSumSquaredErrors()) / ssto; 380 } 381 382 /** 383 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> 384 * standard error of the intercept estimate</a>, 385 * usually denoted s(b0). 386 * <p> 387 * If there are fewer that <strong>three</strong> observations in the 388 * model, or if there is no variation in x, this returns 389 * <code>Double.NaN</code>.</p> 390 * 391 * @return standard error associated with intercept estimate 392 */ 393 public double getInterceptStdErr() { 394 return Math.sqrt( 395 getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX)); 396 } 397 398 /** 399 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard 400 * error of the slope estimate</a>, 401 * usually denoted s(b1). 402 * <p> 403 * If there are fewer that <strong>three</strong> data pairs in the model, 404 * or if there is no variation in x, this returns <code>Double.NaN</code>. 405 * </p> 406 * 407 * @return standard error associated with slope estimate 408 */ 409 public double getSlopeStdErr() { 410 return Math.sqrt(getMeanSquareError() / sumXX); 411 } 412 413 /** 414 * Returns the half-width of a 95% confidence interval for the slope 415 * estimate. 416 * <p> 417 * The 95% confidence interval is</p> 418 * <p> 419 * <code>(getSlope() - getSlopeConfidenceInterval(), 420 * getSlope() + getSlopeConfidenceInterval())</code></p> 421 * <p> 422 * If there are fewer that <strong>three</strong> observations in the 423 * model, or if there is no variation in x, this returns 424 * <code>Double.NaN</code>.</p> 425 * <p> 426 * <strong>Usage Note</strong>:<br> 427 * The validity of this statistic depends on the assumption that the 428 * observations included in the model are drawn from a 429 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 430 * Bivariate Normal Distribution</a>.</p> 431 * 432 * @return half-width of 95% confidence interval for the slope estimate 433 * @throws MathException if the confidence interval can not be computed. 434 */ 435 public double getSlopeConfidenceInterval() throws MathException { 436 return getSlopeConfidenceInterval(0.05d); 437 } 438 439 /** 440 * Returns the half-width of a (100-100*alpha)% confidence interval for 441 * the slope estimate. 442 * <p> 443 * The (100-100*alpha)% confidence interval is </p> 444 * <p> 445 * <code>(getSlope() - getSlopeConfidenceInterval(), 446 * getSlope() + getSlopeConfidenceInterval())</code></p> 447 * <p> 448 * To request, for example, a 99% confidence interval, use 449 * <code>alpha = .01</code></p> 450 * <p> 451 * <strong>Usage Note</strong>:<br> 452 * The validity of this statistic depends on the assumption that the 453 * observations included in the model are drawn from a 454 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 455 * Bivariate Normal Distribution</a>.</p> 456 * <p> 457 * <strong> Preconditions:</strong><ul> 458 * <li>If there are fewer that <strong>three</strong> observations in the 459 * model, or if there is no variation in x, this returns 460 * <code>Double.NaN</code>. 461 * </li> 462 * <li><code>(0 < alpha < 1)</code>; otherwise an 463 * <code>IllegalArgumentException</code> is thrown. 464 * </li></ul></p> 465 * 466 * @param alpha the desired significance level 467 * @return half-width of 95% confidence interval for the slope estimate 468 * @throws MathException if the confidence interval can not be computed. 469 */ 470 public double getSlopeConfidenceInterval(double alpha) 471 throws MathException { 472 if (alpha >= 1 || alpha <= 0) { 473 throw new IllegalArgumentException(); 474 } 475 return getSlopeStdErr() * 476 distribution.inverseCumulativeProbability(1d - alpha / 2d); 477 } 478 479 /** 480 * Returns the significance level of the slope (equiv) correlation. 481 * <p> 482 * Specifically, the returned value is the smallest <code>alpha</code> 483 * such that the slope confidence interval with significance level 484 * equal to <code>alpha</code> does not include <code>0</code>. 485 * On regression output, this is often denoted <code>Prob(|t| > 0)</code> 486 * </p><p> 487 * <strong>Usage Note</strong>:<br> 488 * The validity of this statistic depends on the assumption that the 489 * observations included in the model are drawn from a 490 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 491 * Bivariate Normal Distribution</a>.</p> 492 * <p> 493 * If there are fewer that <strong>three</strong> observations in the 494 * model, or if there is no variation in x, this returns 495 * <code>Double.NaN</code>.</p> 496 * 497 * @return significance level for slope/correlation 498 * @throws MathException if the significance level can not be computed. 499 */ 500 public double getSignificance() throws MathException { 501 return 2d * (1.0 - distribution.cumulativeProbability( 502 Math.abs(getSlope()) / getSlopeStdErr())); 503 } 504 505 // ---------------------Private methods----------------------------------- 506 507 /** 508 * Returns the intercept of the estimated regression line, given the slope. 509 * <p> 510 * Will return <code>NaN</code> if slope is <code>NaN</code>.</p> 511 * 512 * @param slope current slope 513 * @return the intercept of the regression line 514 */ 515 private double getIntercept(double slope) { 516 return (sumY - slope * sumX) / ((double) n); 517 } 518 519 /** 520 * Computes SSR from b1. 521 * 522 * @param slope regression slope estimate 523 * @return sum of squared deviations of predicted y values 524 */ 525 private double getRegressionSumSquares(double slope) { 526 return slope * slope * sumXX; 527 } 528 529 /** 530 * Modify the distribution used to compute inference statistics. 531 * @param value the new distribution 532 * @since 1.2 533 */ 534 public void setDistribution(TDistribution value) { 535 distribution = value; 536 537 // modify degrees of freedom 538 if (n > 2) { 539 distribution.setDegreesOfFreedom(n - 2); 540 } 541 } 542 }