View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.math.stat.regression;
19  import java.io.Serializable;
20  
21  import org.apache.commons.math.MathException;
22  import org.apache.commons.math.distribution.TDistribution;
23  import org.apache.commons.math.distribution.TDistributionImpl;
24  
25  /**
26   * Estimates an ordinary least squares regression model
27   * with one independent variable.
28   * <p>
29   * <code> y = intercept + slope * x  </code></p>
30   * <p>
31   * Standard errors for <code>intercept</code> and <code>slope</code> are 
32   * available as well as ANOVA, r-square and Pearson's r statistics.</p>
33   * <p>
34   * Observations (x,y pairs) can be added to the model one at a time or they 
35   * can be provided in a 2-dimensional array.  The observations are not stored
36   * in memory, so there is no limit to the number of observations that can be
37   * added to the model.</p> 
38   * <p>
39   * <strong>Usage Notes</strong>: <ul>
40   * <li> When there are fewer than two observations in the model, or when
41   * there is no variation in the x values (i.e. all x values are the same) 
42   * all statistics return <code>NaN</code>. At least two observations with
43   * different x coordinates are required to estimate a bivariate regression 
44   * model.
45   * </li>
46   * <li> getters for the statistics always compute values based on the current
47   * set of observations -- i.e., you can get statistics, then add more data
48   * and get updated statistics without using a new instance.  There is no 
49   * "compute" method that updates all statistics.  Each of the getters performs
50   * the necessary computations to return the requested statistic.</li>
51   * </ul></p>
52   *
53   * @version $Revision: 617953 $ $Date: 2008-02-02 22:54:00 -0700 (Sat, 02 Feb 2008) $
54   */
55  public class SimpleRegression implements Serializable {
56  
57      /** Serializable version identifier */
58      private static final long serialVersionUID = -3004689053607543335L;
59  
60      /** the distribution used to compute inference statistics. */
61      private TDistribution distribution;
62      
63      /** sum of x values */
64      private double sumX = 0d;
65  
66      /** total variation in x (sum of squared deviations from xbar) */
67      private double sumXX = 0d;
68  
69      /** sum of y values */
70      private double sumY = 0d;
71  
72      /** total variation in y (sum of squared deviations from ybar) */
73      private double sumYY = 0d;
74  
75      /** sum of products */
76      private double sumXY = 0d;
77  
78      /** number of observations */
79      private long n = 0;
80  
81      /** mean of accumulated x values, used in updating formulas */
82      private double xbar = 0;
83  
84      /** mean of accumulated y values, used in updating formulas */
85      private double ybar = 0;
86  
87      // ---------------------Public methods--------------------------------------
88  
89      /**
90       * Create an empty SimpleRegression instance
91       */
92      public SimpleRegression() {
93          this(new TDistributionImpl(1.0));
94      }
95      
96      /**
97       * Create an empty SimpleRegression using the given distribution object to
98       * compute inference statistics.
99       * @param t the distribution used to compute inference statistics.
100      * @since 1.2
101      */
102     public SimpleRegression(TDistribution t) {
103         super();
104         setDistribution(t);
105     }
106     
107     /**
108      * Adds the observation (x,y) to the regression data set.
109      * <p>
110      * Uses updating formulas for means and sums of squares defined in 
111      * "Algorithms for Computing the Sample Variance: Analysis and
112      * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 
113      * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
114      * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
115      *
116      *
117      * @param x independent variable value
118      * @param y dependent variable value
119      */
120     public void addData(double x, double y) {
121         if (n == 0) {
122             xbar = x;
123             ybar = y;
124         } else {
125             double dx = x - xbar;
126             double dy = y - ybar;
127             sumXX += dx * dx * (double) n / (double) (n + 1.0);
128             sumYY += dy * dy * (double) n / (double) (n + 1.0);
129             sumXY += dx * dy * (double) n / (double) (n + 1.0);
130             xbar += dx / (double) (n + 1.0);
131             ybar += dy / (double) (n + 1.0);
132         }
133         sumX += x;
134         sumY += y;
135         n++;
136         
137         if (n > 2) {
138             distribution.setDegreesOfFreedom(n - 2);
139         }
140     }
141 
142     /**
143      * Adds the observations represented by the elements in 
144      * <code>data</code>.
145      * <p>
146      * <code>(data[0][0],data[0][1])</code> will be the first observation, then
147      * <code>(data[1][0],data[1][1])</code>, etc.</p>
148      * <p> 
149      * This method does not replace data that has already been added.  The
150      * observations represented by <code>data</code> are added to the existing
151      * dataset.</p>
152      * <p> 
153      * To replace all data, use <code>clear()</code> before adding the new 
154      * data.</p>
155      * 
156      * @param data array of observations to be added
157      */
158     public void addData(double[][] data) {
159         for (int i = 0; i < data.length; i++) {
160             addData(data[i][0], data[i][1]);
161         }
162     }
163 
164     /**
165      * Clears all data from the model.
166      */
167     public void clear() {
168         sumX = 0d;
169         sumXX = 0d;
170         sumY = 0d;
171         sumYY = 0d;
172         sumXY = 0d;
173         n = 0;
174     }
175 
176     /**
177      * Returns the number of observations that have been added to the model.
178      *
179      * @return n number of observations that have been added.
180      */
181     public long getN() {
182         return n;
183     }
184 
185     /**
186      * Returns the "predicted" <code>y</code> value associated with the 
187      * supplied <code>x</code> value,  based on the data that has been
188      * added to the model when this method is activated.
189      * <p>
190      * <code> predict(x) = intercept + slope * x </code></p>
191      * <p>
192      * <strong>Preconditions</strong>: <ul>
193      * <li>At least two observations (with at least two different x values)
194      * must have been added before invoking this method. If this method is 
195      * invoked before a model can be estimated, <code>Double,NaN</code> is
196      * returned.
197      * </li></ul></p>
198      *
199      * @param x input <code>x</code> value
200      * @return predicted <code>y</code> value
201      */
202     public double predict(double x) {
203         double b1 = getSlope();
204         return getIntercept(b1) + b1 * x;
205     }
206 
207     /**
208      * Returns the intercept of the estimated regression line.
209      * <p>
210      * The least squares estimate of the intercept is computed using the 
211      * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
212      * The intercept is sometimes denoted b0.</p>
213      * <p>
214      * <strong>Preconditions</strong>: <ul>
215      * <li>At least two observations (with at least two different x values)
216      * must have been added before invoking this method. If this method is 
217      * invoked before a model can be estimated, <code>Double,NaN</code> is
218      * returned.
219      * </li></ul></p>
220      *
221      * @return the intercept of the regression line
222      */
223     public double getIntercept() {
224         return getIntercept(getSlope());
225     }
226 
227     /**
228     * Returns the slope of the estimated regression line.  
229     * <p>
230     * The least squares estimate of the slope is computed using the 
231     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
232     * The slope is sometimes denoted b1.</p>
233     * <p>
234     * <strong>Preconditions</strong>: <ul>
235     * <li>At least two observations (with at least two different x values)
236     * must have been added before invoking this method. If this method is 
237     * invoked before a model can be estimated, <code>Double.NaN</code> is
238     * returned.
239     * </li></ul></p>
240     *
241     * @return the slope of the regression line
242     */
243     public double getSlope() {
244         if (n < 2) {
245             return Double.NaN; //not enough data 
246         }
247         if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
248             return Double.NaN; //not enough variation in x
249         }
250         return sumXY / sumXX;
251     }
252 
253     /**
254      * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
255      * sum of squared errors</a> (SSE) associated with the regression 
256      * model.
257      * <p>
258      * The sum is computed using the computational formula</p>
259      * <p>
260      * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
261      * <p>
262      * where <code>SYY</code> is the sum of the squared deviations of the y
263      * values about their mean, <code>SXX</code> is similarly defined and
264      * <code>SXY</code> is the sum of the products of x and y mean deviations.
265      * </p><p>
266      * The sums are accumulated using the updating algorithm referenced in 
267      * {@link #addData}.</p>
268      * <p>
269      * The return value is constrained to be non-negative - i.e., if due to 
270      * rounding errors the computational formula returns a negative result, 
271      * 0 is returned.</p>
272      * <p>
273      * <strong>Preconditions</strong>: <ul>
274      * <li>At least two observations (with at least two different x values)
275      * must have been added before invoking this method. If this method is 
276      * invoked before a model can be estimated, <code>Double,NaN</code> is
277      * returned.
278      * </li></ul></p>
279      *
280      * @return sum of squared errors associated with the regression model
281      */
282     public double getSumSquaredErrors() {
283         return Math.max(0d, sumYY - sumXY * sumXY / sumXX);
284     }
285 
286     /**
287      * Returns the sum of squared deviations of the y values about their mean.
288      * <p>
289      * This is defined as SSTO 
290      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
291      * <p>
292      * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
293      *
294      * @return sum of squared deviations of y values
295      */
296     public double getTotalSumSquares() {
297         if (n < 2) {
298             return Double.NaN;
299         }
300         return sumYY;
301     }
302 
303     /**
304      * Returns the sum of squared deviations of the predicted y values about 
305      * their mean (which equals the mean of y).
306      * <p>
307      * This is usually abbreviated SSR or SSM.  It is defined as SSM 
308      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
309      * <p>
310      * <strong>Preconditions</strong>: <ul>
311      * <li>At least two observations (with at least two different x values)
312      * must have been added before invoking this method. If this method is 
313      * invoked before a model can be estimated, <code>Double.NaN</code> is
314      * returned.
315      * </li></ul></p>
316      *
317      * @return sum of squared deviations of predicted y values
318      */
319     public double getRegressionSumSquares() {
320         return getRegressionSumSquares(getSlope());
321     }
322 
323     /**
324      * Returns the sum of squared errors divided by the degrees of freedom,
325      * usually abbreviated MSE. 
326      * <p>
327      * If there are fewer than <strong>three</strong> data pairs in the model,
328      * or if there is no variation in <code>x</code>, this returns 
329      * <code>Double.NaN</code>.</p>
330      *
331      * @return sum of squared deviations of y values
332      */
333     public double getMeanSquareError() {
334         if (n < 3) {
335             return Double.NaN;
336         }
337         return getSumSquaredErrors() / (double) (n - 2);
338     }
339 
340     /**
341      * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
342      * Pearson's product moment correlation coefficient</a>,
343      * usually denoted r. 
344      * <p>
345      * <strong>Preconditions</strong>: <ul>
346      * <li>At least two observations (with at least two different x values)
347      * must have been added before invoking this method. If this method is 
348      * invoked before a model can be estimated, <code>Double,NaN</code> is
349      * returned.
350      * </li></ul></p>
351      *
352      * @return Pearson's r
353      */
354     public double getR() {
355         double b1 = getSlope();
356         double result = Math.sqrt(getRSquare());
357         if (b1 < 0) {
358             result = -result;
359         }
360         return result;
361     }
362 
363     /** 
364      * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 
365      * coefficient of determination</a>,
366      * usually denoted r-square. 
367      * <p>
368      * <strong>Preconditions</strong>: <ul>
369      * <li>At least two observations (with at least two different x values)
370      * must have been added before invoking this method. If this method is 
371      * invoked before a model can be estimated, <code>Double,NaN</code> is
372      * returned.
373      * </li></ul></p>
374      *
375      * @return r-square
376      */
377     public double getRSquare() {
378         double ssto = getTotalSumSquares();
379         return (ssto - getSumSquaredErrors()) / ssto;
380     }
381 
382     /**
383      * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
384      * standard error of the intercept estimate</a>, 
385      * usually denoted s(b0). 
386      * <p>
387      * If there are fewer that <strong>three</strong> observations in the 
388      * model, or if there is no variation in x, this returns 
389      * <code>Double.NaN</code>.</p>
390      *
391      * @return standard error associated with intercept estimate
392      */
393     public double getInterceptStdErr() {
394         return Math.sqrt(
395             getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
396     }
397 
398     /**
399      * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
400      * error of the slope estimate</a>,
401      * usually denoted s(b1). 
402      * <p>
403      * If there are fewer that <strong>three</strong> data pairs in the model,
404      * or if there is no variation in x, this returns <code>Double.NaN</code>.
405      * </p>
406      * 
407      * @return standard error associated with slope estimate
408      */
409     public double getSlopeStdErr() {
410         return Math.sqrt(getMeanSquareError() / sumXX);
411     }
412 
413     /**
414      * Returns the half-width of a 95% confidence interval for the slope
415      * estimate.
416      * <p>
417      * The 95% confidence interval is</p>
418      * <p>
419      * <code>(getSlope() - getSlopeConfidenceInterval(), 
420      * getSlope() + getSlopeConfidenceInterval())</code></p>
421      * <p>
422      * If there are fewer that <strong>three</strong> observations in the 
423      * model, or if there is no variation in x, this returns 
424      * <code>Double.NaN</code>.</p>
425      * <p>
426      * <strong>Usage Note</strong>:<br>
427      * The validity of this statistic depends on the assumption that the 
428      * observations included in the model are drawn from a
429      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
430      * Bivariate Normal Distribution</a>.</p>
431      *
432      * @return half-width of 95% confidence interval for the slope estimate
433      * @throws MathException if the confidence interval can not be computed.
434      */
435     public double getSlopeConfidenceInterval() throws MathException {
436         return getSlopeConfidenceInterval(0.05d);
437     }
438 
439     /**
440      * Returns the half-width of a (100-100*alpha)% confidence interval for 
441      * the slope estimate.
442      * <p>
443      * The (100-100*alpha)% confidence interval is </p>
444      * <p>
445      * <code>(getSlope() - getSlopeConfidenceInterval(), 
446      * getSlope() + getSlopeConfidenceInterval())</code></p>
447      * <p>
448      * To request, for example, a 99% confidence interval, use 
449      * <code>alpha = .01</code></p>
450      * <p>
451      * <strong>Usage Note</strong>:<br>
452      * The validity of this statistic depends on the assumption that the 
453      * observations included in the model are drawn from a
454      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
455      * Bivariate Normal Distribution</a>.</p>
456      * <p>
457      * <strong> Preconditions:</strong><ul>
458      * <li>If there are fewer that <strong>three</strong> observations in the 
459      * model, or if there is no variation in x, this returns 
460      * <code>Double.NaN</code>.
461      * </li>
462      * <li><code>(0 < alpha < 1)</code>; otherwise an 
463      * <code>IllegalArgumentException</code> is thrown.
464      * </li></ul></p> 
465      *
466      * @param alpha the desired significance level 
467      * @return half-width of 95% confidence interval for the slope estimate
468      * @throws MathException if the confidence interval can not be computed.
469      */
470     public double getSlopeConfidenceInterval(double alpha)
471         throws MathException {
472         if (alpha >= 1 || alpha <= 0) {
473             throw new IllegalArgumentException();
474         }
475         return getSlopeStdErr() *
476             distribution.inverseCumulativeProbability(1d - alpha / 2d);
477     }
478 
479     /**
480      * Returns the significance level of the slope (equiv) correlation. 
481      * <p>
482      * Specifically, the returned value is the smallest <code>alpha</code>
483      * such that the slope confidence interval with significance level
484      * equal to <code>alpha</code> does not include <code>0</code>.
485      * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
486      * </p><p>
487      * <strong>Usage Note</strong>:<br>
488      * The validity of this statistic depends on the assumption that the 
489      * observations included in the model are drawn from a
490      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
491      * Bivariate Normal Distribution</a>.</p>
492      * <p>
493      * If there are fewer that <strong>three</strong> observations in the 
494      * model, or if there is no variation in x, this returns 
495      * <code>Double.NaN</code>.</p>
496      *
497      * @return significance level for slope/correlation
498      * @throws MathException if the significance level can not be computed.
499      */
500     public double getSignificance() throws MathException {
501         return 2d * (1.0 - distribution.cumulativeProbability(
502                     Math.abs(getSlope()) / getSlopeStdErr()));
503     }
504 
505     // ---------------------Private methods-----------------------------------
506 
507     /**
508     * Returns the intercept of the estimated regression line, given the slope.
509     * <p>
510     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
511     *
512     * @param slope current slope
513     * @return the intercept of the regression line
514     */
515     private double getIntercept(double slope) {
516         return (sumY - slope * sumX) / ((double) n);
517     }
518 
519     /**
520      * Computes SSR from b1.
521      * 
522      * @param slope regression slope estimate
523      * @return sum of squared deviations of predicted y values
524      */
525     private double getRegressionSumSquares(double slope) {
526         return slope * slope * sumXX;
527     }
528     
529     /**
530      * Modify the distribution used to compute inference statistics.
531      * @param value the new distribution
532      * @since 1.2
533      */
534     public void setDistribution(TDistribution value) {
535         distribution = value;
536         
537         // modify degrees of freedom
538         if (n > 2) {
539             distribution.setDegreesOfFreedom(n - 2);
540         }
541     }
542 }