001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math3.stat.regression;
019    import java.io.Serializable;
020    
021    import org.apache.commons.math3.exception.OutOfRangeException;
022    import org.apache.commons.math3.distribution.TDistribution;
023    import org.apache.commons.math3.exception.MathIllegalArgumentException;
024    import org.apache.commons.math3.exception.NoDataException;
025    import org.apache.commons.math3.exception.util.LocalizedFormats;
026    import org.apache.commons.math3.util.FastMath;
027    import org.apache.commons.math3.util.Precision;
028    
029    /**
030     * Estimates an ordinary least squares regression model
031     * with one independent variable.
032     * <p>
033     * <code> y = intercept + slope * x  </code></p>
034     * <p>
035     * Standard errors for <code>intercept</code> and <code>slope</code> are
036     * available as well as ANOVA, r-square and Pearson's r statistics.</p>
037     * <p>
038     * Observations (x,y pairs) can be added to the model one at a time or they
039     * can be provided in a 2-dimensional array.  The observations are not stored
040     * in memory, so there is no limit to the number of observations that can be
041     * added to the model.</p>
042     * <p>
043     * <strong>Usage Notes</strong>: <ul>
044     * <li> When there are fewer than two observations in the model, or when
045     * there is no variation in the x values (i.e. all x values are the same)
046     * all statistics return <code>NaN</code>. At least two observations with
047     * different x coordinates are required to estimate a bivariate regression
048     * model.
049     * </li>
050     * <li> Getters for the statistics always compute values based on the current
051     * set of observations -- i.e., you can get statistics, then add more data
052     * and get updated statistics without using a new instance.  There is no
053     * "compute" method that updates all statistics.  Each of the getters performs
054     * the necessary computations to return the requested statistic.
055     * </li>
056     * <li> The intercept term may be suppressed by passing {@code false} to
057     * the {@link #SimpleRegression(boolean)} constructor.  When the
058     * {@code hasIntercept} property is false, the model is estimated without a
059     * constant term and {@link #getIntercept()} returns {@code 0}.</li>
060     * </ul></p>
061     *
062     * @version $Id: SimpleRegression.java 1416643 2012-12-03 19:37:14Z tn $
063     */
064    public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {
065    
    /** Serializable version identifier */
    private static final long serialVersionUID = -3004689053607543335L;

    /** Sum of x values. */
    private double sumX = 0d;

    /** Total variation in x: sum of squared deviations from xbar when the model
     * has an intercept; sum of raw squares x*x when it does not. */
    private double sumXX = 0d;

    /** Sum of y values. */
    private double sumY = 0d;

    /** Total variation in y: sum of squared deviations from ybar when the model
     * has an intercept; sum of raw squares y*y when it does not. */
    private double sumYY = 0d;

    /** Sum of cross products: centered (deviations from the means) with an
     * intercept, raw x*y without one. */
    private double sumXY = 0d;

    /** Number of observations currently in the model. */
    private long n = 0;

    /** Mean of accumulated x values, used in the updating formulas. */
    private double xbar = 0;

    /** Mean of accumulated y values, used in the updating formulas. */
    private double ybar = 0;

    /** Whether the model includes an intercept term (fixed at construction). */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------
096    
097        /**
098         * Create an empty SimpleRegression instance
099         */
100        public SimpleRegression() {
101            this(true);
102        }
103        /**
104        * Create a SimpleRegression instance, specifying whether or not to estimate
105        * an intercept.
106        *
107        * <p>Use {@code false} to estimate a model with no intercept.  When the
108        * {@code hasIntercept} property is false, the model is estimated without a
109        * constant term and {@link #getIntercept()} returns {@code 0}.</p>
110        *
111        * @param includeIntercept whether or not to include an intercept term in
112        * the regression model
113        */
114        public SimpleRegression(boolean includeIntercept) {
115            super();
116            hasIntercept = includeIntercept;
117        }
118    
119        /**
120         * Adds the observation (x,y) to the regression data set.
121         * <p>
122         * Uses updating formulas for means and sums of squares defined in
123         * "Algorithms for Computing the Sample Variance: Analysis and
124         * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
125         * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
126         * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
127         *
128         *
129         * @param x independent variable value
130         * @param y dependent variable value
131         */
132        public void addData(final double x,final double y) {
133            if (n == 0) {
134                xbar = x;
135                ybar = y;
136            } else {
137                if( hasIntercept ){
138                    final double fact1 = 1.0 + n;
139                    final double fact2 = n / (1.0 + n);
140                    final double dx = x - xbar;
141                    final double dy = y - ybar;
142                    sumXX += dx * dx * fact2;
143                    sumYY += dy * dy * fact2;
144                    sumXY += dx * dy * fact2;
145                    xbar += dx / fact1;
146                    ybar += dy / fact1;
147                }
148             }
149            if( !hasIntercept ){
150                sumXX += x * x ;
151                sumYY += y * y ;
152                sumXY += x * y ;
153            }
154            sumX += x;
155            sumY += y;
156            n++;
157        }
158    
159    
160        /**
161         * Removes the observation (x,y) from the regression data set.
162         * <p>
163         * Mirrors the addData method.  This method permits the use of
164         * SimpleRegression instances in streaming mode where the regression
165         * is applied to a sliding "window" of observations, however the caller is
166         * responsible for maintaining the set of observations in the window.</p>
167         *
168         * The method has no effect if there are no points of data (i.e. n=0)
169         *
170         * @param x independent variable value
171         * @param y dependent variable value
172         */
173        public void removeData(final double x,final double y) {
174            if (n > 0) {
175                if (hasIntercept) {
176                    final double fact1 = n - 1.0;
177                    final double fact2 = n / (n - 1.0);
178                    final double dx = x - xbar;
179                    final double dy = y - ybar;
180                    sumXX -= dx * dx * fact2;
181                    sumYY -= dy * dy * fact2;
182                    sumXY -= dx * dy * fact2;
183                    xbar -= dx / fact1;
184                    ybar -= dy / fact1;
185                } else {
186                    final double fact1 = n - 1.0;
187                    sumXX -= x * x;
188                    sumYY -= y * y;
189                    sumXY -= x * y;
190                    xbar -= x / fact1;
191                    ybar -= y / fact1;
192                }
193                 sumX -= x;
194                 sumY -= y;
195                 n--;
196            }
197        }
198    
199        /**
200         * Adds the observations represented by the elements in
201         * <code>data</code>.
202         * <p>
203         * <code>(data[0][0],data[0][1])</code> will be the first observation, then
204         * <code>(data[1][0],data[1][1])</code>, etc.</p>
205         * <p>
206         * This method does not replace data that has already been added.  The
207         * observations represented by <code>data</code> are added to the existing
208         * dataset.</p>
209         * <p>
210         * To replace all data, use <code>clear()</code> before adding the new
211         * data.</p>
212         *
213         * @param data array of observations to be added
214         * @throws ModelSpecificationException if the length of {@code data[i]} is not
215         * greater than or equal to 2
216         */
217        public void addData(final double[][] data) throws ModelSpecificationException {
218            for (int i = 0; i < data.length; i++) {
219                if( data[i].length < 2 ){
220                   throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
221                        data[i].length, 2);
222                }
223                addData(data[i][0], data[i][1]);
224            }
225        }
226    
227        /**
228         * Adds one observation to the regression model.
229         *
230         * @param x the independent variables which form the design matrix
231         * @param y the dependent or response variable
232         * @throws ModelSpecificationException if the length of {@code x} does not equal
233         * the number of independent variables in the model
234         */
235        public void addObservation(final double[] x,final double y)
236        throws ModelSpecificationException {
237            if( x == null || x.length == 0 ){
238                throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1);
239            }
240            addData( x[0], y );
241        }
242    
243        /**
244         * Adds a series of observations to the regression model. The lengths of
245         * x and y must be the same and x must be rectangular.
246         *
247         * @param x a series of observations on the independent variables
248         * @param y a series of observations on the dependent variable
249         * The length of x and y must be the same
250         * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
251         * the length of {@code y} or does not contain sufficient data to estimate the model
252         */
253        public void addObservations(final double[][] x,final double[] y) throws ModelSpecificationException {
254            if ((x == null) || (y == null) || (x.length != y.length)) {
255                throw new ModelSpecificationException(
256                      LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
257                      (x == null) ? 0 : x.length,
258                      (y == null) ? 0 : y.length);
259            }
260            boolean obsOk=true;
261            for( int i = 0 ; i < x.length; i++){
262                if( x[i] == null || x[i].length == 0 ){
263                    obsOk = false;
264                }
265            }
266            if( !obsOk ){
267                throw new ModelSpecificationException(
268                      LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
269                      0, 1);
270            }
271            for( int i = 0 ; i < x.length ; i++){
272                addData( x[i][0], y[i] );
273            }
274        }
275    
276        /**
277         * Removes observations represented by the elements in <code>data</code>.
278          * <p>
279         * If the array is larger than the current n, only the first n elements are
280         * processed.  This method permits the use of SimpleRegression instances in
281         * streaming mode where the regression is applied to a sliding "window" of
282         * observations, however the caller is responsible for maintaining the set
283         * of observations in the window.</p>
284         * <p>
285         * To remove all data, use <code>clear()</code>.</p>
286         *
287         * @param data array of observations to be removed
288         */
289        public void removeData(double[][] data) {
290            for (int i = 0; i < data.length && n > 0; i++) {
291                removeData(data[i][0], data[i][1]);
292            }
293        }
294    
295        /**
296         * Clears all data from the model.
297         */
298        public void clear() {
299            sumX = 0d;
300            sumXX = 0d;
301            sumY = 0d;
302            sumYY = 0d;
303            sumXY = 0d;
304            n = 0;
305        }
306    
307        /**
308         * Returns the number of observations that have been added to the model.
309         *
310         * @return n number of observations that have been added.
311         */
312        public long getN() {
313            return n;
314        }
315    
316        /**
317         * Returns the "predicted" <code>y</code> value associated with the
318         * supplied <code>x</code> value,  based on the data that has been
319         * added to the model when this method is activated.
320         * <p>
321         * <code> predict(x) = intercept + slope * x </code></p>
322         * <p>
323         * <strong>Preconditions</strong>: <ul>
324         * <li>At least two observations (with at least two different x values)
325         * must have been added before invoking this method. If this method is
326         * invoked before a model can be estimated, <code>Double,NaN</code> is
327         * returned.
328         * </li></ul></p>
329         *
330         * @param x input <code>x</code> value
331         * @return predicted <code>y</code> value
332         */
333        public double predict(final double x) {
334            final double b1 = getSlope();
335            if (hasIntercept) {
336                return getIntercept(b1) + b1 * x;
337            }
338            return b1 * x;
339        }
340    
341        /**
342         * Returns the intercept of the estimated regression line, if
343         * {@link #hasIntercept()} is true; otherwise 0.
344         * <p>
345         * The least squares estimate of the intercept is computed using the
346         * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
347         * The intercept is sometimes denoted b0.</p>
348         * <p>
349         * <strong>Preconditions</strong>: <ul>
350         * <li>At least two observations (with at least two different x values)
351         * must have been added before invoking this method. If this method is
352         * invoked before a model can be estimated, <code>Double,NaN</code> is
353         * returned.
354         * </li></ul></p>
355         *
356         * @return the intercept of the regression line if the model includes an
357         * intercept; 0 otherwise
358         * @see #SimpleRegression(boolean)
359         */
360        public double getIntercept() {
361            return hasIntercept ? getIntercept(getSlope()) : 0.0;
362        }
363    
364        /**
365         * Returns true if the model includes an intercept term.
366         *
367         * @return true if the regression includes an intercept; false otherwise
368         * @see #SimpleRegression(boolean)
369         */
370        public boolean hasIntercept() {
371            return hasIntercept;
372        }
373    
374        /**
375        * Returns the slope of the estimated regression line.
376        * <p>
377        * The least squares estimate of the slope is computed using the
378        * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
379        * The slope is sometimes denoted b1.</p>
380        * <p>
381        * <strong>Preconditions</strong>: <ul>
382        * <li>At least two observations (with at least two different x values)
383        * must have been added before invoking this method. If this method is
384        * invoked before a model can be estimated, <code>Double.NaN</code> is
385        * returned.
386        * </li></ul></p>
387        *
388        * @return the slope of the regression line
389        */
390        public double getSlope() {
391            if (n < 2) {
392                return Double.NaN; //not enough data
393            }
394            if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
395                return Double.NaN; //not enough variation in x
396            }
397            return sumXY / sumXX;
398        }
399    
400        /**
401         * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
402         * sum of squared errors</a> (SSE) associated with the regression
403         * model.
404         * <p>
405         * The sum is computed using the computational formula</p>
406         * <p>
407         * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
408         * <p>
409         * where <code>SYY</code> is the sum of the squared deviations of the y
410         * values about their mean, <code>SXX</code> is similarly defined and
411         * <code>SXY</code> is the sum of the products of x and y mean deviations.
412         * </p><p>
413         * The sums are accumulated using the updating algorithm referenced in
414         * {@link #addData}.</p>
415         * <p>
416         * The return value is constrained to be non-negative - i.e., if due to
417         * rounding errors the computational formula returns a negative result,
418         * 0 is returned.</p>
419         * <p>
420         * <strong>Preconditions</strong>: <ul>
421         * <li>At least two observations (with at least two different x values)
422         * must have been added before invoking this method. If this method is
423         * invoked before a model can be estimated, <code>Double,NaN</code> is
424         * returned.
425         * </li></ul></p>
426         *
427         * @return sum of squared errors associated with the regression model
428         */
429        public double getSumSquaredErrors() {
430            return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
431        }
432    
433        /**
434         * Returns the sum of squared deviations of the y values about their mean.
435         * <p>
436         * This is defined as SSTO
437         * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
438         * <p>
439         * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
440         *
441         * @return sum of squared deviations of y values
442         */
443        public double getTotalSumSquares() {
444            if (n < 2) {
445                return Double.NaN;
446            }
447            return sumYY;
448        }
449    
450        /**
451         * Returns the sum of squared deviations of the x values about their mean.
452         *
453         * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
454         *
455         * @return sum of squared deviations of x values
456         */
457        public double getXSumSquares() {
458            if (n < 2) {
459                return Double.NaN;
460            }
461            return sumXX;
462        }
463    
464        /**
465         * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
466         *
467         * @return sum of cross products
468         */
469        public double getSumOfCrossProducts() {
470            return sumXY;
471        }
472    
473        /**
474         * Returns the sum of squared deviations of the predicted y values about
475         * their mean (which equals the mean of y).
476         * <p>
477         * This is usually abbreviated SSR or SSM.  It is defined as SSM
478         * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
479         * <p>
480         * <strong>Preconditions</strong>: <ul>
481         * <li>At least two observations (with at least two different x values)
482         * must have been added before invoking this method. If this method is
483         * invoked before a model can be estimated, <code>Double.NaN</code> is
484         * returned.
485         * </li></ul></p>
486         *
487         * @return sum of squared deviations of predicted y values
488         */
489        public double getRegressionSumSquares() {
490            return getRegressionSumSquares(getSlope());
491        }
492    
493        /**
494         * Returns the sum of squared errors divided by the degrees of freedom,
495         * usually abbreviated MSE.
496         * <p>
497         * If there are fewer than <strong>three</strong> data pairs in the model,
498         * or if there is no variation in <code>x</code>, this returns
499         * <code>Double.NaN</code>.</p>
500         *
501         * @return sum of squared deviations of y values
502         */
503        public double getMeanSquareError() {
504            if (n < 3) {
505                return Double.NaN;
506            }
507            return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
508        }
509    
510        /**
511         * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
512         * Pearson's product moment correlation coefficient</a>,
513         * usually denoted r.
514         * <p>
515         * <strong>Preconditions</strong>: <ul>
516         * <li>At least two observations (with at least two different x values)
517         * must have been added before invoking this method. If this method is
518         * invoked before a model can be estimated, <code>Double,NaN</code> is
519         * returned.
520         * </li></ul></p>
521         *
522         * @return Pearson's r
523         */
524        public double getR() {
525            double b1 = getSlope();
526            double result = FastMath.sqrt(getRSquare());
527            if (b1 < 0) {
528                result = -result;
529            }
530            return result;
531        }
532    
533        /**
534         * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
535         * coefficient of determination</a>,
536         * usually denoted r-square.
537         * <p>
538         * <strong>Preconditions</strong>: <ul>
539         * <li>At least two observations (with at least two different x values)
540         * must have been added before invoking this method. If this method is
541         * invoked before a model can be estimated, <code>Double,NaN</code> is
542         * returned.
543         * </li></ul></p>
544         *
545         * @return r-square
546         */
547        public double getRSquare() {
548            double ssto = getTotalSumSquares();
549            return (ssto - getSumSquaredErrors()) / ssto;
550        }
551    
552        /**
553         * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
554         * standard error of the intercept estimate</a>,
555         * usually denoted s(b0).
556         * <p>
557         * If there are fewer that <strong>three</strong> observations in the
558         * model, or if there is no variation in x, this returns
559         * <code>Double.NaN</code>.</p> Additionally, a <code>Double.NaN</code> is
560         * returned when the intercept is constrained to be zero
561         *
562         * @return standard error associated with intercept estimate
563         */
564        public double getInterceptStdErr() {
565            if( !hasIntercept ){
566                return Double.NaN;
567            }
568            return FastMath.sqrt(
569                getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
570        }
571    
572        /**
573         * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
574         * error of the slope estimate</a>,
575         * usually denoted s(b1).
576         * <p>
577         * If there are fewer that <strong>three</strong> data pairs in the model,
578         * or if there is no variation in x, this returns <code>Double.NaN</code>.
579         * </p>
580         *
581         * @return standard error associated with slope estimate
582         */
583        public double getSlopeStdErr() {
584            return FastMath.sqrt(getMeanSquareError() / sumXX);
585        }
586    
587        /**
588         * Returns the half-width of a 95% confidence interval for the slope
589         * estimate.
590         * <p>
591         * The 95% confidence interval is</p>
592         * <p>
593         * <code>(getSlope() - getSlopeConfidenceInterval(),
594         * getSlope() + getSlopeConfidenceInterval())</code></p>
595         * <p>
596         * If there are fewer that <strong>three</strong> observations in the
597         * model, or if there is no variation in x, this returns
598         * <code>Double.NaN</code>.</p>
599         * <p>
600         * <strong>Usage Note</strong>:<br>
601         * The validity of this statistic depends on the assumption that the
602         * observations included in the model are drawn from a
603         * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
604         * Bivariate Normal Distribution</a>.</p>
605         *
606         * @return half-width of 95% confidence interval for the slope estimate
607         * @throws OutOfRangeException if the confidence interval can not be computed.
608         */
609        public double getSlopeConfidenceInterval() throws OutOfRangeException {
610            return getSlopeConfidenceInterval(0.05d);
611        }
612    
613        /**
614         * Returns the half-width of a (100-100*alpha)% confidence interval for
615         * the slope estimate.
616         * <p>
617         * The (100-100*alpha)% confidence interval is </p>
618         * <p>
619         * <code>(getSlope() - getSlopeConfidenceInterval(),
620         * getSlope() + getSlopeConfidenceInterval())</code></p>
621         * <p>
622         * To request, for example, a 99% confidence interval, use
623         * <code>alpha = .01</code></p>
624         * <p>
625         * <strong>Usage Note</strong>:<br>
626         * The validity of this statistic depends on the assumption that the
627         * observations included in the model are drawn from a
628         * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
629         * Bivariate Normal Distribution</a>.</p>
630         * <p>
631         * <strong> Preconditions:</strong><ul>
632         * <li>If there are fewer that <strong>three</strong> observations in the
633         * model, or if there is no variation in x, this returns
634         * <code>Double.NaN</code>.
635         * </li>
636         * <li><code>(0 < alpha < 1)</code>; otherwise an
637         * <code>OutOfRangeException</code> is thrown.
638         * </li></ul></p>
639         *
640         * @param alpha the desired significance level
641         * @return half-width of 95% confidence interval for the slope estimate
642         * @throws OutOfRangeException if the confidence interval can not be computed.
643         */
644        public double getSlopeConfidenceInterval(final double alpha)
645        throws OutOfRangeException {
646            if (n < 3) {
647                return Double.NaN;
648            }
649            if (alpha >= 1 || alpha <= 0) {
650                throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
651                                              alpha, 0, 1);
652            }
653            // No advertised NotStrictlyPositiveException here - will return NaN above
654            TDistribution distribution = new TDistribution(n - 2);
655            return getSlopeStdErr() *
656                distribution.inverseCumulativeProbability(1d - alpha / 2d);
657        }
658    
659        /**
660         * Returns the significance level of the slope (equiv) correlation.
661         * <p>
662         * Specifically, the returned value is the smallest <code>alpha</code>
663         * such that the slope confidence interval with significance level
664         * equal to <code>alpha</code> does not include <code>0</code>.
665         * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
666         * </p><p>
667         * <strong>Usage Note</strong>:<br>
668         * The validity of this statistic depends on the assumption that the
669         * observations included in the model are drawn from a
670         * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
671         * Bivariate Normal Distribution</a>.</p>
672         * <p>
673         * If there are fewer that <strong>three</strong> observations in the
674         * model, or if there is no variation in x, this returns
675         * <code>Double.NaN</code>.</p>
676         *
677         * @return significance level for slope/correlation
678         * @throws org.apache.commons.math3.exception.MaxCountExceededException
679         * if the significance level can not be computed.
680         */
681        public double getSignificance() {
682            if (n < 3) {
683                return Double.NaN;
684            }
685            // No advertised NotStrictlyPositiveException here - will return NaN above
686            TDistribution distribution = new TDistribution(n - 2);
687            return 2d * (1.0 - distribution.cumulativeProbability(
688                        FastMath.abs(getSlope()) / getSlopeStdErr()));
689        }
690    
691        // ---------------------Private methods-----------------------------------
692    
693        /**
694        * Returns the intercept of the estimated regression line, given the slope.
695        * <p>
696        * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
697        *
698        * @param slope current slope
699        * @return the intercept of the regression line
700        */
701        private double getIntercept(final double slope) {
702          if( hasIntercept){
703            return (sumY - slope * sumX) / n;
704          }
705          return 0.0;
706        }
707    
708        /**
709         * Computes SSR from b1.
710         *
711         * @param slope regression slope estimate
712         * @return sum of squared deviations of predicted y values
713         */
714        private double getRegressionSumSquares(final double slope) {
715            return slope * slope * sumXX;
716        }
717    
718        /**
719         * Performs a regression on data present in buffers and outputs a RegressionResults object.
720         *
721         * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true
722         * a {@code NoDataException} is thrown.  If there is no intercept term, the model must
723         * contain at least 2 observations.</p>
724         *
725         * @return RegressionResults acts as a container of regression output
726         * @throws ModelSpecificationException if the model is not correctly specified
727         * @throws NoDataException if there is not sufficient data in the model to
728         * estimate the regression parameters
729         */
730        public RegressionResults regress() throws ModelSpecificationException, NoDataException {
731            if (hasIntercept) {
732              if( n < 3 ){
733                  throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
734              }
735              if( FastMath.abs( sumXX ) > Precision.SAFE_MIN ){
736                  final double[] params = new double[]{ getIntercept(), getSlope() };
737                  final double mse = getMeanSquareError();
738                  final double _syy = sumYY + sumY * sumY / n;
739                  final double[] vcv = new double[]{
740                    mse * (xbar *xbar /sumXX + 1.0 / n),
741                    -xbar*mse/sumXX,
742                    mse/sumXX };
743                  return new RegressionResults(
744                          params, new double[][]{vcv}, true, n, 2,
745                          sumY, _syy, getSumSquaredErrors(),true,false);
746              }else{
747                  final double[] params = new double[]{ sumY / n, Double.NaN };
748                  //final double mse = getMeanSquareError();
749                  final double[] vcv = new double[]{
750                    ybar / (n - 1.0),
751                    Double.NaN,
752                    Double.NaN };
753                  return new RegressionResults(
754                          params, new double[][]{vcv}, true, n, 1,
755                          sumY, sumYY, getSumSquaredErrors(),true,false);
756              }
757            }else{
758              if (n < 2) {
759                  throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
760              }
761              if( !Double.isNaN(sumXX) ){
762              final double[] vcv = new double[]{ getMeanSquareError() / sumXX };
763              final double[] params = new double[]{ sumXY/sumXX };
764              return new RegressionResults(
765                          params, new double[][]{vcv}, true, n, 1,
766                          sumY, sumYY, getSumSquaredErrors(),false,false);
767              }else{
768              final double[] vcv = new double[]{Double.NaN };
769              final double[] params = new double[]{ Double.NaN };
770              return new RegressionResults(
771                          params, new double[][]{vcv}, true, n, 1,
772                          Double.NaN, Double.NaN, Double.NaN,false,false);
773              }
774            }
775        }
776    
777        /**
778         * Performs a regression on data present in buffers including only regressors
779         * indexed in variablesToInclude and outputs a RegressionResults object
780         * @param variablesToInclude an array of indices of regressors to include
781         * @return RegressionResults acts as a container of regression output
782         * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
783         * @throws OutOfRangeException if a requested variable is not present in model
784         */
785        public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException{
786            if( variablesToInclude == null || variablesToInclude.length == 0){
787              throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
788            }
789            if( variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept) ){
790                throw new ModelSpecificationException(
791                        LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
792                        (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
793            }
794    
795            if( hasIntercept ){
796                if( variablesToInclude.length == 2 ){
797                    if( variablesToInclude[0] == 1 ){
798                        throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
799                    }else if( variablesToInclude[0] != 0 ){
800                        throw new OutOfRangeException( variablesToInclude[0], 0,1 );
801                    }
802                    if( variablesToInclude[1] != 1){
803                         throw new OutOfRangeException( variablesToInclude[0], 0,1 );
804                    }
805                    return regress();
806                }else{
807                    if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ){
808                         throw new OutOfRangeException( variablesToInclude[0],0,1 );
809                    }
810                    final double _mean = sumY * sumY / n;
811                    final double _syy = sumYY + _mean;
812                    if( variablesToInclude[0] == 0 ){
813                        //just the mean
814                        final double[] vcv = new double[]{ sumYY/(((n-1)*n)) };
815                        final double[] params = new double[]{ ybar };
816                        return new RegressionResults(
817                          params, new double[][]{vcv}, true, n, 1,
818                          sumY, _syy+_mean, sumYY,true,false);
819    
820                    }else if( variablesToInclude[0] == 1){
821                        //final double _syy = sumYY + sumY * sumY / ((double) n);
822                        final double _sxx = sumXX + sumX * sumX / n;
823                        final double _sxy = sumXY + sumX * sumY / n;
824                        final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
825                        final double _mse = _sse/((n-1));
826                        if( !Double.isNaN(_sxx) ){
827                            final double[] vcv = new double[]{ _mse / _sxx };
828                            final double[] params = new double[]{ _sxy/_sxx };
829                            return new RegressionResults(
830                                        params, new double[][]{vcv}, true, n, 1,
831                                        sumY, _syy, _sse,false,false);
832                        }else{
833                            final double[] vcv = new double[]{Double.NaN };
834                            final double[] params = new double[]{ Double.NaN };
835                            return new RegressionResults(
836                                        params, new double[][]{vcv}, true, n, 1,
837                                        Double.NaN, Double.NaN, Double.NaN,false,false);
838                        }
839                    }
840                }
841            }else{
842                if( variablesToInclude[0] != 0 ){
843                    throw new OutOfRangeException(variablesToInclude[0],0,0);
844                }
845                return regress();
846            }
847    
848            return null;
849        }
850    }