001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math3.stat.descriptive.moment;
018    
019    import java.io.Serializable;
020    
021    import org.apache.commons.math3.exception.MathIllegalArgumentException;
022    import org.apache.commons.math3.exception.NullArgumentException;
023    import org.apache.commons.math3.exception.util.LocalizedFormats;
024    import org.apache.commons.math3.stat.descriptive.WeightedEvaluation;
025    import org.apache.commons.math3.stat.descriptive.AbstractStorelessUnivariateStatistic;
026    import org.apache.commons.math3.util.MathUtils;
027    
028    /**
029     * Computes the variance of the available values.  By default, the unbiased
030     * "sample variance" definitional formula is used:
031     * <p>
032     * variance = sum((x_i - mean)^2) / (n - 1) </p>
033     * <p>
034     * where mean is the {@link Mean} and <code>n</code> is the number
035     * of sample observations.</p>
036     * <p>
037     * The definitional formula does not have good numerical properties, so
038     * this implementation does not compute the statistic using the definitional
039     * formula. <ul>
040     * <li> The <code>getResult</code> method computes the variance using
041     * updating formulas based on West's algorithm, as described in
042     * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
043     * J. G. Lewis 1979, <i>Communications of the ACM</i>,
044     * vol. 22 no. 9, pp. 526-531.</a></li>
045     * <li> The <code>evaluate</code> methods leverage the fact that they have the
046     * full array of values in memory to execute a two-pass algorithm.
047     * Specifically, these methods use the "corrected two-pass algorithm" from
048     * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
049     * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
050     * Note that adding values using <code>increment</code> or
051     * <code>incrementAll</code> and then executing <code>getResult</code> will
052     * sometimes give a different, less accurate, result than executing
053     * <code>evaluate</code> with the full array of values. The former approach
054     * should only be used when the full array of values is not available.</p>
055     * <p>
056     * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
057     * be computed using this statistic.  The <code>isBiasCorrected</code>
058     * property determines whether the "population" or "sample" value is
059     * returned by the <code>evaluate</code> and <code>getResult</code> methods.
060     * To compute population variances, set this property to <code>false.</code>
061     * </p>
062     * <p>
063     * <strong>Note that this implementation is not synchronized.</strong> If
064     * multiple threads access an instance of this class concurrently, and at least
065     * one of the threads invokes the <code>increment()</code> or
066     * <code>clear()</code> method, it must be synchronized externally.</p>
067     *
068     * @version $Id: Variance.java 1416643 2012-12-03 19:37:14Z tn $
069     */
070    public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
071    
072        /** Serializable version identifier */
073        private static final long serialVersionUID = -9111962718267217978L;
074    
075        /** SecondMoment is used in incremental calculation of Variance*/
076        protected SecondMoment moment = null;
077    
078        /**
079         * Whether or not {@link #increment(double)} should increment
080         * the internal second moment. When a Variance is constructed with an
081         * external SecondMoment as a constructor parameter, this property is
082         * set to false and increments must be applied to the second moment
083         * directly.
084         */
085        protected boolean incMoment = true;
086    
087        /**
088         * Whether or not bias correction is applied when computing the
089         * value of the statistic. True means that bias is corrected.  See
090         * {@link Variance} for details on the formula.
091         */
092        private boolean isBiasCorrected = true;
093    
094        /**
095         * Constructs a Variance with default (true) <code>isBiasCorrected</code>
096         * property.
097         */
098        public Variance() {
099            moment = new SecondMoment();
100        }
101    
102        /**
103         * Constructs a Variance based on an external second moment.
104         * When this constructor is used, the statistic may only be
105         * incremented via the moment, i.e., {@link #increment(double)}
106         * does nothing; whereas {@code m2.increment(value)} increments
107         * both {@code m2} and the Variance instance constructed from it.
108         *
109         * @param m2 the SecondMoment (Third or Fourth moments work
110         * here as well.)
111         */
112        public Variance(final SecondMoment m2) {
113            incMoment = false;
114            this.moment = m2;
115        }
116    
117        /**
118         * Constructs a Variance with the specified <code>isBiasCorrected</code>
119         * property
120         *
121         * @param isBiasCorrected  setting for bias correction - true means
122         * bias will be corrected and is equivalent to using the argumentless
123         * constructor
124         */
125        public Variance(boolean isBiasCorrected) {
126            moment = new SecondMoment();
127            this.isBiasCorrected = isBiasCorrected;
128        }
129    
130        /**
131         * Constructs a Variance with the specified <code>isBiasCorrected</code>
132         * property and the supplied external second moment.
133         *
134         * @param isBiasCorrected  setting for bias correction - true means
135         * bias will be corrected
136         * @param m2 the SecondMoment (Third or Fourth moments work
137         * here as well.)
138         */
139        public Variance(boolean isBiasCorrected, SecondMoment m2) {
140            incMoment = false;
141            this.moment = m2;
142            this.isBiasCorrected = isBiasCorrected;
143        }
144    
145        /**
146         * Copy constructor, creates a new {@code Variance} identical
147         * to the {@code original}
148         *
149         * @param original the {@code Variance} instance to copy
150         * @throws NullArgumentException if original is null
151         */
152        public Variance(Variance original) throws NullArgumentException {
153            copy(original, this);
154        }
155    
156        /**
157         * {@inheritDoc}
158         * <p>If all values are available, it is more accurate to use
159         * {@link #evaluate(double[])} rather than adding values one at a time
160         * using this method and then executing {@link #getResult}, since
161         * <code>evaluate</code> leverages the fact that is has the full
162         * list of values together to execute a two-pass algorithm.
163         * See {@link Variance}.</p>
164         *
165         * <p>Note also that when {@link #Variance(SecondMoment)} is used to
166         * create a Variance, this method does nothing. In that case, the
167         * SecondMoment should be incremented directly.</p>
168         */
169        @Override
170        public void increment(final double d) {
171            if (incMoment) {
172                moment.increment(d);
173            }
174        }
175    
176        /**
177         * {@inheritDoc}
178         */
179        @Override
180        public double getResult() {
181                if (moment.n == 0) {
182                    return Double.NaN;
183                } else if (moment.n == 1) {
184                    return 0d;
185                } else {
186                    if (isBiasCorrected) {
187                        return moment.m2 / (moment.n - 1d);
188                    } else {
189                        return moment.m2 / (moment.n);
190                    }
191                }
192        }
193    
194        /**
195         * {@inheritDoc}
196         */
197        public long getN() {
198            return moment.getN();
199        }
200    
201        /**
202         * {@inheritDoc}
203         */
204        @Override
205        public void clear() {
206            if (incMoment) {
207                moment.clear();
208            }
209        }
210    
211        /**
212         * Returns the variance of the entries in the input array, or
213         * <code>Double.NaN</code> if the array is empty.
214         * <p>
215         * See {@link Variance} for details on the computing algorithm.</p>
216         * <p>
217         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
218         * <p>
219         * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
220         * <p>
221         * Does not change the internal state of the statistic.</p>
222         *
223         * @param values the input array
224         * @return the variance of the values or Double.NaN if length = 0
225         * @throws MathIllegalArgumentException if the array is null
226         */
227        @Override
228        public double evaluate(final double[] values) throws MathIllegalArgumentException {
229            if (values == null) {
230                throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
231            }
232            return evaluate(values, 0, values.length);
233        }
234    
235        /**
236         * Returns the variance of the entries in the specified portion of
237         * the input array, or <code>Double.NaN</code> if the designated subarray
238         * is empty.
239         * <p>
240         * See {@link Variance} for details on the computing algorithm.</p>
241         * <p>
242         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
243         * <p>
244         * Does not change the internal state of the statistic.</p>
245         * <p>
246         * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
247         *
248         * @param values the input array
249         * @param begin index of the first array element to include
250         * @param length the number of elements to include
251         * @return the variance of the values or Double.NaN if length = 0
252         * @throws MathIllegalArgumentException if the array is null or the array index
253         *  parameters are not valid
254         */
255        @Override
256        public double evaluate(final double[] values, final int begin, final int length)
257        throws MathIllegalArgumentException {
258    
259            double var = Double.NaN;
260    
261            if (test(values, begin, length)) {
262                clear();
263                if (length == 1) {
264                    var = 0.0;
265                } else if (length > 1) {
266                    Mean mean = new Mean();
267                    double m = mean.evaluate(values, begin, length);
268                    var = evaluate(values, m, begin, length);
269                }
270            }
271            return var;
272        }
273    
274        /**
275         * <p>Returns the weighted variance of the entries in the specified portion of
276         * the input array, or <code>Double.NaN</code> if the designated subarray
277         * is empty.</p>
278         * <p>
279         * Uses the formula <pre>
280         *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
281         * </pre>
282         * where weightedMean is the weighted mean</p>
283         * <p>
284         * This formula will not return the same result as the unweighted variance when all
285         * weights are equal, unless all weights are equal to 1. The formula assumes that
286         * weights are to be treated as "expansion values," as will be the case if for example
287         * the weights represent frequency counts. To normalize weights so that the denominator
288         * in the variance computation equals the length of the input vector minus one, use <pre>
289         *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
290         * </pre>
291         * <p>
292         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
293         * <p>
294         * Throws <code>IllegalArgumentException</code> if any of the following are true:
295         * <ul><li>the values array is null</li>
296         *     <li>the weights array is null</li>
297         *     <li>the weights array does not have the same length as the values array</li>
298         *     <li>the weights array contains one or more infinite values</li>
299         *     <li>the weights array contains one or more NaN values</li>
300         *     <li>the weights array contains negative values</li>
301         *     <li>the start and length arguments do not determine a valid array</li>
302         * </ul></p>
303         * <p>
304         * Does not change the internal state of the statistic.</p>
305         * <p>
306         * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
307         *
308         * @param values the input array
309         * @param weights the weights array
310         * @param begin index of the first array element to include
311         * @param length the number of elements to include
312         * @return the weighted variance of the values or Double.NaN if length = 0
313         * @throws MathIllegalArgumentException if the parameters are not valid
314         * @since 2.1
315         */
316        public double evaluate(final double[] values, final double[] weights,
317                               final int begin, final int length) throws MathIllegalArgumentException {
318    
319            double var = Double.NaN;
320    
321            if (test(values, weights,begin, length)) {
322                clear();
323                if (length == 1) {
324                    var = 0.0;
325                } else if (length > 1) {
326                    Mean mean = new Mean();
327                    double m = mean.evaluate(values, weights, begin, length);
328                    var = evaluate(values, weights, m, begin, length);
329                }
330            }
331            return var;
332        }
333    
334        /**
335         * <p>
336         * Returns the weighted variance of the entries in the the input array.</p>
337         * <p>
338         * Uses the formula <pre>
339         *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
340         * </pre>
341         * where weightedMean is the weighted mean</p>
342         * <p>
343         * This formula will not return the same result as the unweighted variance when all
344         * weights are equal, unless all weights are equal to 1. The formula assumes that
345         * weights are to be treated as "expansion values," as will be the case if for example
346         * the weights represent frequency counts. To normalize weights so that the denominator
347         * in the variance computation equals the length of the input vector minus one, use <pre>
348         *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
349         * </pre>
350         * <p>
351         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
352         * <p>
353         * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
354         * <ul><li>the values array is null</li>
355         *     <li>the weights array is null</li>
356         *     <li>the weights array does not have the same length as the values array</li>
357         *     <li>the weights array contains one or more infinite values</li>
358         *     <li>the weights array contains one or more NaN values</li>
359         *     <li>the weights array contains negative values</li>
360         * </ul></p>
361         * <p>
362         * Does not change the internal state of the statistic.</p>
363         * <p>
364         * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
365         *
366         * @param values the input array
367         * @param weights the weights array
368         * @return the weighted variance of the values
369         * @throws MathIllegalArgumentException if the parameters are not valid
370         * @since 2.1
371         */
372        public double evaluate(final double[] values, final double[] weights)
373        throws MathIllegalArgumentException {
374            return evaluate(values, weights, 0, values.length);
375        }
376    
377        /**
378         * Returns the variance of the entries in the specified portion of
379         * the input array, using the precomputed mean value.  Returns
380         * <code>Double.NaN</code> if the designated subarray is empty.
381         * <p>
382         * See {@link Variance} for details on the computing algorithm.</p>
383         * <p>
384         * The formula used assumes that the supplied mean value is the arithmetic
385         * mean of the sample data, not a known population parameter.  This method
386         * is supplied only to save computation when the mean has already been
387         * computed.</p>
388         * <p>
389         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
390         * <p>
391         * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
392         * <p>
393         * Does not change the internal state of the statistic.</p>
394         *
395         * @param values the input array
396         * @param mean the precomputed mean value
397         * @param begin index of the first array element to include
398         * @param length the number of elements to include
399         * @return the variance of the values or Double.NaN if length = 0
400         * @throws MathIllegalArgumentException if the array is null or the array index
401         *  parameters are not valid
402         */
403        public double evaluate(final double[] values, final double mean,
404                final int begin, final int length) throws MathIllegalArgumentException {
405    
406            double var = Double.NaN;
407    
408            if (test(values, begin, length)) {
409                if (length == 1) {
410                    var = 0.0;
411                } else if (length > 1) {
412                    double accum = 0.0;
413                    double dev = 0.0;
414                    double accum2 = 0.0;
415                    for (int i = begin; i < begin + length; i++) {
416                        dev = values[i] - mean;
417                        accum += dev * dev;
418                        accum2 += dev;
419                    }
420                    double len = length;
421                    if (isBiasCorrected) {
422                        var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
423                    } else {
424                        var = (accum - (accum2 * accum2 / len)) / len;
425                    }
426                }
427            }
428            return var;
429        }
430    
431        /**
432         * Returns the variance of the entries in the input array, using the
433         * precomputed mean value.  Returns <code>Double.NaN</code> if the array
434         * is empty.
435         * <p>
436         * See {@link Variance} for details on the computing algorithm.</p>
437         * <p>
438         * If <code>isBiasCorrected</code> is <code>true</code> the formula used
439         * assumes that the supplied mean value is the arithmetic mean of the
440         * sample data, not a known population parameter.  If the mean is a known
441         * population parameter, or if the "population" version of the variance is
442         * desired, set <code>isBiasCorrected</code> to <code>false</code> before
443         * invoking this method.</p>
444         * <p>
445         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
446         * <p>
447         * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
448         * <p>
449         * Does not change the internal state of the statistic.</p>
450         *
451         * @param values the input array
452         * @param mean the precomputed mean value
453         * @return the variance of the values or Double.NaN if the array is empty
454         * @throws MathIllegalArgumentException if the array is null
455         */
456        public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException {
457            return evaluate(values, mean, 0, values.length);
458        }
459    
460        /**
461         * Returns the weighted variance of the entries in the specified portion of
462         * the input array, using the precomputed weighted mean value.  Returns
463         * <code>Double.NaN</code> if the designated subarray is empty.
464         * <p>
465         * Uses the formula <pre>
466         *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
467         * </pre></p>
468         * <p>
469         * The formula used assumes that the supplied mean value is the weighted arithmetic
470         * mean of the sample data, not a known population parameter. This method
471         * is supplied only to save computation when the mean has already been
472         * computed.</p>
473         * <p>
474         * This formula will not return the same result as the unweighted variance when all
475         * weights are equal, unless all weights are equal to 1. The formula assumes that
476         * weights are to be treated as "expansion values," as will be the case if for example
477         * the weights represent frequency counts. To normalize weights so that the denominator
478         * in the variance computation equals the length of the input vector minus one, use <pre>
479         *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
480         * </pre>
481         * <p>
482         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
483         * <p>
484         * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
485         * <ul><li>the values array is null</li>
486         *     <li>the weights array is null</li>
487         *     <li>the weights array does not have the same length as the values array</li>
488         *     <li>the weights array contains one or more infinite values</li>
489         *     <li>the weights array contains one or more NaN values</li>
490         *     <li>the weights array contains negative values</li>
491         *     <li>the start and length arguments do not determine a valid array</li>
492         * </ul></p>
493         * <p>
494         * Does not change the internal state of the statistic.</p>
495         *
496         * @param values the input array
497         * @param weights the weights array
498         * @param mean the precomputed weighted mean value
499         * @param begin index of the first array element to include
500         * @param length the number of elements to include
501         * @return the variance of the values or Double.NaN if length = 0
502         * @throws MathIllegalArgumentException if the parameters are not valid
503         * @since 2.1
504         */
505        public double evaluate(final double[] values, final double[] weights,
506        final double mean, final int begin, final int length)
507        throws MathIllegalArgumentException {
508    
509            double var = Double.NaN;
510    
511            if (test(values, weights, begin, length)) {
512                if (length == 1) {
513                    var = 0.0;
514                } else if (length > 1) {
515                    double accum = 0.0;
516                    double dev = 0.0;
517                    double accum2 = 0.0;
518                    for (int i = begin; i < begin + length; i++) {
519                        dev = values[i] - mean;
520                        accum += weights[i] * (dev * dev);
521                        accum2 += weights[i] * dev;
522                    }
523    
524                    double sumWts = 0;
525                    for (int i = begin; i < begin + length; i++) {
526                        sumWts += weights[i];
527                    }
528    
529                    if (isBiasCorrected) {
530                        var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
531                    } else {
532                        var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
533                    }
534                }
535            }
536            return var;
537        }
538    
539        /**
540         * <p>Returns the weighted variance of the values in the input array, using
541         * the precomputed weighted mean value.</p>
542         * <p>
543         * Uses the formula <pre>
544         *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
545         * </pre></p>
546         * <p>
547         * The formula used assumes that the supplied mean value is the weighted arithmetic
548         * mean of the sample data, not a known population parameter. This method
549         * is supplied only to save computation when the mean has already been
550         * computed.</p>
551         * <p>
552         * This formula will not return the same result as the unweighted variance when all
553         * weights are equal, unless all weights are equal to 1. The formula assumes that
554         * weights are to be treated as "expansion values," as will be the case if for example
555         * the weights represent frequency counts. To normalize weights so that the denominator
556         * in the variance computation equals the length of the input vector minus one, use <pre>
557         *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
558         * </pre>
559         * <p>
560         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
561         * <p>
562         * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
563         * <ul><li>the values array is null</li>
564         *     <li>the weights array is null</li>
565         *     <li>the weights array does not have the same length as the values array</li>
566         *     <li>the weights array contains one or more infinite values</li>
567         *     <li>the weights array contains one or more NaN values</li>
568         *     <li>the weights array contains negative values</li>
569         * </ul></p>
570         * <p>
571         * Does not change the internal state of the statistic.</p>
572         *
573         * @param values the input array
574         * @param weights the weights array
575         * @param mean the precomputed weighted mean value
576         * @return the variance of the values or Double.NaN if length = 0
577         * @throws MathIllegalArgumentException if the parameters are not valid
578         * @since 2.1
579         */
580        public double evaluate(final double[] values, final double[] weights, final double mean)
581        throws MathIllegalArgumentException {
582            return evaluate(values, weights, mean, 0, values.length);
583        }
584    
585        /**
586         * @return Returns the isBiasCorrected.
587         */
588        public boolean isBiasCorrected() {
589            return isBiasCorrected;
590        }
591    
592        /**
593         * @param biasCorrected The isBiasCorrected to set.
594         */
595        public void setBiasCorrected(boolean biasCorrected) {
596            this.isBiasCorrected = biasCorrected;
597        }
598    
599        /**
600         * {@inheritDoc}
601         */
602        @Override
603        public Variance copy() {
604            Variance result = new Variance();
605            // No try-catch or advertised exception because parameters are guaranteed non-null
606            copy(this, result);
607            return result;
608        }
609    
610        /**
611         * Copies source to dest.
612         * <p>Neither source nor dest can be null.</p>
613         *
614         * @param source Variance to copy
615         * @param dest Variance to copy to
616         * @throws NullArgumentException if either source or dest is null
617         */
618        public static void copy(Variance source, Variance dest)
619            throws NullArgumentException {
620            MathUtils.checkNotNull(source);
621            MathUtils.checkNotNull(dest);
622            dest.setData(source.getDataRef());
623            dest.moment = source.moment.copy();
624            dest.isBiasCorrected = source.isBiasCorrected;
625            dest.incMoment = source.incMoment;
626        }
627    }