001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.math3.stat.descriptive.moment; 018 019 import java.io.Serializable; 020 021 import org.apache.commons.math3.exception.MathIllegalArgumentException; 022 import org.apache.commons.math3.exception.NullArgumentException; 023 import org.apache.commons.math3.exception.util.LocalizedFormats; 024 import org.apache.commons.math3.stat.descriptive.WeightedEvaluation; 025 import org.apache.commons.math3.stat.descriptive.AbstractStorelessUnivariateStatistic; 026 import org.apache.commons.math3.util.MathUtils; 027 028 /** 029 * Computes the variance of the available values. By default, the unbiased 030 * "sample variance" definitional formula is used: 031 * <p> 032 * variance = sum((x_i - mean)^2) / (n - 1) </p> 033 * <p> 034 * where mean is the {@link Mean} and <code>n</code> is the number 035 * of sample observations.</p> 036 * <p> 037 * The definitional formula does not have good numerical properties, so 038 * this implementation does not compute the statistic using the definitional 039 * formula. <ul> 040 * <li> The <code>getResult</code> method computes the variance using 041 * updating formulas based on West's algorithm, as described in 042 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and 043 * J. G. Lewis 1979, <i>Communications of the ACM</i>, 044 * vol. 22 no. 9, pp. 526-531.</a></li> 045 * <li> The <code>evaluate</code> methods leverage the fact that they have the 046 * full array of values in memory to execute a two-pass algorithm. 047 * Specifically, these methods use the "corrected two-pass algorithm" from 048 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>, 049 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul> 050 * Note that adding values using <code>increment</code> or 051 * <code>incrementAll</code> and then executing <code>getResult</code> will 052 * sometimes give a different, less accurate, result than executing 053 * <code>evaluate</code> with the full array of values. The former approach 054 * should only be used when the full array of values is not available.</p> 055 * <p> 056 * The "population variance" ( sum((x_i - mean)^2) / n ) can also 057 * be computed using this statistic. The <code>isBiasCorrected</code> 058 * property determines whether the "population" or "sample" value is 059 * returned by the <code>evaluate</code> and <code>getResult</code> methods. 060 * To compute population variances, set this property to <code>false.</code> 061 * </p> 062 * <p> 063 * <strong>Note that this implementation is not synchronized.</strong> If 064 * multiple threads access an instance of this class concurrently, and at least 065 * one of the threads invokes the <code>increment()</code> or 066 * <code>clear()</code> method, it must be synchronized externally.</p> 067 * 068 * @version $Id: Variance.java 1416643 2012-12-03 19:37:14Z tn $ 069 */ 070 public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation { 071 072 /** Serializable version identifier */ 073 private static final long serialVersionUID = -9111962718267217978L; 074 075 /** SecondMoment is used in incremental calculation of Variance*/ 076 protected SecondMoment moment = null; 077 078 /** 079 * Whether or not {@link #increment(double)} should increment 080 * the internal second moment. When a Variance is constructed with an 081 * external SecondMoment as a constructor parameter, this property is 082 * set to false and increments must be applied to the second moment 083 * directly. 084 */ 085 protected boolean incMoment = true; 086 087 /** 088 * Whether or not bias correction is applied when computing the 089 * value of the statistic. True means that bias is corrected. See 090 * {@link Variance} for details on the formula. 091 */ 092 private boolean isBiasCorrected = true; 093 094 /** 095 * Constructs a Variance with default (true) <code>isBiasCorrected</code> 096 * property. 097 */ 098 public Variance() { 099 moment = new SecondMoment(); 100 } 101 102 /** 103 * Constructs a Variance based on an external second moment. 104 * When this constructor is used, the statistic may only be 105 * incremented via the moment, i.e., {@link #increment(double)} 106 * does nothing; whereas {@code m2.increment(value)} increments 107 * both {@code m2} and the Variance instance constructed from it. 108 * 109 * @param m2 the SecondMoment (Third or Fourth moments work 110 * here as well.) 111 */ 112 public Variance(final SecondMoment m2) { 113 incMoment = false; 114 this.moment = m2; 115 } 116 117 /** 118 * Constructs a Variance with the specified <code>isBiasCorrected</code> 119 * property 120 * 121 * @param isBiasCorrected setting for bias correction - true means 122 * bias will be corrected and is equivalent to using the argumentless 123 * constructor 124 */ 125 public Variance(boolean isBiasCorrected) { 126 moment = new SecondMoment(); 127 this.isBiasCorrected = isBiasCorrected; 128 } 129 130 /** 131 * Constructs a Variance with the specified <code>isBiasCorrected</code> 132 * property and the supplied external second moment. 133 * 134 * @param isBiasCorrected setting for bias correction - true means 135 * bias will be corrected 136 * @param m2 the SecondMoment (Third or Fourth moments work 137 * here as well.) 138 */ 139 public Variance(boolean isBiasCorrected, SecondMoment m2) { 140 incMoment = false; 141 this.moment = m2; 142 this.isBiasCorrected = isBiasCorrected; 143 } 144 145 /** 146 * Copy constructor, creates a new {@code Variance} identical 147 * to the {@code original} 148 * 149 * @param original the {@code Variance} instance to copy 150 * @throws NullArgumentException if original is null 151 */ 152 public Variance(Variance original) throws NullArgumentException { 153 copy(original, this); 154 } 155 156 /** 157 * {@inheritDoc} 158 * <p>If all values are available, it is more accurate to use 159 * {@link #evaluate(double[])} rather than adding values one at a time 160 * using this method and then executing {@link #getResult}, since 161 * <code>evaluate</code> leverages the fact that is has the full 162 * list of values together to execute a two-pass algorithm. 163 * See {@link Variance}.</p> 164 * 165 * <p>Note also that when {@link #Variance(SecondMoment)} is used to 166 * create a Variance, this method does nothing. In that case, the 167 * SecondMoment should be incremented directly.</p> 168 */ 169 @Override 170 public void increment(final double d) { 171 if (incMoment) { 172 moment.increment(d); 173 } 174 } 175 176 /** 177 * {@inheritDoc} 178 */ 179 @Override 180 public double getResult() { 181 if (moment.n == 0) { 182 return Double.NaN; 183 } else if (moment.n == 1) { 184 return 0d; 185 } else { 186 if (isBiasCorrected) { 187 return moment.m2 / (moment.n - 1d); 188 } else { 189 return moment.m2 / (moment.n); 190 } 191 } 192 } 193 194 /** 195 * {@inheritDoc} 196 */ 197 public long getN() { 198 return moment.getN(); 199 } 200 201 /** 202 * {@inheritDoc} 203 */ 204 @Override 205 public void clear() { 206 if (incMoment) { 207 moment.clear(); 208 } 209 } 210 211 /** 212 * Returns the variance of the entries in the input array, or 213 * <code>Double.NaN</code> if the array is empty. 214 * <p> 215 * See {@link Variance} for details on the computing algorithm.</p> 216 * <p> 217 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 218 * <p> 219 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p> 220 * <p> 221 * Does not change the internal state of the statistic.</p> 222 * 223 * @param values the input array 224 * @return the variance of the values or Double.NaN if length = 0 225 * @throws MathIllegalArgumentException if the array is null 226 */ 227 @Override 228 public double evaluate(final double[] values) throws MathIllegalArgumentException { 229 if (values == null) { 230 throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); 231 } 232 return evaluate(values, 0, values.length); 233 } 234 235 /** 236 * Returns the variance of the entries in the specified portion of 237 * the input array, or <code>Double.NaN</code> if the designated subarray 238 * is empty. 239 * <p> 240 * See {@link Variance} for details on the computing algorithm.</p> 241 * <p> 242 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 243 * <p> 244 * Does not change the internal state of the statistic.</p> 245 * <p> 246 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p> 247 * 248 * @param values the input array 249 * @param begin index of the first array element to include 250 * @param length the number of elements to include 251 * @return the variance of the values or Double.NaN if length = 0 252 * @throws MathIllegalArgumentException if the array is null or the array index 253 * parameters are not valid 254 */ 255 @Override 256 public double evaluate(final double[] values, final int begin, final int length) 257 throws MathIllegalArgumentException { 258 259 double var = Double.NaN; 260 261 if (test(values, begin, length)) { 262 clear(); 263 if (length == 1) { 264 var = 0.0; 265 } else if (length > 1) { 266 Mean mean = new Mean(); 267 double m = mean.evaluate(values, begin, length); 268 var = evaluate(values, m, begin, length); 269 } 270 } 271 return var; 272 } 273 274 /** 275 * <p>Returns the weighted variance of the entries in the specified portion of 276 * the input array, or <code>Double.NaN</code> if the designated subarray 277 * is empty.</p> 278 * <p> 279 * Uses the formula <pre> 280 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1) 281 * </pre> 282 * where weightedMean is the weighted mean</p> 283 * <p> 284 * This formula will not return the same result as the unweighted variance when all 285 * weights are equal, unless all weights are equal to 1. The formula assumes that 286 * weights are to be treated as "expansion values," as will be the case if for example 287 * the weights represent frequency counts. To normalize weights so that the denominator 288 * in the variance computation equals the length of the input vector minus one, use <pre> 289 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code> 290 * </pre> 291 * <p> 292 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 293 * <p> 294 * Throws <code>IllegalArgumentException</code> if any of the following are true: 295 * <ul><li>the values array is null</li> 296 * <li>the weights array is null</li> 297 * <li>the weights array does not have the same length as the values array</li> 298 * <li>the weights array contains one or more infinite values</li> 299 * <li>the weights array contains one or more NaN values</li> 300 * <li>the weights array contains negative values</li> 301 * <li>the start and length arguments do not determine a valid array</li> 302 * </ul></p> 303 * <p> 304 * Does not change the internal state of the statistic.</p> 305 * <p> 306 * Throws <code>MathIllegalArgumentException</code> if either array is null.</p> 307 * 308 * @param values the input array 309 * @param weights the weights array 310 * @param begin index of the first array element to include 311 * @param length the number of elements to include 312 * @return the weighted variance of the values or Double.NaN if length = 0 313 * @throws MathIllegalArgumentException if the parameters are not valid 314 * @since 2.1 315 */ 316 public double evaluate(final double[] values, final double[] weights, 317 final int begin, final int length) throws MathIllegalArgumentException { 318 319 double var = Double.NaN; 320 321 if (test(values, weights,begin, length)) { 322 clear(); 323 if (length == 1) { 324 var = 0.0; 325 } else if (length > 1) { 326 Mean mean = new Mean(); 327 double m = mean.evaluate(values, weights, begin, length); 328 var = evaluate(values, weights, m, begin, length); 329 } 330 } 331 return var; 332 } 333 334 /** 335 * <p> 336 * Returns the weighted variance of the entries in the the input array.</p> 337 * <p> 338 * Uses the formula <pre> 339 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1) 340 * </pre> 341 * where weightedMean is the weighted mean</p> 342 * <p> 343 * This formula will not return the same result as the unweighted variance when all 344 * weights are equal, unless all weights are equal to 1. The formula assumes that 345 * weights are to be treated as "expansion values," as will be the case if for example 346 * the weights represent frequency counts. To normalize weights so that the denominator 347 * in the variance computation equals the length of the input vector minus one, use <pre> 348 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code> 349 * </pre> 350 * <p> 351 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 352 * <p> 353 * Throws <code>MathIllegalArgumentException</code> if any of the following are true: 354 * <ul><li>the values array is null</li> 355 * <li>the weights array is null</li> 356 * <li>the weights array does not have the same length as the values array</li> 357 * <li>the weights array contains one or more infinite values</li> 358 * <li>the weights array contains one or more NaN values</li> 359 * <li>the weights array contains negative values</li> 360 * </ul></p> 361 * <p> 362 * Does not change the internal state of the statistic.</p> 363 * <p> 364 * Throws <code>MathIllegalArgumentException</code> if either array is null.</p> 365 * 366 * @param values the input array 367 * @param weights the weights array 368 * @return the weighted variance of the values 369 * @throws MathIllegalArgumentException if the parameters are not valid 370 * @since 2.1 371 */ 372 public double evaluate(final double[] values, final double[] weights) 373 throws MathIllegalArgumentException { 374 return evaluate(values, weights, 0, values.length); 375 } 376 377 /** 378 * Returns the variance of the entries in the specified portion of 379 * the input array, using the precomputed mean value. Returns 380 * <code>Double.NaN</code> if the designated subarray is empty. 381 * <p> 382 * See {@link Variance} for details on the computing algorithm.</p> 383 * <p> 384 * The formula used assumes that the supplied mean value is the arithmetic 385 * mean of the sample data, not a known population parameter. This method 386 * is supplied only to save computation when the mean has already been 387 * computed.</p> 388 * <p> 389 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 390 * <p> 391 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p> 392 * <p> 393 * Does not change the internal state of the statistic.</p> 394 * 395 * @param values the input array 396 * @param mean the precomputed mean value 397 * @param begin index of the first array element to include 398 * @param length the number of elements to include 399 * @return the variance of the values or Double.NaN if length = 0 400 * @throws MathIllegalArgumentException if the array is null or the array index 401 * parameters are not valid 402 */ 403 public double evaluate(final double[] values, final double mean, 404 final int begin, final int length) throws MathIllegalArgumentException { 405 406 double var = Double.NaN; 407 408 if (test(values, begin, length)) { 409 if (length == 1) { 410 var = 0.0; 411 } else if (length > 1) { 412 double accum = 0.0; 413 double dev = 0.0; 414 double accum2 = 0.0; 415 for (int i = begin; i < begin + length; i++) { 416 dev = values[i] - mean; 417 accum += dev * dev; 418 accum2 += dev; 419 } 420 double len = length; 421 if (isBiasCorrected) { 422 var = (accum - (accum2 * accum2 / len)) / (len - 1.0); 423 } else { 424 var = (accum - (accum2 * accum2 / len)) / len; 425 } 426 } 427 } 428 return var; 429 } 430 431 /** 432 * Returns the variance of the entries in the input array, using the 433 * precomputed mean value. Returns <code>Double.NaN</code> if the array 434 * is empty. 435 * <p> 436 * See {@link Variance} for details on the computing algorithm.</p> 437 * <p> 438 * If <code>isBiasCorrected</code> is <code>true</code> the formula used 439 * assumes that the supplied mean value is the arithmetic mean of the 440 * sample data, not a known population parameter. If the mean is a known 441 * population parameter, or if the "population" version of the variance is 442 * desired, set <code>isBiasCorrected</code> to <code>false</code> before 443 * invoking this method.</p> 444 * <p> 445 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 446 * <p> 447 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p> 448 * <p> 449 * Does not change the internal state of the statistic.</p> 450 * 451 * @param values the input array 452 * @param mean the precomputed mean value 453 * @return the variance of the values or Double.NaN if the array is empty 454 * @throws MathIllegalArgumentException if the array is null 455 */ 456 public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException { 457 return evaluate(values, mean, 0, values.length); 458 } 459 460 /** 461 * Returns the weighted variance of the entries in the specified portion of 462 * the input array, using the precomputed weighted mean value. Returns 463 * <code>Double.NaN</code> if the designated subarray is empty. 464 * <p> 465 * Uses the formula <pre> 466 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1) 467 * </pre></p> 468 * <p> 469 * The formula used assumes that the supplied mean value is the weighted arithmetic 470 * mean of the sample data, not a known population parameter. This method 471 * is supplied only to save computation when the mean has already been 472 * computed.</p> 473 * <p> 474 * This formula will not return the same result as the unweighted variance when all 475 * weights are equal, unless all weights are equal to 1. The formula assumes that 476 * weights are to be treated as "expansion values," as will be the case if for example 477 * the weights represent frequency counts. To normalize weights so that the denominator 478 * in the variance computation equals the length of the input vector minus one, use <pre> 479 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code> 480 * </pre> 481 * <p> 482 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 483 * <p> 484 * Throws <code>MathIllegalArgumentException</code> if any of the following are true: 485 * <ul><li>the values array is null</li> 486 * <li>the weights array is null</li> 487 * <li>the weights array does not have the same length as the values array</li> 488 * <li>the weights array contains one or more infinite values</li> 489 * <li>the weights array contains one or more NaN values</li> 490 * <li>the weights array contains negative values</li> 491 * <li>the start and length arguments do not determine a valid array</li> 492 * </ul></p> 493 * <p> 494 * Does not change the internal state of the statistic.</p> 495 * 496 * @param values the input array 497 * @param weights the weights array 498 * @param mean the precomputed weighted mean value 499 * @param begin index of the first array element to include 500 * @param length the number of elements to include 501 * @return the variance of the values or Double.NaN if length = 0 502 * @throws MathIllegalArgumentException if the parameters are not valid 503 * @since 2.1 504 */ 505 public double evaluate(final double[] values, final double[] weights, 506 final double mean, final int begin, final int length) 507 throws MathIllegalArgumentException { 508 509 double var = Double.NaN; 510 511 if (test(values, weights, begin, length)) { 512 if (length == 1) { 513 var = 0.0; 514 } else if (length > 1) { 515 double accum = 0.0; 516 double dev = 0.0; 517 double accum2 = 0.0; 518 for (int i = begin; i < begin + length; i++) { 519 dev = values[i] - mean; 520 accum += weights[i] * (dev * dev); 521 accum2 += weights[i] * dev; 522 } 523 524 double sumWts = 0; 525 for (int i = begin; i < begin + length; i++) { 526 sumWts += weights[i]; 527 } 528 529 if (isBiasCorrected) { 530 var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0); 531 } else { 532 var = (accum - (accum2 * accum2 / sumWts)) / sumWts; 533 } 534 } 535 } 536 return var; 537 } 538 539 /** 540 * <p>Returns the weighted variance of the values in the input array, using 541 * the precomputed weighted mean value.</p> 542 * <p> 543 * Uses the formula <pre> 544 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1) 545 * </pre></p> 546 * <p> 547 * The formula used assumes that the supplied mean value is the weighted arithmetic 548 * mean of the sample data, not a known population parameter. This method 549 * is supplied only to save computation when the mean has already been 550 * computed.</p> 551 * <p> 552 * This formula will not return the same result as the unweighted variance when all 553 * weights are equal, unless all weights are equal to 1. The formula assumes that 554 * weights are to be treated as "expansion values," as will be the case if for example 555 * the weights represent frequency counts. To normalize weights so that the denominator 556 * in the variance computation equals the length of the input vector minus one, use <pre> 557 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code> 558 * </pre> 559 * <p> 560 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 561 * <p> 562 * Throws <code>MathIllegalArgumentException</code> if any of the following are true: 563 * <ul><li>the values array is null</li> 564 * <li>the weights array is null</li> 565 * <li>the weights array does not have the same length as the values array</li> 566 * <li>the weights array contains one or more infinite values</li> 567 * <li>the weights array contains one or more NaN values</li> 568 * <li>the weights array contains negative values</li> 569 * </ul></p> 570 * <p> 571 * Does not change the internal state of the statistic.</p> 572 * 573 * @param values the input array 574 * @param weights the weights array 575 * @param mean the precomputed weighted mean value 576 * @return the variance of the values or Double.NaN if length = 0 577 * @throws MathIllegalArgumentException if the parameters are not valid 578 * @since 2.1 579 */ 580 public double evaluate(final double[] values, final double[] weights, final double mean) 581 throws MathIllegalArgumentException { 582 return evaluate(values, weights, mean, 0, values.length); 583 } 584 585 /** 586 * @return Returns the isBiasCorrected. 587 */ 588 public boolean isBiasCorrected() { 589 return isBiasCorrected; 590 } 591 592 /** 593 * @param biasCorrected The isBiasCorrected to set. 594 */ 595 public void setBiasCorrected(boolean biasCorrected) { 596 this.isBiasCorrected = biasCorrected; 597 } 598 599 /** 600 * {@inheritDoc} 601 */ 602 @Override 603 public Variance copy() { 604 Variance result = new Variance(); 605 // No try-catch or advertised exception because parameters are guaranteed non-null 606 copy(this, result); 607 return result; 608 } 609 610 /** 611 * Copies source to dest. 612 * <p>Neither source nor dest can be null.</p> 613 * 614 * @param source Variance to copy 615 * @param dest Variance to copy to 616 * @throws NullArgumentException if either source or dest is null 617 */ 618 public static void copy(Variance source, Variance dest) 619 throws NullArgumentException { 620 MathUtils.checkNotNull(source); 621 MathUtils.checkNotNull(dest); 622 dest.setData(source.getDataRef()); 623 dest.moment = source.moment.copy(); 624 dest.isBiasCorrected = source.isBiasCorrected; 625 dest.incMoment = source.incMoment; 626 } 627 }