/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.math.stat.descriptive;

import java.io.Serializable;
import java.util.Arrays;

import org.apache.commons.math.DimensionMismatchException;
import org.apache.commons.math.linear.RealMatrix;
import org.apache.commons.math.stat.descriptive.moment.GeometricMean;
import org.apache.commons.math.stat.descriptive.moment.Mean;
import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance;
import org.apache.commons.math.stat.descriptive.rank.Max;
import org.apache.commons.math.stat.descriptive.rank.Min;
import org.apache.commons.math.stat.descriptive.summary.Sum;
import org.apache.commons.math.stat.descriptive.summary.SumOfLogs;
import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
import org.apache.commons.math.util.MathUtils;

/**
 * <p>Computes summary statistics for a stream of n-tuples added using the
 * {@link #addValue(double[]) addValue} method. The data values are not stored
 * in memory, so this class can be used to compute statistics for very large
 * n-tuple streams.</p>
 *
 * <p>The {@link StorelessUnivariateStatistic} instances used to maintain
 * summary state and compute statistics are configurable via setters.
 * For example, the default implementation for the mean can be overridden by
 * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual
 * parameters to these methods must implement the
 * {@link StorelessUnivariateStatistic} interface and configuration must be
 * completed before <code>addValue</code> is called. No configuration is
 * necessary to use the default, commons-math provided implementations.</p>
 *
 * <p>To compute statistics for a stream of n-tuples, construct a
 * MultivariateSummaryStatistics instance with dimension n and then use
 * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code>
 * methods where Xxx is a statistic return an array of <code>double</code>
 * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array
 * element is the value of the given statistic for the data range consisting
 * of the i<sup>th</sup> element of each of the input n-tuples. For example,
 * if <code>addValue</code> is called with actual parameters {0, 1, 2}, then
 * {3, 4, 5} and finally {6, 7, 8}, <code>getSum</code> will return a
 * three-element array with values {0+3+6, 1+4+7, 2+5+8}.</p>
 *
 * <p>Note: This class is not thread-safe. Use
 * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access
 * from multiple threads is required.</p>
 *
 * @since 1.2
 * @version $Revision: 618097 $ $Date: 2008-02-03 22:39:08 +0100 (dim., 03 févr. 2008) $
 */
public class MultivariateSummaryStatistics
    implements StatisticalMultivariateSummary, Serializable {

    /** Serialization UID */
    private static final long serialVersionUID = 2271900808994826718L;

    /** Dimension of the data; fixed at construction. */
    private final int k;

    /** Count of values that have been added */
    private long n = 0;

    /** Sum statistic implementation - can be reset by setter. */
    private StorelessUnivariateStatistic[] sumImpl;

    /** Sum of squares statistic implementation - can be reset by setter. */
    private StorelessUnivariateStatistic[] sumSqImpl;

    /** Minimum statistic implementation - can be reset by setter. */
    private StorelessUnivariateStatistic[] minImpl;

    /** Maximum statistic implementation - can be reset by setter. */
    private StorelessUnivariateStatistic[] maxImpl;

    /** Sum of log statistic implementation - can be reset by setter. */
    private StorelessUnivariateStatistic[] sumLogImpl;

    /** Geometric mean statistic implementation - can be reset by setter. */
    private StorelessUnivariateStatistic[] geoMeanImpl;

    /** Mean statistic implementation - can be reset by setter. */
    private StorelessUnivariateStatistic[] meanImpl;

    /** Covariance statistic implementation - cannot be reset. */
    private VectorialCovariance covarianceImpl;

    /**
     * Construct a MultivariateSummaryStatistics instance.
     * <p>One default implementation of each univariate statistic is created
     * per component, together with a single covariance accumulator for the
     * whole n-tuple.</p>
     *
     * @param k dimension of the data
     * @param isCovarianceBiasCorrected if true, the unbiased sample
     * covariance is computed, otherwise the biased population covariance
     * is computed
     */
    public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) {
        this.k = k;

        sumImpl     = new StorelessUnivariateStatistic[k];
        sumSqImpl   = new StorelessUnivariateStatistic[k];
        minImpl     = new StorelessUnivariateStatistic[k];
        maxImpl     = new StorelessUnivariateStatistic[k];
        sumLogImpl  = new StorelessUnivariateStatistic[k];
        geoMeanImpl = new StorelessUnivariateStatistic[k];
        meanImpl    = new StorelessUnivariateStatistic[k];

        for (int i = 0; i < k; ++i) {
            sumImpl[i]     = new Sum();
            sumSqImpl[i]   = new SumOfSquares();
            minImpl[i]     = new Min();
            maxImpl[i]     = new Max();
            sumLogImpl[i]  = new SumOfLogs();
            geoMeanImpl[i] = new GeometricMean();
            meanImpl[i]    = new Mean();
        }

        covarianceImpl =
            new VectorialCovariance(k, isCovarianceBiasCorrected);
    }

    /**
     * Add an n-tuple to the data.
     * <p>Each component is fed to every per-component statistic, the whole
     * tuple is fed to the covariance accumulator, and the value count is
     * incremented.</p>
     *
     * @param value  the n-tuple to add
     * @throws DimensionMismatchException if the length of the array
     * does not match the one used at construction
     */
    public void addValue(double[] value)
        throws DimensionMismatchException {
        // Validate up front so that no statistic is updated for a bad tuple.
        checkDimension(value.length);
        for (int i = 0; i < k; ++i) {
            double v = value[i];
            sumImpl[i].increment(v);
            sumSqImpl[i].increment(v);
            minImpl[i].increment(v);
            maxImpl[i].increment(v);
            sumLogImpl[i].increment(v);
            geoMeanImpl[i].increment(v);
            meanImpl[i].increment(v);
        }
        covarianceImpl.increment(value);
        n++;
    }

    /**
     * Returns the dimension of the data
     * @return The dimension of the data
     */
    public int getDimension() {
        return k;
    }

    /**
     * Returns the number of available values
     * @return The number of available values
     */
    public long getN() {
        return n;
    }

    /**
     * Returns an array of the results of a statistic.
     * @param stats univariate statistic array
     * @return newly allocated array holding <code>getResult()</code> of
     * each entry of <code>stats</code>, in the same order
     */
    private double[] getResults(StorelessUnivariateStatistic[] stats) {
        double[] results = new double[stats.length];
        for (int i = 0; i < results.length; ++i) {
            results[i] = stats[i].getResult();
        }
        return results;
    }

    /**
     * Returns an array whose i<sup>th</sup> entry is the sum of the
     * i<sup>th</sup> entries of the arrays that have been added using
     * {@link #addValue(double[])}
     *
     * @return the array of component sums
     */
    public double[] getSum() {
        return getResults(sumImpl);
    }

    /**
     * Returns an array whose i<sup>th</sup> entry is the sum of squares of the
     * i<sup>th</sup> entries of the arrays that have been added using
     * {@link #addValue(double[])}
     *
     * @return the array of component sums of squares
     */
    public double[] getSumSq() {
        return getResults(sumSqImpl);
    }

    /**
     * Returns an array whose i<sup>th</sup> entry is the sum of logs of the
     * i<sup>th</sup> entries of the arrays that have been added using
     * {@link #addValue(double[])}
     *
     * @return the array of component log sums
     */
    public double[] getSumLog() {
        return getResults(sumLogImpl);
    }

    /**
     * Returns an array whose i<sup>th</sup> entry is the mean of the
     * i<sup>th</sup> entries of the arrays that have been added using
     * {@link #addValue(double[])}
     *
     * @return the array of component means
     */
    public double[] getMean() {
        return getResults(meanImpl);
    }

    /**
     * Returns an array whose i<sup>th</sup> entry is the standard deviation of the
     * i<sup>th</sup> entries of the arrays that have been added using
     * {@link #addValue(double[])}
     * <p>If no values have been added every entry is <code>Double.NaN</code>;
     * if exactly one value has been added every entry is <code>0.0</code>.
     * Otherwise each entry is the square root of the corresponding diagonal
     * entry of the covariance matrix.</p>
     *
     * @return the array of component standard deviations
     */
    public double[] getStandardDeviation() {
        double[] stdDev = new double[k];
        if (getN() < 1) {
            Arrays.fill(stdDev, Double.NaN);
        } else if (getN() < 2) {
            Arrays.fill(stdDev, 0.0);
        } else {
            // The variance of component i is the (i, i) covariance entry.
            RealMatrix matrix = covarianceImpl.getResult();
            for (int i = 0; i < k; ++i) {
                stdDev[i] = Math.sqrt(matrix.getEntry(i, i));
            }
        }
        return stdDev;
    }

    /**
     * Returns the covariance matrix of the values that have been added.
     *
     * @return the covariance matrix
     */
    public RealMatrix getCovariance() {
        return covarianceImpl.getResult();
    }

    /**
     * Returns an array whose i<sup>th</sup> entry is the maximum of the
     * i<sup>th</sup> entries of the arrays that have been added using
     * {@link #addValue(double[])}
     *
     * @return the array of component maxima
     */
    public double[] getMax() {
        return getResults(maxImpl);
    }

    /**
     * Returns an array whose i<sup>th</sup> entry is the minimum of the
     * i<sup>th</sup> entries of the arrays that have been added using
     * {@link #addValue(double[])}
     *
     * @return the array of component minima
     */
    public double[] getMin() {
        return getResults(minImpl);
    }

    /**
     * Returns an array whose i<sup>th</sup> entry is the geometric mean of the
     * i<sup>th</sup> entries of the arrays that have been added using
     * {@link #addValue(double[])}
     *
     * @return the array of component geometric means
     */
    public double[] getGeometricMean() {
        return getResults(geoMeanImpl);
    }

    /**
     * Generates a text report displaying
     * summary statistics from values that
     * have been added.
     * @return String with line feeds displaying statistics
     */
    public String toString() {
        StringBuffer outBuffer = new StringBuffer();
        outBuffer.append("MultivariateSummaryStatistics:\n");
        outBuffer.append("n: ").append(getN()).append("\n");
        append(outBuffer, getMin(), "min: ", ", ", "\n");
        append(outBuffer, getMax(), "max: ", ", ", "\n");
        append(outBuffer, getMean(), "mean: ", ", ", "\n");
        append(outBuffer, getGeometricMean(), "geometric mean: ", ", ", "\n");
        append(outBuffer, getSumSq(), "sum of squares: ", ", ", "\n");
        append(outBuffer, getSumLog(), "sum of logarithms: ", ", ", "\n");
        append(outBuffer, getStandardDeviation(), "standard deviation: ", ", ", "\n");
        outBuffer.append("covariance: ").append(getCovariance().toString()).append("\n");
        return outBuffer.toString();
    }

    /**
     * Append a text representation of an array to a buffer.
     * @param buffer buffer to fill
     * @param data data array
     * @param prefix text prefix
     * @param separator elements separator
     * @param suffix text suffix
     */
    private void append(StringBuffer buffer, double[] data,
                        String prefix, String separator, String suffix) {
        buffer.append(prefix);
        for (int i = 0; i < data.length; ++i) {
            if (i > 0) {
                buffer.append(separator);
            }
            buffer.append(data[i]);
        }
        buffer.append(suffix);
    }

    /**
     * Resets all statistics and storage.
     * <p>The value count returns to zero, so the implementation setters
     * may be used again afterwards.</p>
     */
    public void clear() {
        this.n = 0;
        for (int i = 0; i < k; ++i) {
            minImpl[i].clear();
            maxImpl[i].clear();
            sumImpl[i].clear();
            sumLogImpl[i].clear();
            sumSqImpl[i].clear();
            geoMeanImpl[i].clear();
            meanImpl[i].clear();
        }
        covarianceImpl.clear();
    }

    /**
     * Returns true iff <code>object</code> is a
     * <code>MultivariateSummaryStatistics</code> instance and all statistics
     * have the same values as this.
     * @param object the object to test equality against.
     * @return true if object equals this
     */
    public boolean equals(Object object) {
        if (object == this) {
            return true;
        }
        if (!(object instanceof MultivariateSummaryStatistics)) {
            return false;
        }
        MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object;
        return (MathUtils.equals(stat.getGeometricMean(),
                                 this.getGeometricMean()) &&
                MathUtils.equals(stat.getMax(), this.getMax()) &&
                MathUtils.equals(stat.getMean(), this.getMean()) &&
                MathUtils.equals(stat.getMin(), this.getMin()) &&
                MathUtils.equals(stat.getN(), this.getN()) &&
                MathUtils.equals(stat.getSum(), this.getSum()) &&
                MathUtils.equals(stat.getSumSq(), this.getSumSq()) &&
                MathUtils.equals(stat.getSumLog(), this.getSumLog()) &&
                stat.getCovariance().equals(this.getCovariance()));
    }

    /**
     * Returns hash code based on values of statistics
     *
     * @return hash code
     */
    public int hashCode() {
        // Note: the geometric mean is folded in twice. This is redundant but
        // harmless, and is left unchanged so that hash values stay the same.
        int result = 31 + MathUtils.hash(getGeometricMean());
        result = result * 31 + MathUtils.hash(getGeometricMean());
        result = result * 31 + MathUtils.hash(getMax());
        result = result * 31 + MathUtils.hash(getMean());
        result = result * 31 + MathUtils.hash(getMin());
        result = result * 31 + MathUtils.hash(getN());
        result = result * 31 + MathUtils.hash(getSum());
        result = result * 31 + MathUtils.hash(getSumSq());
        result = result * 31 + MathUtils.hash(getSumLog());
        result = result * 31 + getCovariance().hashCode();
        return result;
    }

    // Getters and setters for statistics implementations

    /**
     * Sets statistics implementations.
     * <p>The entries of <code>newImpl</code> are copied into
     * <code>oldImpl</code>; the <code>newImpl</code> array itself is not
     * retained. The emptiness check is performed before the dimension
     * check.</p>
     *
     * @param newImpl new implementations for statistics
     * @param oldImpl old implementations for statistics
     * @throws DimensionMismatchException if the array dimension
     * does not match the one used at construction
     * @throws IllegalStateException if data has already been added
     *  (i.e. if n &gt; 0)
     */
    private void setImpl(StorelessUnivariateStatistic[] newImpl,
                         StorelessUnivariateStatistic[] oldImpl)
       throws DimensionMismatchException, IllegalStateException {
        checkEmpty();
        checkDimension(newImpl.length);
        System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length);
    }

    /**
     * Returns the currently configured Sum implementation
     *
     * @return a copy of the array of StorelessUnivariateStatistic instances
     * implementing the sum (the array is copied, its entries are not)
     */
    public StorelessUnivariateStatistic[] getSumImpl() {
        return (StorelessUnivariateStatistic[]) sumImpl.clone();
    }

    /**
     * <p>Sets the implementation for the Sum.</p>
     * <p>This method must be activated before any data has been added - i.e.,
     * before {@link #addValue(double[]) addValue} has been used to add data;
     * otherwise an IllegalStateException will be thrown.</p>
     *
     * @param sumImpl the StorelessUnivariateStatistic instances to use
     * for computing the Sum, one per component
     * @throws DimensionMismatchException if the array dimension
     * does not match the one used at construction
     * @throws IllegalStateException if data has already been added
     *  (i.e. if n &gt; 0)
     */
    public void setSumImpl(StorelessUnivariateStatistic[] sumImpl)
      throws DimensionMismatchException {
        setImpl(sumImpl, this.sumImpl);
    }

    /**
     * Returns the currently configured sum of squares implementation
     *
     * @return a copy of the array of StorelessUnivariateStatistic instances
     * implementing the sum of squares (the array is copied, its entries are not)
     */
    public StorelessUnivariateStatistic[] getSumsqImpl() {
        return (StorelessUnivariateStatistic[]) sumSqImpl.clone();
    }

    /**
     * <p>Sets the implementation for the sum of squares.</p>
     * <p>This method must be activated before any data has been added - i.e.,
     * before {@link #addValue(double[]) addValue} has been used to add data;
     * otherwise an IllegalStateException will be thrown.</p>
     *
     * @param sumsqImpl the StorelessUnivariateStatistic instances to use
     * for computing the sum of squares, one per component
     * @throws DimensionMismatchException if the array dimension
     * does not match the one used at construction
     * @throws IllegalStateException if data has already been added
     *  (i.e. if n &gt; 0)
     */
    public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl)
      throws DimensionMismatchException {
        setImpl(sumsqImpl, this.sumSqImpl);
    }

    /**
     * Returns the currently configured minimum implementation
     *
     * @return a copy of the array of StorelessUnivariateStatistic instances
     * implementing the minimum (the array is copied, its entries are not)
     */
    public StorelessUnivariateStatistic[] getMinImpl() {
        return (StorelessUnivariateStatistic[]) minImpl.clone();
    }

    /**
     * <p>Sets the implementation for the minimum.</p>
     * <p>This method must be activated before any data has been added - i.e.,
     * before {@link #addValue(double[]) addValue} has been used to add data;
     * otherwise an IllegalStateException will be thrown.</p>
     *
     * @param minImpl the StorelessUnivariateStatistic instances to use
     * for computing the minimum, one per component
     * @throws DimensionMismatchException if the array dimension
     * does not match the one used at construction
     * @throws IllegalStateException if data has already been added
     *  (i.e. if n &gt; 0)
     */
    public void setMinImpl(StorelessUnivariateStatistic[] minImpl)
      throws DimensionMismatchException {
        setImpl(minImpl, this.minImpl);
    }

    /**
     * Returns the currently configured maximum implementation
     *
     * @return a copy of the array of StorelessUnivariateStatistic instances
     * implementing the maximum (the array is copied, its entries are not)
     */
    public StorelessUnivariateStatistic[] getMaxImpl() {
        return (StorelessUnivariateStatistic[]) maxImpl.clone();
    }

    /**
     * <p>Sets the implementation for the maximum.</p>
     * <p>This method must be activated before any data has been added - i.e.,
     * before {@link #addValue(double[]) addValue} has been used to add data;
     * otherwise an IllegalStateException will be thrown.</p>
     *
     * @param maxImpl the StorelessUnivariateStatistic instances to use
     * for computing the maximum, one per component
     * @throws DimensionMismatchException if the array dimension
     * does not match the one used at construction
     * @throws IllegalStateException if data has already been added
     *  (i.e. if n &gt; 0)
     */
    public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl)
      throws DimensionMismatchException {
        setImpl(maxImpl, this.maxImpl);
    }

    /**
     * Returns the currently configured sum of logs implementation
     *
     * @return a copy of the array of StorelessUnivariateStatistic instances
     * implementing the log sum (the array is copied, its entries are not)
     */
    public StorelessUnivariateStatistic[] getSumLogImpl() {
        return (StorelessUnivariateStatistic[]) sumLogImpl.clone();
    }

    /**
     * <p>Sets the implementation for the sum of logs.</p>
     * <p>This method must be activated before any data has been added - i.e.,
     * before {@link #addValue(double[]) addValue} has been used to add data;
     * otherwise an IllegalStateException will be thrown.</p>
     *
     * @param sumLogImpl the StorelessUnivariateStatistic instances to use
     * for computing the log sum, one per component
     * @throws DimensionMismatchException if the array dimension
     * does not match the one used at construction
     * @throws IllegalStateException if data has already been added
     *  (i.e. if n &gt; 0)
     */
    public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl)
      throws DimensionMismatchException {
        setImpl(sumLogImpl, this.sumLogImpl);
    }

    /**
     * Returns the currently configured geometric mean implementation
     *
     * @return a copy of the array of StorelessUnivariateStatistic instances
     * implementing the geometric mean (the array is copied, its entries are not)
     */
    public StorelessUnivariateStatistic[] getGeoMeanImpl() {
        return (StorelessUnivariateStatistic[]) geoMeanImpl.clone();
    }

    /**
     * <p>Sets the implementation for the geometric mean.</p>
     * <p>This method must be activated before any data has been added - i.e.,
     * before {@link #addValue(double[]) addValue} has been used to add data;
     * otherwise an IllegalStateException will be thrown.</p>
     *
     * @param geoMeanImpl the StorelessUnivariateStatistic instances to use
     * for computing the geometric mean, one per component
     * @throws DimensionMismatchException if the array dimension
     * does not match the one used at construction
     * @throws IllegalStateException if data has already been added
     *  (i.e. if n &gt; 0)
     */
    public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl)
      throws DimensionMismatchException {
        setImpl(geoMeanImpl, this.geoMeanImpl);
    }

    /**
     * Returns the currently configured mean implementation
     *
     * @return a copy of the array of StorelessUnivariateStatistic instances
     * implementing the mean (the array is copied, its entries are not)
     */
    public StorelessUnivariateStatistic[] getMeanImpl() {
        return (StorelessUnivariateStatistic[]) meanImpl.clone();
    }

    /**
     * <p>Sets the implementation for the mean.</p>
     * <p>This method must be activated before any data has been added - i.e.,
     * before {@link #addValue(double[]) addValue} has been used to add data;
     * otherwise an IllegalStateException will be thrown.</p>
     *
     * @param meanImpl the StorelessUnivariateStatistic instances to use
     * for computing the mean, one per component
     * @throws DimensionMismatchException if the array dimension
     * does not match the one used at construction
     * @throws IllegalStateException if data has already been added
     *  (i.e. if n &gt; 0)
     */
    public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl)
      throws DimensionMismatchException {
        setImpl(meanImpl, this.meanImpl);
    }

    /**
     * Throws IllegalStateException if n &gt; 0.
     */
    private void checkEmpty() {
        if (n > 0) {
            throw new IllegalStateException(
                "Implementations must be configured before values are added.");
        }
    }

    /**
     * Throws DimensionMismatchException if dimension != k.
     * @param dimension dimension to check
     * @throws DimensionMismatchException if dimension != k
     */
    private void checkDimension(int dimension)
      throws DimensionMismatchException {
        if (dimension != k) {
            throw new DimensionMismatchException(dimension, k);
        }
    }

}