View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.math.stat.descriptive;
18  
19  import java.io.Serializable;
20  import java.util.Arrays;
21  
22  import org.apache.commons.math.DimensionMismatchException;
23  import org.apache.commons.math.linear.RealMatrix;
24  import org.apache.commons.math.stat.descriptive.moment.GeometricMean;
25  import org.apache.commons.math.stat.descriptive.moment.Mean;
26  import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance;
27  import org.apache.commons.math.stat.descriptive.rank.Max;
28  import org.apache.commons.math.stat.descriptive.rank.Min;
29  import org.apache.commons.math.stat.descriptive.summary.Sum;
30  import org.apache.commons.math.stat.descriptive.summary.SumOfLogs;
31  import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
32  import org.apache.commons.math.util.MathUtils;
33  
34  /**
35   * <p>Computes summary statistics for a stream of n-tuples added using the 
36   * {@link #addValue(double[]) addValue} method. The data values are not stored
37   * in memory, so this class can be used to compute statistics for very large
38   * n-tuple streams.</p>
39   * 
40   * <p>The {@link StorelessUnivariateStatistic} instances used to maintain
41   * summary state and compute statistics are configurable via setters.
42   * For example, the default implementation for the mean can be overridden by
43   * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual
44   * parameters to these methods must implement the 
45   * {@link StorelessUnivariateStatistic} interface and configuration must be
46   * completed before <code>addValue</code> is called. No configuration is
47   * necessary to use the default, commons-math provided implementations.</p>
48   * 
49   * <p>To compute statistics for a stream of n-tuples, construct a
50   * MultivariateSummaryStatistics instance with dimension n and then use
51   * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code>
52   * methods where Xxx is a statistic return an array of <code>double</code>
53   * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the
54   * value of the given statistic for data range consisting of the i<sup>th</sup> element of
55   * each of the input n-tuples.  For example, if <code>addValue</code> is called
56   * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8},
57   * <code>getSum</code> will return a three-element array with values
58   * {0+3+6, 1+4+7, 2+5+8}</p>
59   * 
60   * <p>Note: This class is not thread-safe. Use 
61   * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple
62   * threads is required.</p>
63   *
64   * @since 1.2
65   * @version $Revision: 618097 $ $Date: 2008-02-03 22:39:08 +0100 (dim., 03 févr. 2008) $
66   */
67  public class MultivariateSummaryStatistics
68    implements StatisticalMultivariateSummary, Serializable {
69  
70      /** Serialization UID */
71      private static final long serialVersionUID = 2271900808994826718L;
72  
73      /**
74       * Construct a MultivariateSummaryStatistics instance
75       * @param k dimension of the data
76       * @param isCovarianceBiasCorrected if true, the unbiased sample
77       * covariance is computed, otherwise the biased population covariance
78       * is computed
79       */
80      public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) {
81          this.k = k;
82  
83          sumImpl     = new StorelessUnivariateStatistic[k];
84          sumSqImpl   = new StorelessUnivariateStatistic[k];
85          minImpl     = new StorelessUnivariateStatistic[k];
86          maxImpl     = new StorelessUnivariateStatistic[k];
87          sumLogImpl  = new StorelessUnivariateStatistic[k];
88          geoMeanImpl = new StorelessUnivariateStatistic[k];
89          meanImpl    = new StorelessUnivariateStatistic[k];
90  
91          for (int i = 0; i < k; ++i) {
92              sumImpl[i]     = new Sum();
93              sumSqImpl[i]   = new SumOfSquares();
94              minImpl[i]     = new Min();
95              maxImpl[i]     = new Max();
96              sumLogImpl[i]  = new SumOfLogs();
97              geoMeanImpl[i] = new GeometricMean();
98              meanImpl[i]    = new Mean();
99          }
100 
101         covarianceImpl =
102             new VectorialCovariance(k, isCovarianceBiasCorrected);
103 
104     }
105 
106     /** Dimension of the data. */
107     private int k;
108 
109     /** Count of values that have been added */
110     private long n = 0;
111     
112     /** Sum statistic implementation - can be reset by setter. */
113     private StorelessUnivariateStatistic[] sumImpl;
114     
115     /** Sum of squares statistic implementation - can be reset by setter. */
116     private StorelessUnivariateStatistic[] sumSqImpl;
117     
118     /** Minimum statistic implementation - can be reset by setter. */
119     private StorelessUnivariateStatistic[] minImpl;
120     
121     /** Maximum statistic implementation - can be reset by setter. */
122     private StorelessUnivariateStatistic[] maxImpl;
123     
124     /** Sum of log statistic implementation - can be reset by setter. */
125     private StorelessUnivariateStatistic[] sumLogImpl;
126     
127     /** Geometric mean statistic implementation - can be reset by setter. */
128     private StorelessUnivariateStatistic[] geoMeanImpl;
129     
130     /** Mean statistic implementation - can be reset by setter. */
131     private StorelessUnivariateStatistic[] meanImpl;
132     
133     /** Covariance statistic implementation - cannot be reset. */
134     private VectorialCovariance covarianceImpl;
135 
136     /**
137      * Add an n-tuple to the data
138      * 
139      * @param value  the n-tuple to add
140      * @throws DimensionMismatchException if the length of the array
141      * does not match the one used at construction
142      */
143     public void addValue(double[] value)
144       throws DimensionMismatchException {
145         checkDimension(value.length);
146         for (int i = 0; i < k; ++i) {
147             double v = value[i];
148             sumImpl[i].increment(v);
149             sumSqImpl[i].increment(v);
150             minImpl[i].increment(v);
151             maxImpl[i].increment(v);
152             sumLogImpl[i].increment(v);
153             geoMeanImpl[i].increment(v);
154             meanImpl[i].increment(v);
155         }
156         covarianceImpl.increment(value);
157         n++;
158     }
159 
160     /** 
161      * Returns the dimension of the data
162      * @return The dimension of the data
163      */
164     public int getDimension() {
165         return k;
166     }
167 
168     /** 
169      * Returns the number of available values
170      * @return The number of available values
171      */
172     public long getN() {
173         return n;
174     }
175 
176     /**
177      * Returns an array of the results of a statistic.
178      * @param stats univariate statistic array
179      * @return results array
180      */
181     private double[] getResults(StorelessUnivariateStatistic[] stats) {
182         double[] results = new double[stats.length];
183         for (int i = 0; i < results.length; ++i) {
184             results[i] = stats[i].getResult();
185         }
186         return results;
187     }
188 
189     /**
190      * Returns an array whose i<sup>th</sup> entry is the sum of the
191      * i<sup>th</sup> entries of the arrays that have been added using 
192      * {@link #addValue(double[])}
193      * 
194      * @return the array of component sums
195      */
196     public double[] getSum() {
197         return getResults(sumImpl);
198     }
199 
200     /**
201      * Returns an array whose i<sup>th</sup> entry is the sum of squares of the
202      * i<sup>th</sup> entries of the arrays that have been added using 
203      * {@link #addValue(double[])}
204      * 
205      * @return the array of component sums of squares
206      */
207     public double[] getSumSq() {
208         return getResults(sumSqImpl);
209     }
210 
211     /**
212      * Returns an array whose i<sup>th</sup> entry is the sum of logs of the
213      * i<sup>th</sup> entries of the arrays that have been added using 
214      * {@link #addValue(double[])}
215      * 
216      * @return the array of component log sums
217      */
218     public double[] getSumLog() {
219         return getResults(sumLogImpl);
220     }
221 
222     /**
223      * Returns an array whose i<sup>th</sup> entry is the mean of the
224      * i<sup>th</sup> entries of the arrays that have been added using 
225      * {@link #addValue(double[])}
226      * 
227      * @return the array of component means
228      */
229     public double[] getMean() {
230         return getResults(meanImpl);
231     }
232 
233     /**
234      * Returns an array whose i<sup>th</sup> entry is the standard deviation of the
235      * i<sup>th</sup> entries of the arrays that have been added using 
236      * {@link #addValue(double[])}
237      * 
238      * @return the array of component standard deviations
239      */
240     public double[] getStandardDeviation() {
241         double[] stdDev = new double[k];
242         if (getN() < 1) {
243             Arrays.fill(stdDev, Double.NaN);
244         } else if (getN() < 2) {
245             Arrays.fill(stdDev, 0.0);
246         } else {
247             RealMatrix matrix = covarianceImpl.getResult();
248             for (int i = 0; i < k; ++i) {
249                 stdDev[i] = Math.sqrt(matrix.getEntry(i, i));
250             }
251         }
252         return stdDev;
253     }
254 
255     /**
256      * Returns the covariance matrix of the values that have been added.
257      *
258      * @return the covariance matrix 
259      */
260     public RealMatrix getCovariance() {
261         return covarianceImpl.getResult();
262     }
263 
264     /**
265      * Returns an array whose i<sup>th</sup> entry is the maximum of the
266      * i<sup>th</sup> entries of the arrays that have been added using 
267      * {@link #addValue(double[])}
268      * 
269      * @return the array of component maxima
270      */
271     public double[] getMax() {
272         return getResults(maxImpl);
273     }
274 
275     /**
276      * Returns an array whose i<sup>th</sup> entry is the minimum of the
277      * i<sup>th</sup> entries of the arrays that have been added using 
278      * {@link #addValue(double[])}
279      * 
280      * @return the array of component minima
281      */
282     public double[] getMin() {
283         return getResults(minImpl);
284     }
285 
286     /**
287      * Returns an array whose i<sup>th</sup> entry is the geometric mean of the
288      * i<sup>th</sup> entries of the arrays that have been added using 
289      * {@link #addValue(double[])}
290      * 
291      * @return the array of component geometric means
292      */
293     public double[] getGeometricMean() {
294         return getResults(geoMeanImpl);
295     }
296     
297     /**
298      * Generates a text report displaying
299      * summary statistics from values that
300      * have been added.
301      * @return String with line feeds displaying statistics
302      */
303     public String toString() {
304         StringBuffer outBuffer = new StringBuffer();
305         outBuffer.append("MultivariateSummaryStatistics:\n");
306         outBuffer.append("n: " + getN() + "\n");
307         append(outBuffer, getMin(), "min: ", ", ", "\n");
308         append(outBuffer, getMax(), "max: ", ", ", "\n");
309         append(outBuffer, getMean(), "mean: ", ", ", "\n");
310         append(outBuffer, getGeometricMean(), "geometric mean: ", ", ", "\n");
311         append(outBuffer, getSumSq(), "sum of squares: ", ", ", "\n");
312         append(outBuffer, getSumLog(), "sum of logarithms: ", ", ", "\n");
313         append(outBuffer, getStandardDeviation(), "standard deviation: ", ", ", "\n");
314         outBuffer.append("covariance: " + getCovariance().toString() + "\n");
315         return outBuffer.toString();
316     }
317 
318     /**
319      * Append a text representation of an array to a buffer.
320      * @param buffer buffer to fill
321      * @param data data array
322      * @param prefix text prefix
323      * @param separator elements separator
324      * @param suffix text suffix
325      */
326     private void append(StringBuffer buffer, double[] data,
327                         String prefix, String separator, String suffix) {
328         buffer.append(prefix);
329         for (int i = 0; i < data.length; ++i) {
330             if (i > 0) {
331                 buffer.append(separator);
332             }
333             buffer.append(data[i]);
334         }
335         buffer.append(suffix);
336     }
337 
338     /** 
339      * Resets all statistics and storage
340      */
341     public void clear() {
342         this.n = 0;
343         for (int i = 0; i < k; ++i) {
344             minImpl[i].clear();
345             maxImpl[i].clear();
346             sumImpl[i].clear();
347             sumLogImpl[i].clear();
348             sumSqImpl[i].clear();
349             geoMeanImpl[i].clear();
350             meanImpl[i].clear();
351         }
352         covarianceImpl.clear();
353     }
354     
355     /**
356      * Returns true iff <code>object</code> is a <code>SummaryStatistics</code>
357      * instance and all statistics have the same values as this.
358      * @param object the object to test equality against.
359      * @return true if object equals this
360      */
361     public boolean equals(Object object) {
362         if (object == this ) {
363             return true;
364         }
365         if (object instanceof MultivariateSummaryStatistics == false) {
366             return false;
367         }
368         MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object;
369         return (MathUtils.equals(stat.getGeometricMean(), 
370                 this.getGeometricMean()) &&
371                 MathUtils.equals(stat.getMax(), this.getMax()) && 
372                 MathUtils.equals(stat.getMean(),this.getMean()) &&
373                 MathUtils.equals(stat.getMin(),this.getMin()) &&
374                 MathUtils.equals(stat.getN(), this.getN()) &&
375                 MathUtils.equals(stat.getSum(), this.getSum()) &&
376                 MathUtils.equals(stat.getSumSq(),this.getSumSq()) &&
377                 MathUtils.equals(stat.getSumLog(),this.getSumLog()) &&
378                 stat.getCovariance().equals(this.getCovariance()));
379     }
380     
381     /**
382      * Returns hash code based on values of statistics
383      * 
384      * @return hash code
385      */
386     public int hashCode() {
387         int result = 31 + MathUtils.hash(getGeometricMean());
388         result = result * 31 + MathUtils.hash(getGeometricMean());
389         result = result * 31 + MathUtils.hash(getMax());
390         result = result * 31 + MathUtils.hash(getMean());
391         result = result * 31 + MathUtils.hash(getMin());
392         result = result * 31 + MathUtils.hash(getN());
393         result = result * 31 + MathUtils.hash(getSum());
394         result = result * 31 + MathUtils.hash(getSumSq());
395         result = result * 31 + MathUtils.hash(getSumLog());
396         result = result * 31 + getCovariance().hashCode();
397         return result;
398     }
399 
400     // Getters and setters for statistics implementations
401     /**
402      * Sets statistics implementations.
403      * @param newImpl new implementations for statistics
404      * @param oldImpl old implementations for statistics
405      * @throws DimensionMismatchException if the array dimension
406      * does not match the one used at construction
407      * @throws IllegalStateException if data has already been added
408      *  (i.e if n > 0)
409      */
410     private void setImpl(StorelessUnivariateStatistic[] newImpl,
411                          StorelessUnivariateStatistic[] oldImpl)
412        throws DimensionMismatchException, IllegalStateException {
413         checkEmpty();
414         checkDimension(newImpl.length);
415         System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length);
416     }
417 
418     /**
419      * Returns the currently configured Sum implementation
420      * 
421      * @return the StorelessUnivariateStatistic implementing the sum
422      */
423     public StorelessUnivariateStatistic[] getSumImpl() {
424         return (StorelessUnivariateStatistic[]) sumImpl.clone();
425     }
426 
427     /**
428      * <p>Sets the implementation for the Sum.</p>
429      * <p>This method must be activated before any data has been added - i.e.,
430      * before {@link #addValue(double[]) addValue} has been used to add data; 
431      * otherwise an IllegalStateException will be thrown.</p>
432      * 
433      * @param sumImpl the StorelessUnivariateStatistic instance to use
434      * for computing the Sum
435      * @throws DimensionMismatchException if the array dimension
436      * does not match the one used at construction
437      * @throws IllegalStateException if data has already been added
438      *  (i.e if n > 0)
439      */
440     public void setSumImpl(StorelessUnivariateStatistic[] sumImpl)
441       throws DimensionMismatchException {
442         setImpl(sumImpl, this.sumImpl);
443     }
444 
445     /**
446      * Returns the currently configured sum of squares implementation
447      * 
448      * @return the StorelessUnivariateStatistic implementing the sum of squares
449      */
450     public StorelessUnivariateStatistic[] getSumsqImpl() {
451         return (StorelessUnivariateStatistic[]) sumSqImpl.clone();
452     }
453 
454     /**
455      * <p>Sets the implementation for the sum of squares.</p>
456      * <p>This method must be activated before any data has been added - i.e.,
457      * before {@link #addValue(double[]) addValue} has been used to add data; 
458      * otherwise an IllegalStateException will be thrown.</p>
459      * 
460      * @param sumsqImpl the StorelessUnivariateStatistic instance to use
461      * for computing the sum of squares
462      * @throws DimensionMismatchException if the array dimension
463      * does not match the one used at construction
464      * @throws IllegalStateException if data has already been added
465      *  (i.e if n > 0)
466      */
467     public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl)
468       throws DimensionMismatchException {
469         setImpl(sumsqImpl, this.sumSqImpl);
470     }
471 
472     /**
473      * Returns the currently configured minimum implementation
474      * 
475      * @return the StorelessUnivariateStatistic implementing the minimum
476      */
477     public StorelessUnivariateStatistic[] getMinImpl() {
478         return (StorelessUnivariateStatistic[]) minImpl.clone();
479     }
480 
481     /**
482      * <p>Sets the implementation for the minimum.</p>
483      * <p>This method must be activated before any data has been added - i.e.,
484      * before {@link #addValue(double[]) addValue} has been used to add data; 
485      * otherwise an IllegalStateException will be thrown.</p>
486      * 
487      * @param minImpl the StorelessUnivariateStatistic instance to use
488      * for computing the minimum
489      * @throws DimensionMismatchException if the array dimension
490      * does not match the one used at construction
491      * @throws IllegalStateException if data has already been added
492      *  (i.e if n > 0)
493      */
494     public void setMinImpl(StorelessUnivariateStatistic[] minImpl)
495       throws DimensionMismatchException {
496         setImpl(minImpl, this.minImpl);
497     }
498 
499     /**
500      * Returns the currently configured maximum implementation
501      * 
502      * @return the StorelessUnivariateStatistic implementing the maximum
503      */
504     public StorelessUnivariateStatistic[] getMaxImpl() {
505         return (StorelessUnivariateStatistic[]) maxImpl.clone();
506     }
507 
508     /**
509      * <p>Sets the implementation for the maximum.</p>
510      * <p>This method must be activated before any data has been added - i.e.,
511      * before {@link #addValue(double[]) addValue} has been used to add data; 
512      * otherwise an IllegalStateException will be thrown.</p>
513      * 
514      * @param maxImpl the StorelessUnivariateStatistic instance to use
515      * for computing the maximum
516      * @throws DimensionMismatchException if the array dimension
517      * does not match the one used at construction
518      * @throws IllegalStateException if data has already been added
519      *  (i.e if n > 0)
520      */
521     public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl)
522       throws DimensionMismatchException {
523         setImpl(maxImpl, this.maxImpl);
524     }
525 
526     /**
527      * Returns the currently configured sum of logs implementation
528      * 
529      * @return the StorelessUnivariateStatistic implementing the log sum
530      */
531     public StorelessUnivariateStatistic[] getSumLogImpl() {
532         return (StorelessUnivariateStatistic[]) sumLogImpl.clone();
533     }
534 
535     /**
536      * <p>Sets the implementation for the sum of logs.</p>
537      * <p>This method must be activated before any data has been added - i.e.,
538      * before {@link #addValue(double[]) addValue} has been used to add data; 
539      * otherwise an IllegalStateException will be thrown.</p>
540      * 
541      * @param sumLogImpl the StorelessUnivariateStatistic instance to use
542      * for computing the log sum
543      * @throws DimensionMismatchException if the array dimension
544      * does not match the one used at construction
545      * @throws IllegalStateException if data has already been added 
546      *  (i.e if n > 0)
547      */
548     public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl)
549       throws DimensionMismatchException {
550         setImpl(sumLogImpl, this.sumLogImpl);
551     }
552 
553     /**
554      * Returns the currently configured geometric mean implementation
555      * 
556      * @return the StorelessUnivariateStatistic implementing the geometric mean
557      */
558     public StorelessUnivariateStatistic[] getGeoMeanImpl() {
559         return (StorelessUnivariateStatistic[]) geoMeanImpl.clone();
560     }
561 
562     /**
563      * <p>Sets the implementation for the geometric mean.</p>
564      * <p>This method must be activated before any data has been added - i.e.,
565      * before {@link #addValue(double[]) addValue} has been used to add data; 
566      * otherwise an IllegalStateException will be thrown.</p>
567      * 
568      * @param geoMeanImpl the StorelessUnivariateStatistic instance to use
569      * for computing the geometric mean
570      * @throws DimensionMismatchException if the array dimension
571      * does not match the one used at construction
572      * @throws IllegalStateException if data has already been added
573      *  (i.e if n > 0)
574      */
575     public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl)
576       throws DimensionMismatchException {
577         setImpl(geoMeanImpl, this.geoMeanImpl);
578     }
579 
580     /**
581      * Returns the currently configured mean implementation
582      * 
583      * @return the StorelessUnivariateStatistic implementing the mean
584      */
585     public StorelessUnivariateStatistic[] getMeanImpl() {
586         return (StorelessUnivariateStatistic[]) meanImpl.clone();
587     }
588 
589     /**
590      * <p>Sets the implementation for the mean.</p>
591      * <p>This method must be activated before any data has been added - i.e.,
592      * before {@link #addValue(double[]) addValue} has been used to add data; 
593      * otherwise an IllegalStateException will be thrown.</p>
594      * 
595      * @param meanImpl the StorelessUnivariateStatistic instance to use
596      * for computing the mean
597      * @throws DimensionMismatchException if the array dimension
598      * does not match the one used at construction
599      * @throws IllegalStateException if data has already been added
600      *  (i.e if n > 0)
601      */
602     public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl)
603       throws DimensionMismatchException {
604         setImpl(meanImpl, this.meanImpl);
605     }
606 
607     /**
608      * Throws IllegalStateException if n > 0.
609      */
610     private void checkEmpty() {
611         if (n > 0) {
612             throw new IllegalStateException(
613                 "Implementations must be configured before values are added.");
614         }
615     }
616 
617     /**
618      * Throws DimensionMismatchException if dimension != k.
619      * @param dimension dimension to check
620      * @throws DimensionMismatchException if dimension != k
621      */
622     private void checkDimension(int dimension)
623       throws DimensionMismatchException {
624         if (dimension != k) {
625             throw new DimensionMismatchException(dimension, k);
626         }
627     }
628 
629 }