001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math3.stat.inference;
018    
019    import org.apache.commons.math3.distribution.FDistribution;
020    import org.apache.commons.math3.exception.ConvergenceException;
021    import org.apache.commons.math3.exception.DimensionMismatchException;
022    import org.apache.commons.math3.exception.MaxCountExceededException;
023    import org.apache.commons.math3.exception.NullArgumentException;
024    import org.apache.commons.math3.exception.OutOfRangeException;
025    import org.apache.commons.math3.exception.util.LocalizedFormats;
026    import org.apache.commons.math3.stat.descriptive.summary.Sum;
027    import org.apache.commons.math3.stat.descriptive.summary.SumOfSquares;
028    
029    import java.util.Collection;
030    
031    /**
032     * Implements one-way ANOVA (analysis of variance) statistics.
033     *
034     * <p> Tests for differences between two or more categories of univariate data
035     * (for example, the body mass index of accountants, lawyers, doctors and
036     * computer programmers).  When two categories are given, this is equivalent to
037     * the {@link org.apache.commons.math3.stat.inference.TTest}.
038     * </p><p>
039     * Uses the {@link org.apache.commons.math3.distribution.FDistribution
040     * commons-math F Distribution implementation} to estimate exact p-values.</p>
041     * <p>This implementation is based on a description at
042     * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
043     * <pre>
044     * Abbreviations: bg = between groups,
045     *                wg = within groups,
046     *                ss = sum squared deviations
047     * </pre>
048     *
049     * @since 1.2
050     * @version $Id: OneWayAnova.java 1416643 2012-12-03 19:37:14Z tn $
051     */
052    public class OneWayAnova {
053    
054        /**
055         * Default constructor.
056         */
057        public OneWayAnova() {
058        }
059    
060        /**
061         * Computes the ANOVA F-value for a collection of <code>double[]</code>
062         * arrays.
063         *
064         * <p><strong>Preconditions</strong>: <ul>
065         * <li>The categoryData <code>Collection</code> must contain
066         * <code>double[]</code> arrays.</li>
067         * <li> There must be at least two <code>double[]</code> arrays in the
068         * <code>categoryData</code> collection and each of these arrays must
069         * contain at least two values.</li></ul></p><p>
070         * This implementation computes the F statistic using the definitional
071         * formula<pre>
072         *   F = msbg/mswg</pre>
073         * where<pre>
074         *  msbg = between group mean square
075         *  mswg = within group mean square</pre>
076         * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
077         * here</a></p>
078         *
079         * @param categoryData <code>Collection</code> of <code>double[]</code>
080         * arrays each containing data for one category
081         * @return Fvalue
082         * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
083         * @throws DimensionMismatchException if the length of the <code>categoryData</code>
084         * array is less than 2 or a contained <code>double[]</code> array does not have
085         * at least two values
086         */
087        public double anovaFValue(final Collection<double[]> categoryData)
088            throws NullArgumentException, DimensionMismatchException {
089    
090            AnovaStats a = anovaStats(categoryData);
091            return a.F;
092    
093        }
094    
095        /**
096         * Computes the ANOVA P-value for a collection of <code>double[]</code>
097         * arrays.
098         *
099         * <p><strong>Preconditions</strong>: <ul>
100         * <li>The categoryData <code>Collection</code> must contain
101         * <code>double[]</code> arrays.</li>
102         * <li> There must be at least two <code>double[]</code> arrays in the
103         * <code>categoryData</code> collection and each of these arrays must
104         * contain at least two values.</li></ul></p><p>
105         * This implementation uses the
106         * {@link org.apache.commons.math3.distribution.FDistribution
107         * commons-math F Distribution implementation} to estimate the exact
108         * p-value, using the formula<pre>
109         *   p = 1 - cumulativeProbability(F)</pre>
110         * where <code>F</code> is the F value and <code>cumulativeProbability</code>
111         * is the commons-math implementation of the F distribution.</p>
112         *
113         * @param categoryData <code>Collection</code> of <code>double[]</code>
114         * arrays each containing data for one category
115         * @return Pvalue
116         * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
117         * @throws DimensionMismatchException if the length of the <code>categoryData</code>
118         * array is less than 2 or a contained <code>double[]</code> array does not have
119         * at least two values
120         * @throws ConvergenceException if the p-value can not be computed due to a convergence error
121         * @throws MaxCountExceededException if the maximum number of iterations is exceeded
122         */
123        public double anovaPValue(final Collection<double[]> categoryData)
124            throws NullArgumentException, DimensionMismatchException,
125            ConvergenceException, MaxCountExceededException {
126    
127            AnovaStats a = anovaStats(categoryData);
128            // No try-catch or advertised exception because args are valid
129            FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
130            return 1.0 - fdist.cumulativeProbability(a.F);
131    
132        }
133    
134        /**
135         * Performs an ANOVA test, evaluating the null hypothesis that there
136         * is no difference among the means of the data categories.
137         *
138         * <p><strong>Preconditions</strong>: <ul>
139         * <li>The categoryData <code>Collection</code> must contain
140         * <code>double[]</code> arrays.</li>
141         * <li> There must be at least two <code>double[]</code> arrays in the
142         * <code>categoryData</code> collection and each of these arrays must
143         * contain at least two values.</li>
144         * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
145         * </li></ul></p><p>
146         * This implementation uses the
147         * {@link org.apache.commons.math3.distribution.FDistribution
148         * commons-math F Distribution implementation} to estimate the exact
149         * p-value, using the formula<pre>
150         *   p = 1 - cumulativeProbability(F)</pre>
151         * where <code>F</code> is the F value and <code>cumulativeProbability</code>
152         * is the commons-math implementation of the F distribution.</p>
153         * <p>True is returned iff the estimated p-value is less than alpha.</p>
154         *
155         * @param categoryData <code>Collection</code> of <code>double[]</code>
156         * arrays each containing data for one category
157         * @param alpha significance level of the test
158         * @return true if the null hypothesis can be rejected with
159         * confidence 1 - alpha
160         * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
161         * @throws DimensionMismatchException if the length of the <code>categoryData</code>
162         * array is less than 2 or a contained <code>double[]</code> array does not have
163         * at least two values
164         * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5]
165         * @throws ConvergenceException if the p-value can not be computed due to a convergence error
166         * @throws MaxCountExceededException if the maximum number of iterations is exceeded
167         */
168        public boolean anovaTest(final Collection<double[]> categoryData,
169                                 final double alpha)
170            throws NullArgumentException, DimensionMismatchException,
171            OutOfRangeException, ConvergenceException, MaxCountExceededException {
172    
173            if ((alpha <= 0) || (alpha > 0.5)) {
174                throw new OutOfRangeException(
175                        LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
176                        alpha, 0, 0.5);
177            }
178            return anovaPValue(categoryData) < alpha;
179    
180        }
181    
182        /**
183         * This method actually does the calculations (except P-value).
184         *
185         * @param categoryData <code>Collection</code> of <code>double[]</code>
186         * arrays each containing data for one category
187         * @return computed AnovaStats
188         * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
189         * @throws DimensionMismatchException if the length of the <code>categoryData</code>
190         * array is less than 2 or a contained <code>double[]</code> array does not contain
191         * at least two values
192         */
193        private AnovaStats anovaStats(final Collection<double[]> categoryData)
194            throws NullArgumentException, DimensionMismatchException {
195    
196            if (categoryData == null) {
197                throw new NullArgumentException();
198            }
199    
200            // check if we have enough categories
201            if (categoryData.size() < 2) {
202                throw new DimensionMismatchException(
203                        LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
204                        categoryData.size(), 2);
205            }
206    
207            // check if each category has enough data and all is double[]
208            for (double[] array : categoryData) {
209                if (array.length <= 1) {
210                    throw new DimensionMismatchException(
211                            LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
212                            array.length, 2);
213                }
214            }
215    
216            int dfwg = 0;
217            double sswg = 0;
218            Sum totsum = new Sum();
219            SumOfSquares totsumsq = new SumOfSquares();
220            int totnum = 0;
221    
222            for (double[] data : categoryData) {
223    
224                Sum sum = new Sum();
225                SumOfSquares sumsq = new SumOfSquares();
226                int num = 0;
227    
228                for (int i = 0; i < data.length; i++) {
229                    double val = data[i];
230    
231                    // within category
232                    num++;
233                    sum.increment(val);
234                    sumsq.increment(val);
235    
236                    // for all categories
237                    totnum++;
238                    totsum.increment(val);
239                    totsumsq.increment(val);
240                }
241                dfwg += num - 1;
242                double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num;
243                sswg += ss;
244            }
245            double sst = totsumsq.getResult() - totsum.getResult() *
246                totsum.getResult()/totnum;
247            double ssbg = sst - sswg;
248            int dfbg = categoryData.size() - 1;
249            double msbg = ssbg/dfbg;
250            double mswg = sswg/dfwg;
251            double F = msbg/mswg;
252    
253            return new AnovaStats(dfbg, dfwg, F);
254        }
255    
256        /**
257            Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
258            No get/set methods provided.
259        */
260        private static class AnovaStats {
261    
262            /** Degrees of freedom in numerator (between groups). */
263            private final int dfbg;
264    
265            /** Degrees of freedom in denominator (within groups). */
266            private final int dfwg;
267    
268            /** Statistic. */
269            private final double F;
270    
271            /**
272             * Constructor
273             * @param dfbg degrees of freedom in numerator (between groups)
274             * @param dfwg degrees of freedom in denominator (within groups)
275             * @param F statistic
276             */
277            private AnovaStats(int dfbg, int dfwg, double F) {
278                this.dfbg = dfbg;
279                this.dfwg = dfwg;
280                this.F = F;
281            }
282        }
283    
284    }