001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math3.distribution;
019    
020    import org.apache.commons.math3.exception.NotStrictlyPositiveException;
021    import org.apache.commons.math3.exception.util.LocalizedFormats;
022    import org.apache.commons.math3.util.FastMath;
023    import org.apache.commons.math3.random.RandomGenerator;
024    import org.apache.commons.math3.random.Well19937c;
025    
026    /**
027     * Implementation of the Zipf distribution.
028     *
029     * @see <a href="http://mathworld.wolfram.com/ZipfDistribution.html">Zipf distribution (MathWorld)</a>
030     * @version $Id: ZipfDistribution.java 1416643 2012-12-03 19:37:14Z tn $
031     */
032    public class ZipfDistribution extends AbstractIntegerDistribution {
033        /** Serializable version identifier. */
034        private static final long serialVersionUID = -140627372283420404L;
035        /** Number of elements. */
036        private final int numberOfElements;
037        /** Exponent parameter of the distribution. */
038        private final double exponent;
039        /** Cached numerical mean */
040        private double numericalMean = Double.NaN;
041        /** Whether or not the numerical mean has been calculated */
042        private boolean numericalMeanIsCalculated = false;
043        /** Cached numerical variance */
044        private double numericalVariance = Double.NaN;
045        /** Whether or not the numerical variance has been calculated */
046        private boolean numericalVarianceIsCalculated = false;
047    
048        /**
049         * Create a new Zipf distribution with the given number of elements and
050         * exponent.
051         *
052         * @param numberOfElements Number of elements.
053         * @param exponent Exponent.
054         * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
055         * or {@code exponent <= 0}.
056         */
057        public ZipfDistribution(final int numberOfElements, final double exponent) {
058            this(new Well19937c(), numberOfElements, exponent);
059        }
060    
061        /**
062         * Creates a Zipf distribution.
063         *
064         * @param rng Random number generator.
065         * @param numberOfElements Number of elements.
066         * @param exponent Exponent.
067         * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
068         * or {@code exponent <= 0}.
069         * @since 3.1
070         */
071        public ZipfDistribution(RandomGenerator rng,
072                                int numberOfElements,
073                                double exponent)
074            throws NotStrictlyPositiveException {
075            super(rng);
076    
077            if (numberOfElements <= 0) {
078                throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
079                                                       numberOfElements);
080            }
081            if (exponent <= 0) {
082                throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
083                                                       exponent);
084            }
085    
086            this.numberOfElements = numberOfElements;
087            this.exponent = exponent;
088        }
089    
090        /**
091         * Get the number of elements (e.g. corpus size) for the distribution.
092         *
093         * @return the number of elements
094         */
095        public int getNumberOfElements() {
096            return numberOfElements;
097        }
098    
099        /**
100         * Get the exponent characterizing the distribution.
101         *
102         * @return the exponent
103         */
104        public double getExponent() {
105            return exponent;
106        }
107    
108        /** {@inheritDoc} */
109        public double probability(final int x) {
110            if (x <= 0 || x > numberOfElements) {
111                return 0.0;
112            }
113    
114            return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
115        }
116    
117        /** {@inheritDoc} */
118        public double cumulativeProbability(final int x) {
119            if (x <= 0) {
120                return 0.0;
121            } else if (x >= numberOfElements) {
122                return 1.0;
123            }
124    
125            return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
126        }
127    
128        /**
129         * {@inheritDoc}
130         *
131         * For number of elements {@code N} and exponent {@code s}, the mean is
132         * {@code Hs1 / Hs}, where
133         * <ul>
134         *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
135         *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
136         * </ul>
137         */
138        public double getNumericalMean() {
139            if (!numericalMeanIsCalculated) {
140                numericalMean = calculateNumericalMean();
141                numericalMeanIsCalculated = true;
142            }
143            return numericalMean;
144        }
145    
146        /**
147         * Used by {@link #getNumericalMean()}.
148         *
149         * @return the mean of this distribution
150         */
151        protected double calculateNumericalMean() {
152            final int N = getNumberOfElements();
153            final double s = getExponent();
154    
155            final double Hs1 = generalizedHarmonic(N, s - 1);
156            final double Hs = generalizedHarmonic(N, s);
157    
158            return Hs1 / Hs;
159        }
160    
161        /**
162         * {@inheritDoc}
163         *
164         * For number of elements {@code N} and exponent {@code s}, the mean is
165         * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
166         * <ul>
167         *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
168         *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
169         *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
170         * </ul>
171         */
172        public double getNumericalVariance() {
173            if (!numericalVarianceIsCalculated) {
174                numericalVariance = calculateNumericalVariance();
175                numericalVarianceIsCalculated = true;
176            }
177            return numericalVariance;
178        }
179    
180        /**
181         * Used by {@link #getNumericalVariance()}.
182         *
183         * @return the variance of this distribution
184         */
185        protected double calculateNumericalVariance() {
186            final int N = getNumberOfElements();
187            final double s = getExponent();
188    
189            final double Hs2 = generalizedHarmonic(N, s - 2);
190            final double Hs1 = generalizedHarmonic(N, s - 1);
191            final double Hs = generalizedHarmonic(N, s);
192    
193            return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
194        }
195    
196        /**
197         * Calculates the Nth generalized harmonic number. See
198         * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
199         * Series</a>.
200         *
201         * @param n Term in the series to calculate (must be larger than 1)
202         * @param m Exponent (special case {@code m = 1} is the harmonic series).
203         * @return the n<sup>th</sup> generalized harmonic number.
204         */
205        private double generalizedHarmonic(final int n, final double m) {
206            double value = 0;
207            for (int k = n; k > 0; --k) {
208                value += 1.0 / FastMath.pow(k, m);
209            }
210            return value;
211        }
212    
213        /**
214         * {@inheritDoc}
215         *
216         * The lower bound of the support is always 1 no matter the parameters.
217         *
218         * @return lower bound of the support (always 1)
219         */
220        public int getSupportLowerBound() {
221            return 1;
222        }
223    
224        /**
225         * {@inheritDoc}
226         *
227         * The upper bound of the support is the number of elements.
228         *
229         * @return upper bound of the support
230         */
231        public int getSupportUpperBound() {
232            return getNumberOfElements();
233        }
234    
235        /**
236         * {@inheritDoc}
237         *
238         * The support of this distribution is connected.
239         *
240         * @return {@code true}
241         */
242        public boolean isSupportConnected() {
243            return true;
244        }
245    }
246