View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.math.random;
19  
20  import java.io.EOFException;
21  import java.io.Serializable;
22  import java.io.BufferedReader;
23  import java.io.FileReader;
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.InputStreamReader;
27  import java.net.URL;
28  import java.util.ArrayList;
29  import java.util.List;
30  
31  import org.apache.commons.math.stat.descriptive.SummaryStatistics;
32  import org.apache.commons.math.stat.descriptive.StatisticalSummary;
33  
34  /**
35   * Implements <code>EmpiricalDistribution</code> interface.  This implementation
36   * uses what amounts to the
37   * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
38   * Variable Kernel Method</a> with Gaussian smoothing:<p>
39   * <strong>Digesting the input file</strong>
40   * <ol><li>Pass the file once to compute min and max.</li>
41   * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
42   * <li>Pass the data file again, computing bin counts and univariate
43   *     statistics (mean, std dev.) for each of the bins </li>
44   * <li>Divide the interval (0,1) into subintervals associated with the bins,
45   *     with the length of a bin's subinterval proportional to its count.</li></ol>
46   * <strong>Generating random values from the distribution</strong><ol>
47   * <li>Generate a uniformly distributed value in (0,1) </li>
48   * <li>Select the subinterval to which the value belongs.
49   * <li>Generate a random Gaussian value with mean = mean of the associated
50   *     bin and std dev = std dev of associated bin.</li></ol></p><p>
51   *<strong>USAGE NOTES:</strong><ul>
52   *<li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
53   *    is to set the bin count to approximately the length of the input file divided
54   *    by 10. </li>
55   *<li>The input file <i>must</i> be a plain text file containing one valid numeric
56   *    entry per line.</li>
57   * </ul></p>
58   *
59   * @version $Revision: 617850 $ $Date: 2008-02-02 11:01:29 -0700 (Sat, 02 Feb 2008) $
60   */
61  public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
62  
63      /** Serializable version identifier */
64      private static final long serialVersionUID = -6773236347582113490L;
65  
66      /** List of SummaryStatistics objects characterizing the bins */
67      private ArrayList binStats = null;
68  
69      /** Sample statistics */
70      private SummaryStatistics sampleStats = null;
71  
72      /** number of bins */
73      private int binCount = 1000;
74  
75      /** is the distribution loaded? */
76      private boolean loaded = false;
77  
78      /** upper bounds of subintervals in (0,1) "belonging" to the bins */
79      private double[] upperBounds = null;
80  
81      /** RandomData instance to use in repeated calls to getNext() */
82      private RandomData randomData = new RandomDataImpl();
83  
84      /**
85       * Creates a new EmpiricalDistribution with the default bin count.
86       */
87      public EmpiricalDistributionImpl() {
88          binStats = new ArrayList();
89      }
90  
91      /**
92       * Creates a new EmpiricalDistribution  with the specified bin count.
93       * 
94       * @param binCount number of bins
95       */
96      public EmpiricalDistributionImpl(int binCount) {
97          this.binCount = binCount;
98          binStats = new ArrayList();
99      }
100 
101      /**
102      * Computes the empirical distribution from the provided
103      * array of numbers.
104      * 
105      * @param in the input data array
106      */
107     public void load(double[] in) {
108         DataAdapter da = new ArrayDataAdapter(in);
109         try {
110             da.computeStats();
111             fillBinStats(in);
112         } catch (Exception e) {
113             throw new RuntimeException(e.getMessage());
114         }
115         loaded = true;
116 
117     }
118 
119     /**
120      * Computes the empirical distribution using data read from a URL.
121      * @param url  url of the input file
122      * 
123      * @throws IOException if an IO error occurs
124      */
125     public void load(URL url) throws IOException {
126         BufferedReader in =
127             new BufferedReader(new InputStreamReader(url.openStream()));
128         try {
129             DataAdapter da = new StreamDataAdapter(in);
130             try {
131                 da.computeStats();
132             } catch (Exception e) {
133                 throw new IOException(e.getMessage());
134             }
135             if (sampleStats.getN() == 0) {
136                 throw new EOFException("URL " + url + " contains no data");
137             }
138             in = new BufferedReader(new InputStreamReader(url.openStream()));
139             fillBinStats(in);
140             loaded = true;
141         } finally {
142            if (in != null) {
143                try {
144                    in.close();
145                } catch (Exception ex) {
146                    // ignore
147                }
148            }
149         }
150     }
151 
152     /**
153      * Computes the empirical distribution from the input file.
154      * 
155      * @param file the input file
156      * @throws IOException if an IO error occurs
157      */
158     public void load(File file) throws IOException {
159         BufferedReader in = new BufferedReader(new FileReader(file));
160         try {
161             DataAdapter da = new StreamDataAdapter(in);
162             try {
163                 da.computeStats();
164             } catch (Exception e) {
165                 throw new IOException(e.getMessage());
166             }
167             in = new BufferedReader(new FileReader(file));
168             fillBinStats(in);
169             loaded = true;
170         } finally {
171             if (in != null) {
172                 try {
173                     in.close();
174                 } catch (Exception ex) {
175                     // ignore
176                 }
177             }
178         }
179     }
180 
181     /**
182      * Provides methods for computing <code>sampleStats</code> and
183      * <code>beanStats</code> abstracting the source of data.
184      */
185     private abstract class DataAdapter{
186         /** 
187          * Compute bin stats.
188          * 
189          * @param min minimum value
190          * @param delta  grid size
191          * @throws Exception  if an error occurs computing bin stats
192          */
193         public abstract void computeBinStats(double min, double delta)
194                 throws Exception;
195         /**
196          * Compute sample statistics.
197          * 
198          * @throws Exception if an error occurs computing sample stats
199          */
200         public abstract void computeStats() throws Exception;
201     }
202     /**
203      * Factory of <code>DataAdapter</code> objects. For every supported source
204      * of data (array of doubles, file, etc.) an instance of the proper object
205      * is returned.
206      */
207     private class DataAdapterFactory{
208         /**
209          * Creates a DataAdapter from a data object
210          * 
211          * @param in object providing access to the data
212          * @return DataAdapter instance
213          */
214         public DataAdapter getAdapter(Object in) {
215             if (in instanceof BufferedReader) {
216                 BufferedReader inputStream = (BufferedReader) in;
217                 return new StreamDataAdapter(inputStream);
218             } else if (in instanceof double[]) {
219                 double[] inputArray = (double[]) in;
220                 return new ArrayDataAdapter(inputArray);
221             } else {
222                 throw new IllegalArgumentException(
223                     "Input data comes from the" + " unsupported source");
224             }
225         }
226     }
227     /**
228      * <code>DataAdapter</code> for data provided through some input stream
229      */
230     private class StreamDataAdapter extends DataAdapter{
231         
232         /** Input stream providng access to the data */
233         private BufferedReader inputStream;
234         
235         /**
236          * Create a StreamDataAdapter from a BufferedReader
237          * 
238          * @param in BufferedReader input stream
239          */
240         public StreamDataAdapter(BufferedReader in){
241             super();
242             inputStream = in;
243         }
244         /**
245          * Computes binStats
246          * 
247          * @param min  minimum value
248          * @param delta  grid size
249          * @throws IOException if an IO error occurs
250          */
251         public void computeBinStats(double min, double delta)
252                 throws IOException {
253             String str = null;
254             double val = 0.0d;
255             while ((str = inputStream.readLine()) != null) {
256                 val = Double.parseDouble(str);
257                 SummaryStatistics stats =
258                     (SummaryStatistics) binStats.get(findBin(min, val, delta));
259                 stats.addValue(val);
260             }
261 
262             inputStream.close();
263             inputStream = null;
264         }
265         /**
266          * Computes sampleStats
267          * 
268          * @throws IOException if an IOError occurs
269          */
270         public void computeStats() throws IOException {
271             String str = null;
272             double val = 0.0;
273             sampleStats = new SummaryStatistics();
274             while ((str = inputStream.readLine()) != null) {
275                 val = new Double(str).doubleValue();
276                 sampleStats.addValue(val);
277             }
278             inputStream.close();
279             inputStream = null;
280         }
281     }
282 
283     /**
284      * <code>DataAdapter</code> for data provided as array of doubles.
285      */
286     private class ArrayDataAdapter extends DataAdapter{
287         
288         /** Array of input  data values */
289         private double[] inputArray;
290         
291         /**
292          * Construct an ArrayDataAdapter from a double[] array
293          * 
294          * @param in double[] array holding the data
295          */
296         public ArrayDataAdapter(double[] in){
297             super();
298             inputArray = in;
299         }
300         /**
301          * Computes sampleStats
302          * 
303          * @throws IOException if an IO error occurs
304          */
305         public void computeStats() throws IOException {
306             sampleStats = new SummaryStatistics();
307             for (int i = 0; i < inputArray.length; i++) {
308                 sampleStats.addValue(inputArray[i]);
309             }
310         }
311         /**
312          * Computes binStats
313          * 
314          * @param min  minimum value
315          * @param delta  grid size
316          * @throws IOException  if an IO error occurs
317          */
318         public void computeBinStats(double min, double delta)
319             throws IOException {
320             for (int i = 0; i < inputArray.length; i++) {
321                 SummaryStatistics stats =
322                     (SummaryStatistics) binStats.get(
323                             findBin(min, inputArray[i], delta));
324                 stats.addValue(inputArray[i]);
325             }
326         }
327     }
328 
329     /**
330      * Fills binStats array (second pass through data file).
331      * 
332      * @param in object providing access to the data
333      * @throws IOException  if an IO error occurs
334      */
335     private void fillBinStats(Object in) throws IOException {
336         // Load array of bin upper bounds -- evenly spaced from min - max
337         double min = sampleStats.getMin();
338         double max = sampleStats.getMax();
339         double delta = (max - min)/(new Double(binCount)).doubleValue();
340         double[] binUpperBounds = new double[binCount];
341         binUpperBounds[0] = min + delta;
342         for (int i = 1; i< binCount - 1; i++) {
343             binUpperBounds[i] = binUpperBounds[i-1] + delta;
344         }
345         binUpperBounds[binCount -1] = max;
346 
347         // Initialize binStats ArrayList
348         if (!binStats.isEmpty()) {
349             binStats.clear();
350         }
351         for (int i = 0; i < binCount; i++) {
352             SummaryStatistics stats = new SummaryStatistics();
353             binStats.add(i,stats);
354         }
355 
356         // Filling data in binStats Array
357         DataAdapterFactory aFactory = new DataAdapterFactory();
358         DataAdapter da = aFactory.getAdapter(in);
359         try {
360             da.computeBinStats(min, delta);
361         } catch (Exception e) {
362             if(e instanceof RuntimeException){
363                 throw new RuntimeException(e.getMessage());
364             }else{
365                 throw new IOException(e.getMessage());
366             }
367         }
368 
369         // Assign upperBounds based on bin counts
370         upperBounds = new double[binCount];
371         upperBounds[0] =
372         ((double)((SummaryStatistics)binStats.get(0)).getN())/
373         (double)sampleStats.getN();
374         for (int i = 1; i < binCount-1; i++) {
375             upperBounds[i] = upperBounds[i-1] +
376             ((double)((SummaryStatistics)binStats.get(i)).getN())/
377             (double)sampleStats.getN();
378         }
379         upperBounds[binCount-1] = 1.0d;
380     }
381     
382     /**
383      * Returns the index of the bin to which the given value belongs
384      * 
385      * @param min  the minimum value
386      * @param value  the value whose bin we are trying to find
387      * @param delta  the grid size
388      * @return the index of the bin containing the value
389      */
390     private int findBin(double min, double value, double delta) {
391         return Math.min(
392                 Math.max((int) Math.ceil((value- min) / delta) - 1, 0), 
393                 binCount - 1);
394         }
395 
396     /**
397      * Generates a random value from this distribution.
398      * 
399      * @return the random value.
400      * @throws IllegalStateException if the distribution has not been loaded
401      */
402     public double getNextValue() throws IllegalStateException {
403 
404         if (!loaded) {
405             throw new IllegalStateException("distribution not loaded");
406         }
407 
408         // Start with a uniformly distributed random number in (0,1)
409         double x = Math.random();
410 
411         // Use this to select the bin and generate a Gaussian within the bin
412         for (int i = 0; i < binCount; i++) {
413            if (x <= upperBounds[i]) {
414                SummaryStatistics stats = (SummaryStatistics)binStats.get(i);
415                if (stats.getN() > 0) {
416                    if (stats.getStandardDeviation() > 0) {  // more than one obs
417                         return randomData.nextGaussian
418                             (stats.getMean(),stats.getStandardDeviation());
419                    } else {
420                        return stats.getMean(); // only one obs in bin
421                    }
422                }
423            }
424         }
425         throw new RuntimeException("No bin selected");
426     }
427 
428     /**
429      * Returns a {@link StatisticalSummary} describing this distribution.
430      * <strong>Preconditions:</strong><ul>
431      * <li>the distribution must be loaded before invoking this method</li></ul>
432      * 
433      * @return the sample statistics
434      * @throws IllegalStateException if the distribution has not been loaded
435      */
436     public StatisticalSummary getSampleStats() {
437         return sampleStats;
438     }
439 
440     /**
441      * Returns the number of bins.
442      * 
443      * @return the number of bins.
444      */
445     public int getBinCount() {
446         return binCount;
447     }
448 
449     /**
450      * Returns an ArrayList of {@link SummaryStatistics} instances containing
451      * statistics describing the values in each of the bins.  The ArrayList is
452      * indexed on the bin number.
453      * 
454      * @return List of bin statistics.
455      */
456     public List getBinStats() {
457         return binStats;
458     }
459 
460     /**
461      * Returns (a fresh copy of) the array of upper bounds for the bins.
462        Bins are: <br/>
463      * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
464      *  (upperBounds[binCount-1],max]
465      * 
466      * @return array of bin upper bounds
467      */
468     public double[] getUpperBounds() {
469         int len = upperBounds.length;
470         double[] out = new double[len];
471         System.arraycopy(upperBounds, 0, out, 0, len);
472         return out;
473     }
474 
475     /**
476      * Property indicating whether or not the distribution has been loaded.
477      * 
478      * @return true if the distribution has been loaded
479      */
480     public boolean isLoaded() {
481         return loaded;
482     }
483 }