1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.math.random;
18
19 import java.io.Serializable;
20 import java.io.BufferedReader;
21 import java.io.FileReader;
22 import java.io.File;
23 import java.io.IOException;
24 import java.io.InputStreamReader;
25 import java.net.URL;
26 import java.util.ArrayList;
27 import java.util.List;
28
29 import org.apache.commons.math.stat.descriptive.SummaryStatistics;
30 import org.apache.commons.math.stat.descriptive.StatisticalSummary;
31
32 /***
33 * Implements <code>EmpiricalDistribution</code> interface. This implementation
34 * uses what amounts to the
35 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
36 * Variable Kernel Method</a> with Gaussian smoothing:<p>
37 * <strong>Digesting the input file</strong>
38 * <ol><li>Pass the file once to compute min and max.</li>
39 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
40 * <li>Pass the data file again, computing bin counts and univariate
41 * statistics (mean, std dev.) for each of the bins </li>
42 * <li>Divide the interval (0,1) into subintervals associated with the bins,
43 * with the length of a bin's subinterval proportional to its count.</li></ol>
44 * <strong>Generating random values from the distribution</strong><ol>
45 * <li>Generate a uniformly distributed value in (0,1) </li>
46 * <li>Select the subinterval to which the value belongs.
47 * <li>Generate a random Gaussian value with mean = mean of the associated
48 * bin and std dev = std dev of associated bin.</li></ol></p><p>
49 *<strong>USAGE NOTES:</strong><ul>
50 *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb
51 * is to set the bin count to approximately the length of the input file divided
52 * by 10. </li>
53 *<li>The input file <i>must</i> be a plain text file containing one valid numeric
54 * entry per line.</li>
55 * </ul></p>
56 *
57 * @version $Revision: 1.30 $ $Date: 2004/10/08 05:08:16 $
58 */
59 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
60
61 /*** Serializable version identifier */
62 static final long serialVersionUID = -6773236347582113490L;
63
64 /*** List of SummaryStatistics objects characterizing the bins */
65 private ArrayList binStats = null;
66
67 /*** Sample statistics */
68 SummaryStatistics sampleStats = null;
69
70 /*** number of bins */
71 private int binCount = 1000;
72
73 /*** is the distribution loaded? */
74 private boolean loaded = false;
75
76 /*** upper bounds of subintervals in (0,1) "belonging" to the bins */
77 private double[] upperBounds = null;
78
79 /*** RandomData instance to use in repeated calls to getNext() */
80 private RandomData randomData = new RandomDataImpl();
81
82 /***
83 * Creates a new EmpiricalDistribution with the default bin count.
84 */
85 public EmpiricalDistributionImpl() {
86 binStats = new ArrayList();
87 }
88
89 /***
90 * Creates a new EmpiricalDistribution with the specified bin count.
91 *
92 * @param binCount number of bins
93 */
94 public EmpiricalDistributionImpl(int binCount) {
95 this.binCount = binCount;
96 binStats = new ArrayList();
97 }
98
99 /***
100 * Computes the empirical distribution from the provided
101 * array of numbers.
102 *
103 * @param in the input data array
104 */
105 public void load(double[] in) {
106 DataAdapter da = new ArrayDataAdapter(in);
107 try {
108 da.computeStats();
109 fillBinStats(in);
110 } catch (Exception e) {
111 throw new RuntimeException(e.getMessage());
112 }
113 loaded = true;
114
115 }
116
117 /***
118 * Computes the empirical distribution using data read from a URL.
119 * @param url url of the input file
120 *
121 * @throws IOException if an IO error occurs
122 */
123 public void load(URL url) throws IOException {
124 BufferedReader in =
125 new BufferedReader(new InputStreamReader(url.openStream()));
126 try {
127 DataAdapter da = new StreamDataAdapter(in);
128 try {
129 da.computeStats();
130 } catch (Exception e) {
131 throw new IOException(e.getMessage());
132 }
133 in = new BufferedReader(new InputStreamReader(url.openStream()));
134 fillBinStats(in);
135 loaded = true;
136 } finally {
137 if (in != null) {
138 try {
139 in.close();
140 } catch (Exception ex) {
141
142 }
143 }
144 }
145 }
146
147 /***
148 * Computes the empirical distribution from the input file.
149 *
150 * @param file the input file
151 * @throws IOException if an IO error occurs
152 */
153 public void load(File file) throws IOException {
154 BufferedReader in = new BufferedReader(new FileReader(file));
155 try {
156 DataAdapter da = new StreamDataAdapter(in);
157 try {
158 da.computeStats();
159 } catch (Exception e) {
160 throw new IOException(e.getMessage());
161 }
162 in = new BufferedReader(new FileReader(file));
163 fillBinStats(in);
164 loaded = true;
165 } finally {
166 if (in != null) {
167 try {
168 in.close();
169 } catch (Exception ex) {
170
171 }
172 }
173 }
174 }
175
176 /***
177 * Provides methods for computing <code>sampleStats</code> and
178 * <code>beanStats</code> abstracting the source of data.
179 */
180 private abstract class DataAdapter{
181 /***
182 * Compute bin stats.
183 *
184 * @param min minimum value
185 * @param delta grid size
186 * @throws Exception if an error occurs computing bin stats
187 */
188 public abstract void computeBinStats(double min, double delta)
189 throws Exception;
190 /***
191 * Compute sample statistics.
192 *
193 * @throws Exception if an error occurs computing sample stats
194 */
195 public abstract void computeStats() throws Exception;
196 }
197 /***
198 * Factory of <code>DataAdapter</code> objects. For every supported source
199 * of data (array of doubles, file, etc.) an instance of the proper object
200 * is returned.
201 */
202 private class DataAdapterFactory{
203 /***
204 * Creates a DataAdapter from a data object
205 *
206 * @param in object providing access to the data
207 * @return DataAdapter instance
208 */
209 public DataAdapter getAdapter(Object in) {
210 if (in instanceof BufferedReader) {
211 BufferedReader inputStream = (BufferedReader) in;
212 return new StreamDataAdapter(inputStream);
213 } else if (in instanceof double[]) {
214 double[] inputArray = (double[]) in;
215 return new ArrayDataAdapter(inputArray);
216 } else {
217 throw new IllegalArgumentException(
218 "Input data comes from the" + " unsupported source");
219 }
220 }
221 }
222 /***
223 * <code>DataAdapter</code> for data provided through some input stream
224 */
225 private class StreamDataAdapter extends DataAdapter{
226
227 /*** Input stream providng access to the data */
228 BufferedReader inputStream;
229
230 /***
231 * Create a StreamDataAdapter from a BufferedReader
232 *
233 * @param in BufferedReader input stream
234 */
235 public StreamDataAdapter(BufferedReader in){
236 super();
237 inputStream = in;
238 }
239 /***
240 * Computes binStats
241 *
242 * @param min minimum value
243 * @param delta grid size
244 * @throws IOException if an IO error occurs
245 */
246 public void computeBinStats(double min, double delta)
247 throws IOException {
248 String str = null;
249 double val = 0.0d;
250 while ((str = inputStream.readLine()) != null) {
251 val = Double.parseDouble(str);
252 SummaryStatistics stats =
253 (SummaryStatistics) binStats.get(
254 Math.max((int) Math.ceil((val - min) / delta) - 1, 0));
255 stats.addValue(val);
256 }
257
258 inputStream.close();
259 inputStream = null;
260 }
261 /***
262 * Computes sampleStats
263 *
264 * @throws IOException if an IOError occurs
265 */
266 public void computeStats() throws IOException {
267 String str = null;
268 double val = 0.0;
269 sampleStats = SummaryStatistics.newInstance();
270 while ((str = inputStream.readLine()) != null) {
271 val = new Double(str).doubleValue();
272 sampleStats.addValue(val);
273 }
274 inputStream.close();
275 inputStream = null;
276 }
277 }
278
279 /***
280 * <code>DataAdapter</code> for data provided as array of doubles.
281 */
282 private class ArrayDataAdapter extends DataAdapter{
283
284 /*** Array of input data values */
285 private double[] inputArray;
286
287 /***
288 * Construct an ArrayDataAdapter from a double[] array
289 *
290 * @param in double[] array holding the data
291 */
292 public ArrayDataAdapter(double[] in){
293 super();
294 inputArray = in;
295 }
296 /***
297 * Computes sampleStats
298 *
299 * @throws IOException if an IO error occurs
300 */
301 public void computeStats() throws IOException {
302 sampleStats = SummaryStatistics.newInstance();
303 for (int i = 0; i < inputArray.length; i++) {
304 sampleStats.addValue(inputArray[i]);
305 }
306 }
307 /***
308 * Computes binStats
309 *
310 * @param min minimum value
311 * @param delta grid size
312 * @throws IOException if an IO error occurs
313 */
314 public void computeBinStats(double min, double delta)
315 throws IOException {
316 for (int i = 0; i < inputArray.length; i++) {
317 SummaryStatistics stats =
318 (SummaryStatistics) binStats.get(
319 Math.max((int) Math.ceil(
320 (inputArray[i] - min) / delta)- 1, 0));
321 stats.addValue(inputArray[i]);
322 }
323 }
324 }
325
326 /***
327 * Fills binStats array (second pass through data file).
328 *
329 * @param in object providing access to the data
330 * @throws IOException if an IO error occurs
331 */
332 private void fillBinStats(Object in) throws IOException {
333
334 double min = sampleStats.getMin();
335 double max = sampleStats.getMax();
336 double delta = (max - min)/(new Double(binCount)).doubleValue();
337 double[] binUpperBounds = new double[binCount];
338 binUpperBounds[0] = min + delta;
339 for (int i = 1; i< binCount - 1; i++) {
340 binUpperBounds[i] = binUpperBounds[i-1] + delta;
341 }
342 binUpperBounds[binCount -1] = max;
343
344
345 if (!binStats.isEmpty()) {
346 binStats.clear();
347 }
348 for (int i = 0; i < binCount; i++) {
349 SummaryStatistics stats = SummaryStatistics.newInstance();
350 binStats.add(i,stats);
351 }
352
353
354 DataAdapterFactory aFactory = new DataAdapterFactory();
355 DataAdapter da = aFactory.getAdapter(in);
356 try {
357 da.computeBinStats(min, delta);
358 } catch (Exception e) {
359 if(e instanceof RuntimeException){
360 throw new RuntimeException(e.getMessage());
361 }else{
362 throw new IOException(e.getMessage());
363 }
364 }
365
366
367 upperBounds = new double[binCount];
368 upperBounds[0] =
369 ((double)((SummaryStatistics)binStats.get(0)).getN())/
370 (double)sampleStats.getN();
371 for (int i = 1; i < binCount-1; i++) {
372 upperBounds[i] = upperBounds[i-1] +
373 ((double)((SummaryStatistics)binStats.get(i)).getN())/
374 (double)sampleStats.getN();
375 }
376 upperBounds[binCount-1] = 1.0d;
377 }
378
379 /***
380 * Generates a random value from this distribution.
381 *
382 * @return the random value.
383 * @throws IllegalStateException if the distribution has not been loaded
384 */
385 public double getNextValue() throws IllegalStateException {
386
387 if (!loaded) {
388 throw new IllegalStateException("distribution not loaded");
389 }
390
391
392 double x = Math.random();
393
394
395 for (int i = 0; i < binCount; i++) {
396 if (x <= upperBounds[i]) {
397 SummaryStatistics stats = (SummaryStatistics)binStats.get(i);
398 if (stats.getN() > 0) {
399 if (stats.getStandardDeviation() > 0) {
400 return randomData.nextGaussian
401 (stats.getMean(),stats.getStandardDeviation());
402 } else {
403 return stats.getMean();
404 }
405 }
406 }
407 }
408 throw new RuntimeException("No bin selected");
409 }
410
411 /***
412 * Returns a {@link StatisticalSummary} describing this distribution.
413 * <strong>Preconditions:</strong><ul>
414 * <li>the distribution must be loaded before invoking this method</li></ul>
415 *
416 * @return the sample statistics
417 * @throws IllegalStateException if the distribution has not been loaded
418 */
419 public StatisticalSummary getSampleStats() {
420 return sampleStats;
421 }
422
423 /***
424 * Returns the number of bins.
425 *
426 * @return the number of bins.
427 */
428 public int getBinCount() {
429 return binCount;
430 }
431
432 /***
433 * Returns an ArrayList of {@link SummaryStatistics} instances containing
434 * statistics describing the values in each of the bins. The ArrayList is
435 * indexed on the bin number.
436 *
437 * @return List of bin statistics.
438 */
439 public List getBinStats() {
440 return binStats;
441 }
442
443 /***
444 * Returns the array of upper bounds for the bins. Bins are: <br/>
445 * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
446 * (upperBounds[binCount-1],max]
447 *
448 * @return array of bin upper bounds
449 */
450 public double[] getUpperBounds() {
451 return upperBounds;
452 }
453
454 /***
455 * Property indicating whether or not the distribution has been loaded.
456 *
457 * @return true if the distribution has been loaded
458 */
459 public boolean isLoaded() {
460 return loaded;
461 }
462 }