View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.mapreduce.hadoopbackport;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.List;
25  import java.util.Random;
26  
27  import org.apache.commons.logging.Log;
28  import org.apache.commons.logging.LogFactory;
29  
30  import org.apache.hadoop.conf.Configuration;
31  import org.apache.hadoop.conf.Configured;
32  import org.apache.hadoop.fs.FileSystem;
33  import org.apache.hadoop.fs.Path;
34  import org.apache.hadoop.io.NullWritable;
35  import org.apache.hadoop.io.RawComparator;
36  import org.apache.hadoop.io.SequenceFile;
37  import org.apache.hadoop.io.WritableComparable;
38  import org.apache.hadoop.mapreduce.InputFormat;
39  import org.apache.hadoop.mapreduce.InputSplit;
40  import org.apache.hadoop.mapreduce.Job;
41  import org.apache.hadoop.mapreduce.RecordReader;
42  import org.apache.hadoop.mapreduce.TaskAttemptContext;
43  import org.apache.hadoop.mapreduce.TaskAttemptID;
44  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
45  import org.apache.hadoop.util.ReflectionUtils;
46  import org.apache.hadoop.util.Tool;
47  import org.apache.hadoop.util.ToolRunner;
48  
49  /**
50   * Utility for collecting samples and writing a partition file for
51   * {@link TotalOrderPartitioner}.
52   *
 * This is an identical copy of o.a.h.mapreduce.lib.partition.InputSampler
 * from Hadoop trunk at r910774, with the exception of replacing
 * TaskAttemptContextImpl with TaskAttemptContext.
56   */
57  public class InputSampler<K,V> extends Configured implements Tool  {
58  
59    private static final Log LOG = LogFactory.getLog(InputSampler.class);
60  
61    static int printUsage() {
62      System.out.println("sampler -r <reduces>\n" +
63        "      [-inFormat <input format class>]\n" +
64        "      [-keyClass <map input & output key class>]\n" +
65        "      [-splitRandom <double pcnt> <numSamples> <maxsplits> | " +
66        "// Sample from random splits at random (general)\n" +
67        "       -splitSample <numSamples> <maxsplits> | " +
68        "             // Sample from first records in splits (random data)\n"+
69        "       -splitInterval <double pcnt> <maxsplits>]" +
70        "             // Sample from splits at intervals (sorted data)");
71      System.out.println("Default sampler: -splitRandom 0.1 10000 10");
72      ToolRunner.printGenericCommandUsage(System.out);
73      return -1;
74    }
75  
76    public InputSampler(Configuration conf) {
77      setConf(conf);
78    }
79  
  /**
   * Interface to sample using an 
   * {@link org.apache.hadoop.mapreduce.InputFormat}.
   *
   * @param <K> type of the keys produced by the input format and returned
   *            as samples
   * @param <V> type of the values produced by the input format
   */
  public interface Sampler<K,V> {
    /**
     * For a given job, collect and return a subset of the keys from the
     * input data.
     *
     * @param inf input format used to obtain the job's splits and to read
     *            records from them
     * @param job job whose input is sampled
     * @return the sampled keys
     * @throws IOException if the input cannot be read
     * @throws InterruptedException if sampling is interrupted
     */
    K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException;
  }
92  
93    /**
94     * Samples the first n records from s splits.
95     * Inexpensive way to sample random data.
96     */
97    public static class SplitSampler<K,V> implements Sampler<K,V> {
98  
99      private final int numSamples;
100     private final int maxSplitsSampled;
101 
102     /**
103      * Create a SplitSampler sampling <em>all</em> splits.
104      * Takes the first numSamples / numSplits records from each split.
105      * @param numSamples Total number of samples to obtain from all selected
106      *                   splits.
107      */
108     public SplitSampler(int numSamples) {
109       this(numSamples, Integer.MAX_VALUE);
110     }
111 
112     /**
113      * Create a new SplitSampler.
114      * @param numSamples Total number of samples to obtain from all selected
115      *                   splits.
116      * @param maxSplitsSampled The maximum number of splits to examine.
117      */
118     public SplitSampler(int numSamples, int maxSplitsSampled) {
119       this.numSamples = numSamples;
120       this.maxSplitsSampled = maxSplitsSampled;
121     }
122 
123     /**
124      * From each split sampled, take the first numSamples / numSplits records.
125      */
126     @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
127     public K[] getSample(InputFormat<K,V> inf, Job job) 
128         throws IOException, InterruptedException {
129       List<InputSplit> splits = inf.getSplits(job);
130       ArrayList<K> samples = new ArrayList<K>(numSamples);
131       int splitsToSample = Math.min(maxSplitsSampled, splits.size());
132       int splitStep = splits.size() / splitsToSample;
133       int samplesPerSplit = numSamples / splitsToSample;
134       long records = 0;
135       for (int i = 0; i < splitsToSample; ++i) {
136         RecordReader<K,V> reader = inf.createRecordReader(
137           splits.get(i * splitStep), 
138           new TaskAttemptContext(job.getConfiguration(), 
139                                  new TaskAttemptID()));
140         while (reader.nextKeyValue()) {
141           samples.add(reader.getCurrentKey());
142           ++records;
143           if ((i+1) * samplesPerSplit <= records) {
144             break;
145           }
146         }
147         reader.close();
148       }
149       return (K[])samples.toArray();
150     }
151   }
152 
153   /**
154    * Sample from random points in the input.
155    * General-purpose sampler. Takes numSamples / maxSplitsSampled inputs from
156    * each split.
157    */
158   public static class RandomSampler<K,V> implements Sampler<K,V> {
159     private double freq;
160     private final int numSamples;
161     private final int maxSplitsSampled;
162 
163     /**
164      * Create a new RandomSampler sampling <em>all</em> splits.
165      * This will read every split at the client, which is very expensive.
166      * @param freq Probability with which a key will be chosen.
167      * @param numSamples Total number of samples to obtain from all selected
168      *                   splits.
169      */
170     public RandomSampler(double freq, int numSamples) {
171       this(freq, numSamples, Integer.MAX_VALUE);
172     }
173 
174     /**
175      * Create a new RandomSampler.
176      * @param freq Probability with which a key will be chosen.
177      * @param numSamples Total number of samples to obtain from all selected
178      *                   splits.
179      * @param maxSplitsSampled The maximum number of splits to examine.
180      */
181     public RandomSampler(double freq, int numSamples, int maxSplitsSampled) {
182       this.freq = freq;
183       this.numSamples = numSamples;
184       this.maxSplitsSampled = maxSplitsSampled;
185     }
186 
187     /**
188      * Randomize the split order, then take the specified number of keys from
189      * each split sampled, where each key is selected with the specified
190      * probability and possibly replaced by a subsequently selected key when
191      * the quota of keys from that split is satisfied.
192      */
193     @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
194     public K[] getSample(InputFormat<K,V> inf, Job job) 
195         throws IOException, InterruptedException {
196       List<InputSplit> splits = inf.getSplits(job);
197       ArrayList<K> samples = new ArrayList<K>(numSamples);
198       int splitsToSample = Math.min(maxSplitsSampled, splits.size());
199 
200       Random r = new Random();
201       long seed = r.nextLong();
202       r.setSeed(seed);
203       LOG.debug("seed: " + seed);
204       // shuffle splits
205       for (int i = 0; i < splits.size(); ++i) {
206         InputSplit tmp = splits.get(i);
207         int j = r.nextInt(splits.size());
208         splits.set(i, splits.get(j));
209         splits.set(j, tmp);
210       }
211       // our target rate is in terms of the maximum number of sample splits,
212       // but we accept the possibility of sampling additional splits to hit
213       // the target sample keyset
214       for (int i = 0; i < splitsToSample ||
215                      (i < splits.size() && samples.size() < numSamples); ++i) {
216         RecordReader<K,V> reader = inf.createRecordReader(splits.get(i), 
217           new TaskAttemptContext(job.getConfiguration(), 
218                                  new TaskAttemptID()));
219         while (reader.nextKeyValue()) {
220           if (r.nextDouble() <= freq) {
221             if (samples.size() < numSamples) {
222               samples.add(reader.getCurrentKey());
223             } else {
224               // When exceeding the maximum number of samples, replace a
225               // random element with this one, then adjust the frequency
226               // to reflect the possibility of existing elements being
227               // pushed out
228               int ind = r.nextInt(numSamples);
229               if (ind != numSamples) {
230                 samples.set(ind, reader.getCurrentKey());
231               }
232               freq *= (numSamples - 1) / (double) numSamples;
233             }
234           }
235         }
236         reader.close();
237       }
238       return (K[])samples.toArray();
239     }
240   }
241 
242   /**
243    * Sample from s splits at regular intervals.
244    * Useful for sorted data.
245    */
246   public static class IntervalSampler<K,V> implements Sampler<K,V> {
247     private final double freq;
248     private final int maxSplitsSampled;
249 
250     /**
251      * Create a new IntervalSampler sampling <em>all</em> splits.
252      * @param freq The frequency with which records will be emitted.
253      */
254     public IntervalSampler(double freq) {
255       this(freq, Integer.MAX_VALUE);
256     }
257 
258     /**
259      * Create a new IntervalSampler.
260      * @param freq The frequency with which records will be emitted.
261      * @param maxSplitsSampled The maximum number of splits to examine.
262      * @see #getSample
263      */
264     public IntervalSampler(double freq, int maxSplitsSampled) {
265       this.freq = freq;
266       this.maxSplitsSampled = maxSplitsSampled;
267     }
268 
269     /**
270      * For each split sampled, emit when the ratio of the number of records
271      * retained to the total record count is less than the specified
272      * frequency.
273      */
274     @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
275     public K[] getSample(InputFormat<K,V> inf, Job job) 
276         throws IOException, InterruptedException {
277       List<InputSplit> splits = inf.getSplits(job);
278       ArrayList<K> samples = new ArrayList<K>();
279       int splitsToSample = Math.min(maxSplitsSampled, splits.size());
280       int splitStep = splits.size() / splitsToSample;
281       long records = 0;
282       long kept = 0;
283       for (int i = 0; i < splitsToSample; ++i) {
284         RecordReader<K,V> reader = inf.createRecordReader(
285           splits.get(i * splitStep),
286           new TaskAttemptContext(job.getConfiguration(), 
287                                  new TaskAttemptID()));
288         while (reader.nextKeyValue()) {
289           ++records;
290           if ((double) kept / records < freq) {
291             ++kept;
292             samples.add(reader.getCurrentKey());
293           }
294         }
295         reader.close();
296       }
297       return (K[])samples.toArray();
298     }
299   }
300 
301   /**
302    * Write a partition file for the given job, using the Sampler provided.
303    * Queries the sampler for a sample keyset, sorts by the output key
304    * comparator, selects the keys for each rank, and writes to the destination
305    * returned from {@link TotalOrderPartitioner#getPartitionFile}.
306    */
307   @SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
308   public static <K,V> void writePartitionFile(Job job, Sampler<K,V> sampler) 
309       throws IOException, ClassNotFoundException, InterruptedException {
310     Configuration conf = job.getConfiguration();
311     final InputFormat inf = 
312         ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
313     int numPartitions = job.getNumReduceTasks();
314     K[] samples = sampler.getSample(inf, job);
315     LOG.info("Using " + samples.length + " samples");
316     RawComparator<K> comparator =
317       (RawComparator<K>) job.getSortComparator();
318     Arrays.sort(samples, comparator);
319     Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
320     FileSystem fs = dst.getFileSystem(conf);
321     if (fs.exists(dst)) {
322       fs.delete(dst, false);
323     }
324     SequenceFile.Writer writer = SequenceFile.createWriter(fs, 
325       conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
326     NullWritable nullValue = NullWritable.get();
327     float stepSize = samples.length / (float) numPartitions;
328     int last = -1;
329     for(int i = 1; i < numPartitions; ++i) {
330       int k = Math.round(stepSize * i);
331       while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
332         ++k;
333       }
334       writer.append(samples[k], nullValue);
335       last = k;
336     }
337     writer.close();
338   }
339 
340   /**
341    * Driver for InputSampler from the command line.
342    * Configures a JobConf instance and calls {@link #writePartitionFile}.
343    */
344   public int run(String[] args) throws Exception {
345     Job job = new Job(getConf());
346     ArrayList<String> otherArgs = new ArrayList<String>();
347     Sampler<K,V> sampler = null;
348     for(int i=0; i < args.length; ++i) {
349       try {
350         if ("-r".equals(args[i])) {
351           job.setNumReduceTasks(Integer.parseInt(args[++i]));
352         } else if ("-inFormat".equals(args[i])) {
353           job.setInputFormatClass(
354               Class.forName(args[++i]).asSubclass(InputFormat.class));
355         } else if ("-keyClass".equals(args[i])) {
356           job.setMapOutputKeyClass(
357               Class.forName(args[++i]).asSubclass(WritableComparable.class));
358         } else if ("-splitSample".equals(args[i])) {
359           int numSamples = Integer.parseInt(args[++i]);
360           int maxSplits = Integer.parseInt(args[++i]);
361           if (0 >= maxSplits) maxSplits = Integer.MAX_VALUE;
362           sampler = new SplitSampler<K,V>(numSamples, maxSplits);
363         } else if ("-splitRandom".equals(args[i])) {
364           double pcnt = Double.parseDouble(args[++i]);
365           int numSamples = Integer.parseInt(args[++i]);
366           int maxSplits = Integer.parseInt(args[++i]);
367           if (0 >= maxSplits) maxSplits = Integer.MAX_VALUE;
368           sampler = new RandomSampler<K,V>(pcnt, numSamples, maxSplits);
369         } else if ("-splitInterval".equals(args[i])) {
370           double pcnt = Double.parseDouble(args[++i]);
371           int maxSplits = Integer.parseInt(args[++i]);
372           if (0 >= maxSplits) maxSplits = Integer.MAX_VALUE;
373           sampler = new IntervalSampler<K,V>(pcnt, maxSplits);
374         } else {
375           otherArgs.add(args[i]);
376         }
377       } catch (NumberFormatException except) {
378         System.out.println("ERROR: Integer expected instead of " + args[i]);
379         return printUsage();
380       } catch (ArrayIndexOutOfBoundsException except) {
381         System.out.println("ERROR: Required parameter missing from " +
382             args[i-1]);
383         return printUsage();
384       }
385     }
386     if (job.getNumReduceTasks() <= 1) {
387       System.err.println("Sampler requires more than one reducer");
388       return printUsage();
389     }
390     if (otherArgs.size() < 2) {
391       System.out.println("ERROR: Wrong number of parameters: ");
392       return printUsage();
393     }
394     if (null == sampler) {
395       sampler = new RandomSampler<K,V>(0.1, 10000, 10);
396     }
397 
398     Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
399     TotalOrderPartitioner.setPartitionFile(getConf(), outf);
400     for (String s : otherArgs) {
401       FileInputFormat.addInputPath(job, new Path(s));
402     }
403     InputSampler.<K,V>writePartitionFile(job, sampler);
404 
405     return 0;
406   }
407 
408   public static void main(String[] args) throws Exception {
409     InputSampler<?,?> sampler = new InputSampler(new Configuration());
410     int res = ToolRunner.run(sampler, args);
411     System.exit(res);
412   }
413 }