1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.mapreduce;
20  
21  import java.io.IOException;
22  import java.lang.reflect.InvocationTargetException;
23  import java.lang.reflect.Method;
24  import java.net.URL;
25  import java.net.URLDecoder;
26  import java.util.ArrayList;
27  import java.util.Enumeration;
28  import java.util.HashMap;
29  import java.util.HashSet;
30  import java.util.List;
31  import java.util.Map;
32  import java.util.Set;
33  import java.util.zip.ZipEntry;
34  import java.util.zip.ZipFile;
35  
36  import org.apache.commons.logging.Log;
37  import org.apache.commons.logging.LogFactory;
38  import org.apache.hadoop.classification.InterfaceAudience;
39  import org.apache.hadoop.classification.InterfaceStability;
40  import org.apache.hadoop.conf.Configuration;
41  import org.apache.hadoop.fs.FileSystem;
42  import org.apache.hadoop.fs.Path;
43  import org.apache.hadoop.hbase.HBaseConfiguration;
44  import org.apache.hadoop.hbase.catalog.MetaReader;
45  import org.apache.hadoop.hbase.client.Put;
46  import org.apache.hadoop.hbase.client.Scan;
47  import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
48  import org.apache.hadoop.hbase.mapreduce.hadoopbackport.JarFinder;
49  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
50  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
51  import org.apache.hadoop.hbase.security.User;
52  import org.apache.hadoop.hbase.util.Base64;
53  import org.apache.hadoop.hbase.util.Bytes;
54  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
55  import org.apache.hadoop.io.Writable;
56  import org.apache.hadoop.io.WritableComparable;
57  import org.apache.hadoop.mapreduce.InputFormat;
58  import org.apache.hadoop.mapreduce.Job;
59  import org.apache.hadoop.util.StringUtils;
60  
61  import com.google.protobuf.InvalidProtocolBufferException;
62  
63  /**
64   * Utility for {@link TableMapper} and {@link TableReducer}
65   */
66  @SuppressWarnings("unchecked")
67  @InterfaceAudience.Public
68  @InterfaceStability.Stable
69  public class TableMapReduceUtil {
70    static Log LOG = LogFactory.getLog(TableMapReduceUtil.class);
71  
72    /**
73     * Use this before submitting a TableMap job. It will appropriately set up
74     * the job.
75     *
76     * @param table  The table name to read from.
77     * @param scan  The scan instance with the columns, time range etc.
78     * @param mapper  The mapper class to use.
79     * @param outputKeyClass  The class of the output key.
80     * @param outputValueClass  The class of the output value.
81     * @param job  The current job to adjust.  Make sure the passed job is
82     * carrying all necessary HBase configuration.
83     * @throws IOException When setting up the details fails.
84     */
85    public static void initTableMapperJob(String table, Scan scan,
86        Class<? extends TableMapper> mapper,
87        Class<?> outputKeyClass,
88        Class<?> outputValueClass, Job job)
89    throws IOException {
90      initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass,
91          job, true);
92    }
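
  // A minimal usage sketch for the variant above. The table name, column family,
  // and MyMapper class are hypothetical placeholders; MyMapper extends TableMapper.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   Job job = new Job(conf, "read-from-hbase");
  //   job.setJarByClass(MyMapper.class);
  //   Scan scan = new Scan();
  //   scan.setCaching(500);        // fetch more rows per RPC for MapReduce scans
  //   scan.setCacheBlocks(false);  // don't fill the region server block cache
  //   scan.addFamily(Bytes.toBytes("cf"));
  //   TableMapReduceUtil.initTableMapperJob("myTable", scan, MyMapper.class,
  //       ImmutableBytesWritable.class, Result.class, job);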
93  
94  
95    /**
96     * Use this before submitting a TableMap job. It will appropriately set up
97     * the job.
98     *
99     * @param table Binary representation of the table name to read from.
100    * @param scan  The scan instance with the columns, time range etc.
101    * @param mapper  The mapper class to use.
102    * @param outputKeyClass  The class of the output key.
103    * @param outputValueClass  The class of the output value.
104    * @param job  The current job to adjust.  Make sure the passed job is
105    * carrying all necessary HBase configuration.
106    * @throws IOException When setting up the details fails.
107    */
108    public static void initTableMapperJob(byte[] table, Scan scan,
109       Class<? extends TableMapper> mapper,
110       Class<?> outputKeyClass,
111       Class<?> outputValueClass, Job job)
112   throws IOException {
113       initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass,
114               job, true);
115   }
116 
117   /**
118    * Use this before submitting a TableMap job. It will appropriately set up
119    * the job.
120    *
121    * @param table  The table name to read from.
122    * @param scan  The scan instance with the columns, time range etc.
123    * @param mapper  The mapper class to use.
124    * @param outputKeyClass  The class of the output key.
125    * @param outputValueClass  The class of the output value.
126    * @param job  The current job to adjust.  Make sure the passed job is
127    * carrying all necessary HBase configuration.
128    * @param addDependencyJars upload HBase jars and jars for any of the configured
129    *           job classes via the distributed cache (tmpjars).
       * @param inputFormatClass The class of the input format
130    * @throws IOException When setting up the details fails.
131    */
132   public static void initTableMapperJob(String table, Scan scan,
133       Class<? extends TableMapper> mapper,
134       Class<?> outputKeyClass,
135       Class<?> outputValueClass, Job job,
136       boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
137   throws IOException {
138     job.setInputFormatClass(inputFormatClass);
139     if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
140     if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
141     job.setMapperClass(mapper);
142     if (Put.class.equals(outputValueClass)) {
143       job.setCombinerClass(PutCombiner.class);
144     }
145     Configuration conf = job.getConfiguration();
146     HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
147     conf.set(TableInputFormat.INPUT_TABLE, table);
148     conf.set(TableInputFormat.SCAN, convertScanToString(scan));
149     conf.setStrings("io.serializations", conf.get("io.serializations"),
150         MutationSerialization.class.getName(), ResultSerialization.class.getName(),
151         KeyValueSerialization.class.getName());
152     if (addDependencyJars) {
153       addDependencyJars(job);
154     }
155     initCredentials(job);
156   }
157 
158   /**
159    * Use this before submitting a TableMap job. It will appropriately set up
160    * the job.
161    *
162    * @param table Binary representation of the table name to read from.
163    * @param scan  The scan instance with the columns, time range etc.
164    * @param mapper  The mapper class to use.
165    * @param outputKeyClass  The class of the output key.
166    * @param outputValueClass  The class of the output value.
167    * @param job  The current job to adjust.  Make sure the passed job is
168    * carrying all necessary HBase configuration.
169    * @param addDependencyJars upload HBase jars and jars for any of the configured
170    *           job classes via the distributed cache (tmpjars).
171    * @param inputFormatClass The class of the input format
172    * @throws IOException When setting up the details fails.
173    */
174   public static void initTableMapperJob(byte[] table, Scan scan,
175       Class<? extends TableMapper> mapper,
176       Class<?> outputKeyClass,
177       Class<?> outputValueClass, Job job,
178       boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
179   throws IOException {
180       initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
181               outputValueClass, job, addDependencyJars, inputFormatClass);
182   }
183 
184   /**
185    * Use this before submitting a TableMap job. It will appropriately set up
186    * the job.
187    *
188    * @param table Binary representation of the table name to read from.
189    * @param scan  The scan instance with the columns, time range etc.
190    * @param mapper  The mapper class to use.
191    * @param outputKeyClass  The class of the output key.
192    * @param outputValueClass  The class of the output value.
193    * @param job  The current job to adjust.  Make sure the passed job is
194    * carrying all necessary HBase configuration.
195    * @param addDependencyJars upload HBase jars and jars for any of the configured
196    *           job classes via the distributed cache (tmpjars).
197    * @throws IOException When setting up the details fails.
198    */
199   public static void initTableMapperJob(byte[] table, Scan scan,
200       Class<? extends TableMapper> mapper,
201       Class<?> outputKeyClass,
202       Class<?> outputValueClass, Job job,
203       boolean addDependencyJars)
204   throws IOException {
205       initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
206               outputValueClass, job, addDependencyJars, TableInputFormat.class);
207   }
208 
209   /**
210    * Use this before submitting a TableMap job. It will appropriately set up
211    * the job.
212    *
213    * @param table The table name to read from.
214    * @param scan  The scan instance with the columns, time range etc.
215    * @param mapper  The mapper class to use.
216    * @param outputKeyClass  The class of the output key.
217    * @param outputValueClass  The class of the output value.
218    * @param job  The current job to adjust.  Make sure the passed job is
219    * carrying all necessary HBase configuration.
220    * @param addDependencyJars upload HBase jars and jars for any of the configured
221    *           job classes via the distributed cache (tmpjars).
222    * @throws IOException When setting up the details fails.
223    */
224   public static void initTableMapperJob(String table, Scan scan,
225       Class<? extends TableMapper> mapper,
226       Class<?> outputKeyClass,
227       Class<?> outputValueClass, Job job,
228       boolean addDependencyJars)
229   throws IOException {
230       initTableMapperJob(table, scan, mapper, outputKeyClass,
231               outputValueClass, job, addDependencyJars, TableInputFormat.class);
232   }
233 
234   /**
235    * Use this before submitting a Multi TableMap job. It will appropriately set
236    * up the job.
237    *
238    * @param scans The list of {@link Scan} objects to read from.
239    * @param mapper The mapper class to use.
240    * @param outputKeyClass The class of the output key.
241    * @param outputValueClass The class of the output value.
242    * @param job The current job to adjust. Make sure the passed job is carrying
243    *          all necessary HBase configuration.
244    * @throws IOException When setting up the details fails.
245    */
246   public static void initTableMapperJob(List<Scan> scans,
247       Class<? extends TableMapper> mapper,
248       Class<? extends WritableComparable> outputKeyClass,
249       Class<? extends Writable> outputValueClass, Job job) throws IOException {
250     initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
251         true);
252   }
253 
254   /**
255    * Use this before submitting a Multi TableMap job. It will appropriately set
256    * up the job.
257    *
258    * @param scans The list of {@link Scan} objects to read from.
259    * @param mapper The mapper class to use.
260    * @param outputKeyClass The class of the output key.
261    * @param outputValueClass The class of the output value.
262    * @param job The current job to adjust. Make sure the passed job is carrying
263    *          all necessary HBase configuration.
264    * @param addDependencyJars upload HBase jars and jars for any of the
265    *          configured job classes via the distributed cache (tmpjars).
266    * @throws IOException When setting up the details fails.
267    */
268   public static void initTableMapperJob(List<Scan> scans,
269       Class<? extends TableMapper> mapper,
270       Class<? extends WritableComparable> outputKeyClass,
271       Class<? extends Writable> outputValueClass, Job job,
272       boolean addDependencyJars) throws IOException {
273     job.setInputFormatClass(MultiTableInputFormat.class);
274     if (outputValueClass != null) {
275       job.setMapOutputValueClass(outputValueClass);
276     }
277     if (outputKeyClass != null) {
278       job.setMapOutputKeyClass(outputKeyClass);
279     }
280     job.setMapperClass(mapper);
281     HBaseConfiguration.addHbaseResources(job.getConfiguration());
282     List<String> scanStrings = new ArrayList<String>();
283 
284     for (Scan scan : scans) {
285       scanStrings.add(convertScanToString(scan));
286     }
287     job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
288       scanStrings.toArray(new String[scanStrings.size()]));
289 
290     if (addDependencyJars) {
291       addDependencyJars(job);
292     }
293   }
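
  // A sketch of the multi-table case. Table names are hypothetical; each Scan is
  // expected to identify its source table via the Scan.SCAN_ATTRIBUTES_TABLE_NAME
  // attribute, which MultiTableInputFormat reads when creating splits.
  //
  //   List<Scan> scans = new ArrayList<Scan>();
  //   Scan scanA = new Scan();
  //   scanA.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes("tableA"));
  //   scans.add(scanA);
  //   Scan scanB = new Scan();
  //   scanB.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes("tableB"));
  //   scans.add(scanB);
  //   TableMapReduceUtil.initTableMapperJob(scans, MyMultiTableMapper.class,
  //       ImmutableBytesWritable.class, Result.class, job);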
294 
295   public static void initCredentials(Job job) throws IOException {
296     if (User.isHBaseSecurityEnabled(job.getConfiguration())) {
297       try {
298         // init credentials for remote cluster
299         String quorumAddress = job.getConfiguration().get(TableOutputFormat.QUORUM_ADDRESS);
300         if (quorumAddress != null) {
301           Configuration peerConf = HBaseConfiguration.create(job.getConfiguration());
302           ZKUtil.applyClusterKeyToConf(peerConf, quorumAddress);
303           User.getCurrent().obtainAuthTokenForJob(peerConf, job);
304         }
305         User.getCurrent().obtainAuthTokenForJob(job.getConfiguration(), job);
306       } catch (InterruptedException ie) {
307         LOG.info("Interrupted obtaining user authentication token");
308         Thread.currentThread().interrupt(); // restore the interrupt status rather than clear it
309       }
310     }
311   }
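
  // Sketch: for a secure job that also writes to a remote cluster, set
  // TableOutputFormat.QUORUM_ADDRESS (normally done by initTableReducerJob) before
  // calling initCredentials, so tokens are obtained for both clusters. The cluster
  // key below is a hypothetical <quorum>:<client-port>:<znode-parent> value.
  //
  //   job.getConfiguration().set(TableOutputFormat.QUORUM_ADDRESS,
  //       "zk1,zk2,zk3:2181:/hbase");
  //   TableMapReduceUtil.initCredentials(job);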
312 
313   /**
314    * Writes the given scan into a Base64 encoded string.
315    *
316    * @param scan  The scan to write out.
317    * @return The scan saved in a Base64 encoded string.
318    * @throws IOException When writing the scan fails.
319    */
320   static String convertScanToString(Scan scan) throws IOException {
321     ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
322     return Base64.encodeBytes(proto.toByteArray());
323   }
324 
325   /**
326    * Converts the given Base64 string back into a Scan instance.
327    *
328    * @param base64  The scan details.
329    * @return The newly created Scan instance.
330    * @throws IOException When reading the scan instance fails.
331    */
332   static Scan convertStringToScan(String base64) throws IOException {
333     byte [] decoded = Base64.decode(base64);
334     ClientProtos.Scan scan;
335     try {
336       scan = ClientProtos.Scan.parseFrom(decoded);
337     } catch (InvalidProtocolBufferException ipbe) {
338       throw new IOException(ipbe);
339     }
340 
341     return ProtobufUtil.toScan(scan);
342   }
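
  // Round-trip sketch: the Base64 string produced by convertScanToString is what
  // initTableMapperJob stores under TableInputFormat.SCAN, and convertStringToScan
  // rebuilds the Scan from it on the task side.
  //
  //   String serialized = convertScanToString(new Scan());
  //   Scan restored = convertStringToScan(serialized);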
343 
344   /**
345    * Use this before submitting a TableReduce job. It will
346    * appropriately set up the job.
347    *
348    * @param table  The output table.
349    * @param reducer  The reducer class to use.
350    * @param job  The current job to adjust.
351    * @throws IOException When determining the region count fails.
352    */
353   public static void initTableReducerJob(String table,
354     Class<? extends TableReducer> reducer, Job job)
355   throws IOException {
356     initTableReducerJob(table, reducer, job, null);
357   }
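
  // A minimal write-job sketch. The output table and MyTableReducer are hypothetical;
  // MyTableReducer extends TableReducer and emits Put (or Delete) mutations.
  //
  //   Job job = new Job(HBaseConfiguration.create(), "write-to-hbase");
  //   job.setJarByClass(MyTableReducer.class);
  //   // ...configure the map phase, e.g. with initTableMapperJob above...
  //   TableMapReduceUtil.initTableReducerJob("outputTable", MyTableReducer.class, job);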
358 
359   /**
360    * Use this before submitting a TableReduce job. It will
361    * appropriately set up the job.
362    *
363    * @param table  The output table.
364    * @param reducer  The reducer class to use.
365    * @param job  The current job to adjust.
366    * @param partitioner  Partitioner to use. Pass <code>null</code> to use
367    * default partitioner.
368    * @throws IOException When determining the region count fails.
369    */
370   public static void initTableReducerJob(String table,
371     Class<? extends TableReducer> reducer, Job job,
372     Class partitioner) throws IOException {
373     initTableReducerJob(table, reducer, job, partitioner, null, null, null);
374   }
375 
376   /**
377    * Use this before submitting a TableReduce job. It will
378    * appropriately set up the job.
379    *
380    * @param table  The output table.
381    * @param reducer  The reducer class to use.
382    * @param job  The current job to adjust.  Make sure the passed job is
383    * carrying all necessary HBase configuration.
384    * @param partitioner  Partitioner to use. Pass <code>null</code> to use
385    * default partitioner.
386    * @param quorumAddress Distant cluster to write to; default is null for
387    * output to the cluster that is designated in <code>hbase-site.xml</code>.
388    * Set this String to the zookeeper ensemble of an alternate remote cluster
389    * when you would have the reduce write to a cluster that is other than the
390    * default; e.g. copying tables between clusters, the source would be
391    * designated by <code>hbase-site.xml</code> and this param would have the
392    * ensemble address of the remote cluster.  The format to pass is particular.
393    * Pass <code> &lt;hbase.zookeeper.quorum>:&lt;hbase.zookeeper.client.port>:&lt;zookeeper.znode.parent>
394    * </code> such as <code>server,server2,server3:2181:/hbase</code>.
395    * @param serverClass redefined hbase.regionserver.class
396    * @param serverImpl redefined hbase.regionserver.impl
397    * @throws IOException When determining the region count fails.
398    */
399   public static void initTableReducerJob(String table,
400     Class<? extends TableReducer> reducer, Job job,
401     Class partitioner, String quorumAddress, String serverClass,
402     String serverImpl) throws IOException {
403     initTableReducerJob(table, reducer, job, partitioner, quorumAddress,
404         serverClass, serverImpl, true);
405   }
406 
407   /**
408    * Use this before submitting a TableReduce job. It will
409    * appropriately set up the job.
410    *
411    * @param table  The output table.
412    * @param reducer  The reducer class to use.
413    * @param job  The current job to adjust.  Make sure the passed job is
414    * carrying all necessary HBase configuration.
415    * @param partitioner  Partitioner to use. Pass <code>null</code> to use
416    * default partitioner.
417    * @param quorumAddress Distant cluster to write to; default is null for
418    * output to the cluster that is designated in <code>hbase-site.xml</code>.
419    * Set this String to the zookeeper ensemble of an alternate remote cluster
420    * when you would have the reduce write to a cluster that is other than the
421    * default; e.g. copying tables between clusters, the source would be
422    * designated by <code>hbase-site.xml</code> and this param would have the
423    * ensemble address of the remote cluster.  The format to pass is particular.
424    * Pass <code> &lt;hbase.zookeeper.quorum>:&lt;hbase.zookeeper.client.port>:&lt;zookeeper.znode.parent>
425    * </code> such as <code>server,server2,server3:2181:/hbase</code>.
426    * @param serverClass redefined hbase.regionserver.class
427    * @param serverImpl redefined hbase.regionserver.impl
428    * @param addDependencyJars upload HBase jars and jars for any of the configured
429    *           job classes via the distributed cache (tmpjars).
430    * @throws IOException When determining the region count fails.
431    */
432   public static void initTableReducerJob(String table,
433     Class<? extends TableReducer> reducer, Job job,
434     Class partitioner, String quorumAddress, String serverClass,
435     String serverImpl, boolean addDependencyJars) throws IOException {
436 
437     Configuration conf = job.getConfiguration();
438     HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
439     job.setOutputFormatClass(TableOutputFormat.class);
440     if (reducer != null) job.setReducerClass(reducer);
441     conf.set(TableOutputFormat.OUTPUT_TABLE, table);
442     conf.setStrings("io.serializations", conf.get("io.serializations"),
443         MutationSerialization.class.getName(), ResultSerialization.class.getName());
444     // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
445     if (quorumAddress != null) {
446       // Calling this will validate the format
447       ZKUtil.transformClusterKey(quorumAddress);
448       conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
449     }
450     if (serverClass != null && serverImpl != null) {
451       conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
452       conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
453     }
454     job.setOutputKeyClass(ImmutableBytesWritable.class);
455     job.setOutputValueClass(Writable.class);
456     if (partitioner == HRegionPartitioner.class) {
457       job.setPartitionerClass(HRegionPartitioner.class);
458       int regions = MetaReader.getRegionCount(conf, table);
459       if (job.getNumReduceTasks() > regions) {
460         job.setNumReduceTasks(regions);
461       }
462     } else if (partitioner != null) {
463       job.setPartitionerClass(partitioner);
464     }
465 
466     if (addDependencyJars) {
467       addDependencyJars(job);
468     }
469 
470     initCredentials(job);
471   }
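
  // Sketch of a CopyTable-style job that writes to a different cluster. The cluster
  // key is a hypothetical <quorum>:<client-port>:<znode-parent> string in the format
  // documented above; passing HRegionPartitioner caps the reducer count at the
  // output table's region count, and a null reducer leaves the reduce class unset.
  //
  //   TableMapReduceUtil.initTableReducerJob("outputTable", null, job,
  //       HRegionPartitioner.class, "zk1,zk2,zk3:2181:/hbase", null, null);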
472 
473   /**
474    * Ensures that the given number of reduce tasks for the given job
475    * configuration does not exceed the number of regions for the given table.
476    *
477    * @param table  The table to get the region count for.
478    * @param job  The current job to adjust.
479    * @throws IOException When retrieving the table details fails.
480    */
481   public static void limitNumReduceTasks(String table, Job job)
482   throws IOException {
483     int regions = MetaReader.getRegionCount(job.getConfiguration(), table);
484     if (job.getNumReduceTasks() > regions)
485       job.setNumReduceTasks(regions);
486   }
487 
488   /**
489    * Sets the number of reduce tasks for the given job configuration to the
490    * number of regions the given table has.
491    *
492    * @param table  The table to get the region count for.
493    * @param job  The current job to adjust.
494    * @throws IOException When retrieving the table details fails.
495    */
496   public static void setNumReduceTasks(String table, Job job)
497   throws IOException {
498     job.setNumReduceTasks(MetaReader.getRegionCount(job.getConfiguration(), table));
499   }
500 
501   /**
502    * Sets the number of rows to return and cache with each scanner iteration.
503    * Higher caching values will enable faster mapreduce jobs at the expense of
504    * requiring more heap to contain the cached rows.
505    *
506    * @param job The current job to adjust.
507    * @param batchSize The number of rows to return in batch with each scanner
508    * iteration.
509    */
510   public static void setScannerCaching(Job job, int batchSize) {
511     job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
512   }
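
  // Example: fetch 500 rows per scanner round trip for this job. The value is
  // illustrative; it overrides hbase.client.scanner.caching in the job
  // configuration only.
  //
  //   TableMapReduceUtil.setScannerCaching(job, 500);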
513 
514   /**
515    * Add the HBase dependency jars as well as jars for any of the configured
516    * job classes to the job configuration, so that JobClient will ship them
517    * to the cluster and add them to the DistributedCache.
518    */
519   public static void addDependencyJars(Job job) throws IOException {
520     try {
521       addDependencyJars(job.getConfiguration(),
522           // explicitly pull a class from each module
523           org.apache.hadoop.hbase.HConstants.class,                      // hbase-common
524           org.apache.hadoop.hbase.protobuf.generated.ClientProtos.class, // hbase-protocol
525           org.apache.hadoop.hbase.client.Put.class,                      // hbase-client
526           org.apache.hadoop.hbase.CompatibilityFactory.class,            // hbase-hadoop-compat
527           // pull necessary dependencies
528           org.apache.zookeeper.ZooKeeper.class,
529           com.google.protobuf.Message.class,
530           com.google.common.collect.Lists.class,
531           org.cloudera.htrace.Trace.class,
532           // pull job classes
533           job.getMapOutputKeyClass(),
534           job.getMapOutputValueClass(),
535           job.getInputFormatClass(),
536           job.getOutputKeyClass(),
537           job.getOutputValueClass(),
538           job.getOutputFormatClass(),
539           job.getPartitionerClass(),
540           job.getCombinerClass());
541     } catch (ClassNotFoundException e) {
542       throw new IOException(e);
543     }
544   }
545 
546   /**
547    * Add the jars containing the given classes to the job's configuration
548    * such that JobClient will ship them to the cluster and add them to
549    * the DistributedCache.
550    */
551   public static void addDependencyJars(Configuration conf,
552       Class<?>... classes) throws IOException {
553 
554     FileSystem localFs = FileSystem.getLocal(conf);
555     Set<String> jars = new HashSet<String>();
556     // Add jars that are already in the tmpjars variable
557     jars.addAll(conf.getStringCollection("tmpjars"));
558 
559     // As we find them, map class files to the jar that contains them, so that we can avoid
560     // creating new jars for classes that have already been packaged.
561     Map<String, String> packagedClasses = new HashMap<String, String>();
562 
563     // Add jars containing the specified classes
564     for (Class<?> clazz : classes) {
565       if (clazz == null) continue;
566 
567       Path path = findOrCreateJar(clazz, localFs, packagedClasses);
568       if (path == null) {
569         LOG.warn("Could not find jar for class " + clazz +
570                  " in order to ship it to the cluster.");
571         continue;
572       }
573       if (!localFs.exists(path)) {
574         LOG.warn("Could not validate jar file " + path + " for class "
575                  + clazz);
576         continue;
577       }
578       jars.add(path.toString());
579     }
580     if (jars.isEmpty()) return;
581 
582     conf.set("tmpjars",
583              StringUtils.arrayToString(jars.toArray(new String[0])));
584   }
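
  // Sketch: ship an extra dependency beyond the defaults added by addDependencyJars(Job).
  // The Guava class below stands in for any class whose containing jar the tasks need.
  //
  //   TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
  //       com.google.common.base.Function.class);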
585 
586   /**
587    * If org.apache.hadoop.util.JarFinder is available (0.23+ hadoop), finds
588    * the Jar for a class or creates it if it doesn't exist. If the class is in
589    * a directory in the classpath, it creates a Jar on the fly with the
590    * contents of the directory and returns the path to that Jar. If a Jar is
591    * created, it is created in the system temporary directory. Otherwise,
592    * returns an existing jar that contains a class of the same name. Maintains
593    * a mapping from jar contents to the tmp jar created.
594    * @param my_class the class to find.
595    * @param fs the FileSystem with which to qualify the returned path.
596    * @param packagedClasses a map of class name to path.
597    * @return a jar file that contains the class.
598    * @throws IOException
599    */
600   private static Path findOrCreateJar(Class<?> my_class, FileSystem fs,
601       Map<String, String> packagedClasses)
602   throws IOException {
603     // attempt to locate an existing jar for the class.
604     String jar = findContainingJar(my_class, packagedClasses);
605     if (null == jar || jar.isEmpty()) {
606       jar = getJar(my_class);
607       updateMap(jar, packagedClasses);
608     }
609 
610     if (null == jar || jar.isEmpty()) {
611       throw new IOException("Cannot locate resource for class " + my_class.getName());
612     }
613 
614     LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar));
615     return new Path(jar).makeQualified(fs);
616   }
617 
618   /**
619    * Add entries to <code>packagedClasses</code> corresponding to class files
620    * contained in <code>jar</code>.
621    * @param jar The jar whose contents to list.
622    * @param packagedClasses map[class -> jar]
623    */
624   private static void updateMap(String jar, Map<String, String> packagedClasses) throws IOException {
625     ZipFile zip = null;
626     try {
627       zip = new ZipFile(jar);
628       for (Enumeration<? extends ZipEntry> iter = zip.entries(); iter.hasMoreElements();) {
629         ZipEntry entry = iter.nextElement();
630         if (entry.getName().endsWith("class")) {
631           packagedClasses.put(entry.getName(), jar);
632         }
633       }
634     } finally {
635       if (null != zip) zip.close();
636     }
637   }
638 
639   /**
640    * Find a jar that contains a class of the same name, if any. It will return
641    * a jar file, even if that is not the first thing on the class path that
642    * has a class with the same name. Looks first on the classpath and then in
643    * the <code>packagedClasses</code> map.
644    * @param my_class the class to find.
645    * @return a jar file that contains the class, or null.
646    * @throws IOException
647    */
648   private static String findContainingJar(Class<?> my_class, Map<String, String> packagedClasses)
649       throws IOException {
650     ClassLoader loader = my_class.getClassLoader();
651     String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
652 
653     // first search the classpath
654     for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
655       URL url = itr.nextElement();
656       if ("jar".equals(url.getProtocol())) {
657         String toReturn = url.getPath();
658         if (toReturn.startsWith("file:")) {
659           toReturn = toReturn.substring("file:".length());
660         }
661         // URLDecoder is a misnamed class, since it actually decodes
662         // x-www-form-urlencoded MIME type rather than actual
663         // URL encoding (which the file path has). Therefore it would
664         // decode +s to ' 's which is incorrect (spaces are actually
665         // either unencoded or encoded as "%20"). Replace +s first, so
666         // that they are kept sacred during the decoding process.
667         toReturn = toReturn.replaceAll("\\+", "%2B");
668         toReturn = URLDecoder.decode(toReturn, "UTF-8");
669         return toReturn.replaceAll("!.*$", "");
670       }
671     }
672 
673     // now look in any jars we've packaged using JarFinder. Returns null when
674     // no jar is found.
675     return packagedClasses.get(class_file);
676   }
677 
678   /**
679    * Invoke 'getJar' on a JarFinder implementation. Useful for some job
680    * configuration contexts (HBASE-8140) and also for testing on MRv2. First
681    * check if we have HADOOP-9426. Lacking that, fall back to the backport.
682    * @param my_class the class to find.
683    * @return a jar file that contains the class, or null.
684    */
685   private static String getJar(Class<?> my_class) {
686     String ret = null;
687     String hadoopJarFinder = "org.apache.hadoop.util.JarFinder";
688     Class<?> jarFinder = null;
689     try {
690       LOG.debug("Looking for " + hadoopJarFinder + ".");
691       jarFinder = Class.forName(hadoopJarFinder);
692       LOG.debug(hadoopJarFinder + " found.");
693       Method getJar = jarFinder.getMethod("getJar", Class.class);
694       ret = (String) getJar.invoke(null, my_class);
695     } catch (ClassNotFoundException e) {
696       LOG.debug("Using backported JarFinder.");
697       ret = JarFinder.getJar(my_class);
698     } catch (InvocationTargetException e) {
699       // the function was properly called, but threw its own exception. Unwrap it
700       // and pass it on.
701       throw new RuntimeException(e.getCause());
702     } catch (Exception e) {
703       // rethrow all other exceptions, which indicate a reflection failure
704       throw new RuntimeException("getJar invocation failed.", e);
705     }
706 
707     return ret;
708   }
709 }