1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.mapreduce;
20  
21  import java.io.File;
22  import java.io.IOException;
23  import java.lang.reflect.InvocationTargetException;
24  import java.lang.reflect.Method;
25  import java.net.URL;
26  import java.net.URLDecoder;
27  import java.util.ArrayList;
28  import java.util.Enumeration;
29  import java.util.HashMap;
30  import java.util.HashSet;
31  import java.util.List;
32  import java.util.Map;
33  import java.util.Set;
34  import java.util.zip.ZipEntry;
35  import java.util.zip.ZipFile;
36  
37  import org.apache.commons.logging.Log;
38  import org.apache.commons.logging.LogFactory;
39  import org.apache.hadoop.classification.InterfaceAudience;
40  import org.apache.hadoop.classification.InterfaceStability;
41  import org.apache.hadoop.conf.Configuration;
42  import org.apache.hadoop.fs.FileSystem;
43  import org.apache.hadoop.fs.Path;
44  import org.apache.hadoop.hbase.HBaseConfiguration;
45  import org.apache.hadoop.hbase.catalog.MetaReader;
46  import org.apache.hadoop.hbase.client.Put;
47  import org.apache.hadoop.hbase.client.Scan;
48  import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
49  import org.apache.hadoop.hbase.mapreduce.hadoopbackport.JarFinder;
50  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
51  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
52  import org.apache.hadoop.hbase.security.User;
53  import org.apache.hadoop.hbase.security.UserProvider;
54  import org.apache.hadoop.hbase.security.token.AuthenticationTokenIdentifier;
55  import org.apache.hadoop.hbase.security.token.AuthenticationTokenSelector;
56  import org.apache.hadoop.hbase.util.Base64;
57  import org.apache.hadoop.hbase.util.Bytes;
58  import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
59  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
60  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
61  import org.apache.hadoop.io.Text;
62  import org.apache.hadoop.io.Writable;
63  import org.apache.hadoop.io.WritableComparable;
64  import org.apache.hadoop.mapreduce.InputFormat;
65  import org.apache.hadoop.mapreduce.Job;
66  import org.apache.hadoop.security.token.Token;
67  import org.apache.hadoop.util.StringUtils;
68  import org.apache.zookeeper.KeeperException;
69  
70  import com.google.protobuf.InvalidProtocolBufferException;
71  
72  /**
73   * Utility for {@link TableMapper} and {@link TableReducer}
74   */
75  @SuppressWarnings({ "rawtypes", "unchecked" })
76  @InterfaceAudience.Public
77  @InterfaceStability.Stable
78  public class TableMapReduceUtil {
79    static Log LOG = LogFactory.getLog(TableMapReduceUtil.class);
80  
81    /**
82     * Use this before submitting a TableMap job. It will appropriately set up
83     * the job.
84     *
85     * @param table  The table name to read from.
86     * @param scan  The scan instance with the columns, time range etc.
87     * @param mapper  The mapper class to use.
88     * @param outputKeyClass  The class of the output key.
89     * @param outputValueClass  The class of the output value.
90     * @param job  The current job to adjust.  Make sure the passed job is
91     * carrying all necessary HBase configuration.
92     * @throws IOException When setting up the details fails.
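   *
   * <p>A minimal usage sketch; the table name, {@code MyDriver}, and {@code MyMapper}
   * below are illustrative only, and {@code MyMapper} is assumed to emit
   * {@link ImmutableBytesWritable} keys and {@code Result} values:</p>
   * <pre>
   *   Configuration conf = HBaseConfiguration.create();
   *   Job job = new Job(conf, "read-mytable");
   *   job.setJarByClass(MyDriver.class);
   *   Scan scan = new Scan();
   *   scan.setCaching(500);        // larger scanner caching for MapReduce throughput
   *   scan.setCacheBlocks(false);  // avoid churning the region server block cache
   *   TableMapReduceUtil.initTableMapperJob("mytable", scan, MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job);
   * </pre>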
93     */
94    public static void initTableMapperJob(String table, Scan scan,
95        Class<? extends TableMapper> mapper,
96        Class<?> outputKeyClass,
97        Class<?> outputValueClass, Job job)
98    throws IOException {
99      initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass,
100         job, true);
101   }
102 
103   /**
104    * Use this before submitting a TableMap job. It will appropriately set up
105    * the job.
106    *
107    * @param table Binary representation of the table name to read from.
108    * @param scan  The scan instance with the columns, time range etc.
109    * @param mapper  The mapper class to use.
110    * @param outputKeyClass  The class of the output key.
111    * @param outputValueClass  The class of the output value.
112    * @param job  The current job to adjust.  Make sure the passed job is
113    * carrying all necessary HBase configuration.
114    * @throws IOException When setting up the details fails.
115    */
116    public static void initTableMapperJob(byte[] table, Scan scan,
117       Class<? extends TableMapper> mapper,
118       Class<?> outputKeyClass,
119       Class<?> outputValueClass, Job job)
120   throws IOException {
121       initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass,
122               job, true);
123   }
124 
125   /**
126    * Use this before submitting a TableMap job. It will appropriately set up
127    * the job.
128    *
129    * @param table  The table name to read from.
130    * @param scan  The scan instance with the columns, time range etc.
131    * @param mapper  The mapper class to use.
132    * @param outputKeyClass  The class of the output key.
133    * @param outputValueClass  The class of the output value.
134    * @param job  The current job to adjust.  Make sure the passed job is
135    * carrying all necessary HBase configuration.
136    * @param addDependencyJars upload HBase jars and jars for any of the configured
137    *           job classes via the distributed cache (tmpjars).
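   * @param inputFormatClass The class of the input format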
138    * @throws IOException When setting up the details fails.
139    */
140   public static void initTableMapperJob(String table, Scan scan,
141       Class<? extends TableMapper> mapper,
142       Class<?> outputKeyClass,
143       Class<?> outputValueClass, Job job,
144       boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
145   throws IOException {
146     job.setInputFormatClass(inputFormatClass);
147     if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
148     if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
149     job.setMapperClass(mapper);
150     if (Put.class.equals(outputValueClass)) {
151       job.setCombinerClass(PutCombiner.class);
152     }
153     Configuration conf = job.getConfiguration();
154     HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
155     conf.set(TableInputFormat.INPUT_TABLE, table);
156     conf.set(TableInputFormat.SCAN, convertScanToString(scan));
157     conf.setStrings("io.serializations", conf.get("io.serializations"),
158         MutationSerialization.class.getName(), ResultSerialization.class.getName(),
159         KeyValueSerialization.class.getName());
160     if (addDependencyJars) {
161       addDependencyJars(job);
162     }
163     initCredentials(job);
164   }
165 
166   /**
167    * Use this before submitting a TableMap job. It will appropriately set up
168    * the job.
169    *
170    * @param table Binary representation of the table name to read from.
171    * @param scan  The scan instance with the columns, time range etc.
172    * @param mapper  The mapper class to use.
173    * @param outputKeyClass  The class of the output key.
174    * @param outputValueClass  The class of the output value.
175    * @param job  The current job to adjust.  Make sure the passed job is
176    * carrying all necessary HBase configuration.
177    * @param addDependencyJars upload HBase jars and jars for any of the configured
178    *           job classes via the distributed cache (tmpjars).
179    * @param inputFormatClass The class of the input format
180    * @throws IOException When setting up the details fails.
181    */
182   public static void initTableMapperJob(byte[] table, Scan scan,
183       Class<? extends TableMapper> mapper,
184       Class<?> outputKeyClass,
185       Class<?> outputValueClass, Job job,
186       boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
187   throws IOException {
188       initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
189               outputValueClass, job, addDependencyJars, inputFormatClass);
190   }
191 
192   /**
193    * Use this before submitting a TableMap job. It will appropriately set up
194    * the job.
195    *
196    * @param table Binary representation of the table name to read from.
197    * @param scan  The scan instance with the columns, time range etc.
198    * @param mapper  The mapper class to use.
199    * @param outputKeyClass  The class of the output key.
200    * @param outputValueClass  The class of the output value.
201    * @param job  The current job to adjust.  Make sure the passed job is
202    * carrying all necessary HBase configuration.
203    * @param addDependencyJars upload HBase jars and jars for any of the configured
204    *           job classes via the distributed cache (tmpjars).
205    * @throws IOException When setting up the details fails.
206    */
207   public static void initTableMapperJob(byte[] table, Scan scan,
208       Class<? extends TableMapper> mapper,
209       Class<?> outputKeyClass,
210       Class<?> outputValueClass, Job job,
211       boolean addDependencyJars)
212   throws IOException {
213       initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
214               outputValueClass, job, addDependencyJars, TableInputFormat.class);
215   }
216 
217   /**
218    * Use this before submitting a TableMap job. It will appropriately set up
219    * the job.
220    *
221    * @param table The table name to read from.
222    * @param scan  The scan instance with the columns, time range etc.
223    * @param mapper  The mapper class to use.
224    * @param outputKeyClass  The class of the output key.
225    * @param outputValueClass  The class of the output value.
226    * @param job  The current job to adjust.  Make sure the passed job is
227    * carrying all necessary HBase configuration.
228    * @param addDependencyJars upload HBase jars and jars for any of the configured
229    *           job classes via the distributed cache (tmpjars).
230    * @throws IOException When setting up the details fails.
231    */
232   public static void initTableMapperJob(String table, Scan scan,
233       Class<? extends TableMapper> mapper,
234       Class<?> outputKeyClass,
235       Class<?> outputValueClass, Job job,
236       boolean addDependencyJars)
237   throws IOException {
238       initTableMapperJob(table, scan, mapper, outputKeyClass,
239               outputValueClass, job, addDependencyJars, TableInputFormat.class);
240   }
241 
242   /**
243    * Use this before submitting a Multi TableMap job. It will appropriately set
244    * up the job.
245    *
246    * @param scans The list of {@link Scan} objects to read from.
247    * @param mapper The mapper class to use.
248    * @param outputKeyClass The class of the output key.
249    * @param outputValueClass The class of the output value.
250    * @param job The current job to adjust. Make sure the passed job is carrying
251    *          all necessary HBase configuration.
252    * @throws IOException When setting up the details fails.
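   *
   * <p>A usage sketch, given a configured {@link Job} {@code job}; the table names and
   * {@code MyMapper} are illustrative, and {@code MyMapper} is assumed to emit
   * {@code Text} keys and {@code IntWritable} values. Each {@link Scan} carries the name
   * of the table it targets via the {@code Scan.SCAN_ATTRIBUTES_TABLE_NAME} attribute
   * consumed by {@link MultiTableInputFormat}:</p>
   * <pre>
   *   List&lt;Scan&gt; scans = new ArrayList&lt;Scan&gt;();
   *   for (String tableName : new String[] { "table1", "table2" }) {
   *     Scan scan = new Scan();
   *     scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
   *     scans.add(scan);
   *   }
   *   TableMapReduceUtil.initTableMapperJob(scans, MyMapper.class,
   *       Text.class, IntWritable.class, job);
   * </pre>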
253    */
254   public static void initTableMapperJob(List<Scan> scans,
255       Class<? extends TableMapper> mapper,
256       Class<? extends WritableComparable> outputKeyClass,
257       Class<? extends Writable> outputValueClass, Job job) throws IOException {
258     initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
259         true);
260   }
261 
262   /**
263    * Use this before submitting a Multi TableMap job. It will appropriately set
264    * up the job.
265    *
266    * @param scans The list of {@link Scan} objects to read from.
267    * @param mapper The mapper class to use.
268    * @param outputKeyClass The class of the output key.
269    * @param outputValueClass The class of the output value.
270    * @param job The current job to adjust. Make sure the passed job is carrying
271    *          all necessary HBase configuration.
272    * @param addDependencyJars upload HBase jars and jars for any of the
273    *          configured job classes via the distributed cache (tmpjars).
274    * @throws IOException When setting up the details fails.
275    */
276   public static void initTableMapperJob(List<Scan> scans,
277       Class<? extends TableMapper> mapper,
278       Class<? extends WritableComparable> outputKeyClass,
279       Class<? extends Writable> outputValueClass, Job job,
280       boolean addDependencyJars) throws IOException {
281     job.setInputFormatClass(MultiTableInputFormat.class);
282     if (outputValueClass != null) {
283       job.setMapOutputValueClass(outputValueClass);
284     }
285     if (outputKeyClass != null) {
286       job.setMapOutputKeyClass(outputKeyClass);
287     }
288     job.setMapperClass(mapper);
289     HBaseConfiguration.addHbaseResources(job.getConfiguration());
290     List<String> scanStrings = new ArrayList<String>();
291 
292     for (Scan scan : scans) {
293       scanStrings.add(convertScanToString(scan));
294     }
295     job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
296       scanStrings.toArray(new String[scanStrings.size()]));
297 
298     if (addDependencyJars) {
299       addDependencyJars(job);
300     }
301   }
302 
303   public static void initCredentials(Job job) throws IOException {
304     UserProvider userProvider = UserProvider.instantiate(job.getConfiguration());
305     if (userProvider.isHadoopSecurityEnabled()) {
306       // propagate delegation related props from launcher job to MR job
307       if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
308         job.getConfiguration().set("mapreduce.job.credentials.binary",
309                                    System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
310       }
311     }
312 
313     if (userProvider.isHBaseSecurityEnabled()) {
314       try {
315         // init credentials for remote cluster
316         String quorumAddress = job.getConfiguration().get(TableOutputFormat.QUORUM_ADDRESS);
317         User user = userProvider.getCurrent();
318         if (quorumAddress != null) {
319           Configuration peerConf = HBaseConfiguration.create(job.getConfiguration());
320           ZKUtil.applyClusterKeyToConf(peerConf, quorumAddress);
321           obtainAuthTokenForJob(job, peerConf, user);
322         }
323 
324         obtainAuthTokenForJob(job, job.getConfiguration(), user);
325       } catch (InterruptedException ie) {
326         LOG.info("Interrupted obtaining user authentication token");
327         Thread.interrupted();
328       }
329     }
330   }
331 
332   private static void obtainAuthTokenForJob(Job job, Configuration conf, User user)
333       throws IOException, InterruptedException {
334     Token<AuthenticationTokenIdentifier> authToken = getAuthToken(conf, user);
335     if (authToken == null) {
336       user.obtainAuthTokenForJob(conf, job);
337     } else {
338       job.getCredentials().addToken(authToken.getService(), authToken);
339     }
340   }
341 
342   /**
343    * Get the authentication token of the user for the cluster specified in the configuration
344    * @return null if the user does not have the token, otherwise the auth token for the cluster.
345    */
346   private static Token<AuthenticationTokenIdentifier> getAuthToken(Configuration conf, User user)
347       throws IOException, InterruptedException {
348     ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "mr-init-credentials", null);
349     try {
350       String clusterId = ZKClusterId.readClusterIdZNode(zkw);
351       return new AuthenticationTokenSelector().selectToken(new Text(clusterId), user.getUGI().getTokens());
352     } catch (KeeperException e) {
353       throw new IOException(e);
354     } finally {
355       zkw.close();
356     }
357   }
358 
359   /**
360    * Writes the given scan into a Base64 encoded string.
361    *
362    * @param scan  The scan to write out.
363    * @return The scan saved in a Base64 encoded string.
364    * @throws IOException When writing the scan fails.
365    */
366   static String convertScanToString(Scan scan) throws IOException {
367     ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
368     return Base64.encodeBytes(proto.toByteArray());
369   }
370 
371   /**
372    * Converts the given Base64 string back into a Scan instance.
373    *
374    * @param base64  The scan details.
375    * @return The newly created Scan instance.
376    * @throws IOException When reading the scan instance fails.
377    */
378   static Scan convertStringToScan(String base64) throws IOException {
379     byte [] decoded = Base64.decode(base64);
380     ClientProtos.Scan scan;
381     try {
382       scan = ClientProtos.Scan.parseFrom(decoded);
383     } catch (InvalidProtocolBufferException ipbe) {
384       throw new IOException(ipbe);
385     }
386 
387     return ProtobufUtil.toScan(scan);
388   }
389 
390   /**
391    * Use this before submitting a TableReduce job. It will
392    * appropriately set up the JobConf.
393    *
394    * @param table  The output table.
395    * @param reducer  The reducer class to use.
396    * @param job  The current job to adjust.
397    * @throws IOException When determining the region count fails.
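   *
   * <p>A usage sketch; {@code MyTableReducer} is an illustrative {@link TableReducer}
   * subclass that writes {@code Put}s to the named output table:</p>
   * <pre>
   *   TableMapReduceUtil.initTableReducerJob("output_table", MyTableReducer.class, job);
   * </pre>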
398    */
399   public static void initTableReducerJob(String table,
400     Class<? extends TableReducer> reducer, Job job)
401   throws IOException {
402     initTableReducerJob(table, reducer, job, null);
403   }
404 
405   /**
406    * Use this before submitting a TableReduce job. It will
407    * appropriately set up the JobConf.
408    *
409    * @param table  The output table.
410    * @param reducer  The reducer class to use.
411    * @param job  The current job to adjust.
412    * @param partitioner  Partitioner to use. Pass <code>null</code> to use
413    * default partitioner.
414    * @throws IOException When determining the region count fails.
415    */
416   public static void initTableReducerJob(String table,
417     Class<? extends TableReducer> reducer, Job job,
418     Class partitioner) throws IOException {
419     initTableReducerJob(table, reducer, job, partitioner, null, null, null);
420   }
421 
422   /**
423    * Use this before submitting a TableReduce job. It will
424    * appropriately set up the JobConf.
425    *
426    * @param table  The output table.
427    * @param reducer  The reducer class to use.
428    * @param job  The current job to adjust.  Make sure the passed job is
429    * carrying all necessary HBase configuration.
430    * @param partitioner  Partitioner to use. Pass <code>null</code> to use
431    * default partitioner.
432    * @param quorumAddress Distant cluster to write to; default is null for
433    * output to the cluster that is designated in <code>hbase-site.xml</code>.
434    * Set this String to the zookeeper ensemble of an alternate remote cluster
435    * when you would have the reduce write to a cluster that is other than the
436    * default; e.g. copying tables between clusters, the source would be
437    * designated by <code>hbase-site.xml</code> and this param would have the
438    * ensemble address of the remote cluster.  The format to pass is particular.
439    * Pass <code> &lt;hbase.zookeeper.quorum>:&lt;hbase.zookeeper.client.port>:&lt;zookeeper.znode.parent>
440    * </code> such as <code>server,server2,server3:2181:/hbase</code>.
441    * @param serverClass redefined hbase.regionserver.class
442    * @param serverImpl redefined hbase.regionserver.impl
443    * @throws IOException When determining the region count fails.
444    */
445   public static void initTableReducerJob(String table,
446     Class<? extends TableReducer> reducer, Job job,
447     Class partitioner, String quorumAddress, String serverClass,
448     String serverImpl) throws IOException {
449     initTableReducerJob(table, reducer, job, partitioner, quorumAddress,
450         serverClass, serverImpl, true);
451   }
452 
453   /**
454    * Use this before submitting a TableReduce job. It will
455    * appropriately set up the JobConf.
456    *
457    * @param table  The output table.
458    * @param reducer  The reducer class to use.
459    * @param job  The current job to adjust.  Make sure the passed job is
460    * carrying all necessary HBase configuration.
461    * @param partitioner  Partitioner to use. Pass <code>null</code> to use
462    * default partitioner.
463    * @param quorumAddress Distant cluster to write to; default is null for
464    * output to the cluster that is designated in <code>hbase-site.xml</code>.
465    * Set this String to the zookeeper ensemble of an alternate remote cluster
466    * when you would have the reduce write to a cluster that is other than the
467    * default; e.g. copying tables between clusters, the source would be
468    * designated by <code>hbase-site.xml</code> and this param would have the
469    * ensemble address of the remote cluster.  The format to pass is particular.
470    * Pass <code> &lt;hbase.zookeeper.quorum>:&lt;hbase.zookeeper.client.port>:&lt;zookeeper.znode.parent>
471    * </code> such as <code>server,server2,server3:2181:/hbase</code>.
472    * @param serverClass redefined hbase.regionserver.class
473    * @param serverImpl redefined hbase.regionserver.impl
474    * @param addDependencyJars upload HBase jars and jars for any of the configured
475    *           job classes via the distributed cache (tmpjars).
476    * @throws IOException When determining the region count fails.
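   *
   * <p>A usage sketch for writing to a remote cluster; the ensemble address and
   * {@code MyTableReducer} are illustrative:</p>
   * <pre>
   *   TableMapReduceUtil.initTableReducerJob("peer_table", MyTableReducer.class, job,
   *       null,                        // default partitioner
   *       "zk1,zk2,zk3:2181:/hbase",   // quorum of the remote cluster
   *       null, null,                  // default region server class and impl
   *       true);                       // ship dependency jars to the cluster
   * </pre>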
477    */
478   public static void initTableReducerJob(String table,
479     Class<? extends TableReducer> reducer, Job job,
480     Class partitioner, String quorumAddress, String serverClass,
481     String serverImpl, boolean addDependencyJars) throws IOException {
482 
483     Configuration conf = job.getConfiguration();
484     HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
485     job.setOutputFormatClass(TableOutputFormat.class);
486     if (reducer != null) job.setReducerClass(reducer);
487     conf.set(TableOutputFormat.OUTPUT_TABLE, table);
488     conf.setStrings("io.serializations", conf.get("io.serializations"),
489         MutationSerialization.class.getName(), ResultSerialization.class.getName());
490     // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
491     if (quorumAddress != null) {
492       // Calling this will validate the format
493       ZKUtil.transformClusterKey(quorumAddress);
494       conf.set(TableOutputFormat.QUORUM_ADDRESS,quorumAddress);
495     }
496     if (serverClass != null && serverImpl != null) {
497       conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
498       conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
499     }
500     job.setOutputKeyClass(ImmutableBytesWritable.class);
501     job.setOutputValueClass(Writable.class);
502     if (partitioner == HRegionPartitioner.class) {
503       job.setPartitionerClass(HRegionPartitioner.class);
504       int regions = MetaReader.getRegionCount(conf, table);
505       if (job.getNumReduceTasks() > regions) {
506         job.setNumReduceTasks(regions);
507       }
508     } else if (partitioner != null) {
509       job.setPartitionerClass(partitioner);
510     }
511 
512     if (addDependencyJars) {
513       addDependencyJars(job);
514     }
515 
516     initCredentials(job);
517   }
518 
519   /**
520    * Ensures that the given number of reduce tasks for the given job
521    * configuration does not exceed the number of regions for the given table.
522    *
523    * @param table  The table to get the region count for.
524    * @param job  The current job to adjust.
525    * @throws IOException When retrieving the table details fails.
526    */
527   public static void limitNumReduceTasks(String table, Job job)
528   throws IOException {
529     int regions = MetaReader.getRegionCount(job.getConfiguration(), table);
530     if (job.getNumReduceTasks() > regions)
531       job.setNumReduceTasks(regions);
532   }
533 
534   /**
535    * Sets the number of reduce tasks for the given job configuration to the
536    * number of regions the given table has.
537    *
538    * @param table  The table to get the region count for.
539    * @param job  The current job to adjust.
540    * @throws IOException When retrieving the table details fails.
541    */
542   public static void setNumReduceTasks(String table, Job job)
543   throws IOException {
544     job.setNumReduceTasks(MetaReader.getRegionCount(job.getConfiguration(), table));
545   }
546 
547   /**
548    * Sets the number of rows to return and cache with each scanner iteration.
549    * Higher caching values will enable faster mapreduce jobs at the expense of
550    * requiring more heap to contain the cached rows.
551    *
552    * @param job The current job to adjust.
553    * @param batchSize The number of rows to return in batch with each scanner
554    * iteration.
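   *
   * <p>For example; 500 is purely an illustrative value to be tuned against row size
   * and available client heap:</p>
   * <pre>
   *   TableMapReduceUtil.setScannerCaching(job, 500);
   * </pre>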
555    */
556   public static void setScannerCaching(Job job, int batchSize) {
557     job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
558   }
559 
560   /**
561    * Add HBase and its dependencies (only) to the job configuration.
562    * <p>
563    * This is intended as a low-level API, facilitating code reuse between this
564    * class and its mapred counterpart. It is also of use to external tools that
565    * need to build a MapReduce job that interacts with HBase but want
566    * fine-grained control over the jars shipped to the cluster.
567    * </p>
568    * @param conf The Configuration object to extend with dependencies.
569    * @see org.apache.hadoop.hbase.mapred.TableMapReduceUtil
570    * @see <a href="https://issues.apache.org/jira/browse/PIG-3285">PIG-3285</a>
571    */
572   public static void addHBaseDependencyJars(Configuration conf) throws IOException {
573     addDependencyJars(conf,
574       // explicitly pull a class from each module
575       org.apache.hadoop.hbase.HConstants.class,                      // hbase-common
576       org.apache.hadoop.hbase.protobuf.generated.ClientProtos.class, // hbase-protocol
577       org.apache.hadoop.hbase.client.Put.class,                      // hbase-client
578       org.apache.hadoop.hbase.CompatibilityFactory.class,            // hbase-hadoop-compat
579       org.apache.hadoop.hbase.mapreduce.TableMapper.class,           // hbase-server
580       // pull necessary dependencies
581       org.apache.zookeeper.ZooKeeper.class,
582       org.jboss.netty.channel.ChannelFactory.class,
583       com.google.protobuf.Message.class,
584       com.google.common.collect.Lists.class,
585       org.cloudera.htrace.Trace.class);
586   }
587 
588   /**
589    * Returns a classpath string built from the content of the "tmpjars" value in {@code conf}.
590    * Also exposed to shell scripts via `bin/hbase mapredcp`.
591    */
592   public static String buildDependencyClasspath(Configuration conf) {
593     if (conf == null) {
594       throw new IllegalArgumentException("Must provide a configuration object.");
595     }
596     Set<String> paths = new HashSet<String>(conf.getStringCollection("tmpjars"));
597     if (paths.size() == 0) {
598       throw new IllegalArgumentException("Configuration contains no tmpjars.");
599     }
600     StringBuilder sb = new StringBuilder();
601     for (String s : paths) {
602       // entries can take the form 'file:/path/to/file.jar'.
603       int idx = s.indexOf(":");
604       if (idx != -1) s = s.substring(idx + 1);
605       if (sb.length() > 0) sb.append(File.pathSeparator);
606       sb.append(s);
607     }
608     return sb.toString();
609   }
610 
611   /**
612    * Add the HBase dependency jars as well as jars for any of the configured
613    * job classes to the job configuration, so that JobClient will ship them
614    * to the cluster and add them to the DistributedCache.
615    */
616   public static void addDependencyJars(Job job) throws IOException {
617     addHBaseDependencyJars(job.getConfiguration());
618     try {
619       addDependencyJars(job.getConfiguration(),
620           // when making changes here, consider also mapred.TableMapReduceUtil
621           // pull job classes
622           job.getMapOutputKeyClass(),
623           job.getMapOutputValueClass(),
624           job.getInputFormatClass(),
625           job.getOutputKeyClass(),
626           job.getOutputValueClass(),
627           job.getOutputFormatClass(),
628           job.getPartitionerClass(),
629           job.getCombinerClass());
630     } catch (ClassNotFoundException e) {
631       throw new IOException(e);
632     }
633   }
634 
635   /**
636    * Add the jars containing the given classes to the job's configuration
637    * such that JobClient will ship them to the cluster and add them to
638    * the DistributedCache.
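   *
   * <p>A usage sketch for shipping a jar beyond the HBase defaults; {@code MyCustomFilter}
   * stands in for any application class whose containing jar the tasks will need:</p>
   * <pre>
   *   TableMapReduceUtil.addDependencyJars(job.getConfiguration(), MyCustomFilter.class);
   * </pre>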
639    */
640   public static void addDependencyJars(Configuration conf,
641       Class<?>... classes) throws IOException {
642 
643     FileSystem localFs = FileSystem.getLocal(conf);
644     Set<String> jars = new HashSet<String>();
645     // Add jars that are already in the tmpjars variable
646     jars.addAll(conf.getStringCollection("tmpjars"));
647 
648     // add jars as we find them to a map of contents jar name so that we can avoid
649     // creating new jars for classes that have already been packaged.
650     Map<String, String> packagedClasses = new HashMap<String, String>();
651 
652     // Add jars containing the specified classes
653     for (Class<?> clazz : classes) {
654       if (clazz == null) continue;
655 
656       Path path = findOrCreateJar(clazz, localFs, packagedClasses);
657       if (path == null) {
658         LOG.warn("Could not find jar for class " + clazz +
659                  " in order to ship it to the cluster.");
660         continue;
661       }
662       if (!localFs.exists(path)) {
663         LOG.warn("Could not validate jar file " + path + " for class "
664                  + clazz);
665         continue;
666       }
667       jars.add(path.toString());
668     }
669     if (jars.isEmpty()) return;
670 
671     conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[jars.size()])));
672   }
673 
674   /**
675    * If org.apache.hadoop.util.JarFinder is available (0.23+ hadoop), finds
676    * the Jar for a class or creates it if it doesn't exist. If the class is in
677    * a directory in the classpath, it creates a Jar on the fly with the
678    * contents of the directory and returns the path to that Jar. If a Jar is
679    * created, it is created in the system temporary directory. Otherwise,
680    * returns an existing jar that contains a class of the same name. Maintains
681    * a mapping from jar contents to the tmp jar created.
682    * @param my_class the class to find.
683    * @param fs the FileSystem with which to qualify the returned path.
684    * @param packagedClasses a map of class name to path.
685    * @return a jar file that contains the class.
686    * @throws IOException
687    */
688   private static Path findOrCreateJar(Class<?> my_class, FileSystem fs,
689       Map<String, String> packagedClasses)
690   throws IOException {
691     // attempt to locate an existing jar for the class.
692     String jar = findContainingJar(my_class, packagedClasses);
693     if (null == jar || jar.isEmpty()) {
694       jar = getJar(my_class);
695       updateMap(jar, packagedClasses);
696     }
697 
698     if (null == jar || jar.isEmpty()) {
699       return null;
700     }
701 
702     LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar));
703     return new Path(jar).makeQualified(fs);
704   }
705 
706   /**
707    * Add entries to <code>packagedClasses</code> corresponding to class files
708    * contained in <code>jar</code>.
709    * @param jar The jar whose contents to list.
710    * @param packagedClasses map[class -> jar]
711    */
712   private static void updateMap(String jar, Map<String, String> packagedClasses) throws IOException {
713     if (null == jar || jar.isEmpty()) {
714       return;
715     }
716     ZipFile zip = null;
717     try {
718       zip = new ZipFile(jar);
719       for (Enumeration<? extends ZipEntry> iter = zip.entries(); iter.hasMoreElements();) {
720         ZipEntry entry = iter.nextElement();
721         if (entry.getName().endsWith("class")) {
722           packagedClasses.put(entry.getName(), jar);
723         }
724       }
725     } finally {
726       if (null != zip) zip.close();
727     }
728   }
729 
730   /**
731    * Find a jar that contains a class of the same name, if any. It will return
732    * a jar file, even if that is not the first thing on the class path that
733    * has a class with the same name. Looks first on the classpath and then in
734    * the <code>packagedClasses</code> map.
735    * @param my_class the class to find.
736    * @return a jar file that contains the class, or null.
737    * @throws IOException
738    */
739   private static String findContainingJar(Class<?> my_class, Map<String, String> packagedClasses)
740       throws IOException {
741     ClassLoader loader = my_class.getClassLoader();
742     String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
743 
744     // first search the classpath
745     for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
746       URL url = itr.nextElement();
747       if ("jar".equals(url.getProtocol())) {
748         String toReturn = url.getPath();
749         if (toReturn.startsWith("file:")) {
750           toReturn = toReturn.substring("file:".length());
751         }
752         // URLDecoder is a misnamed class, since it actually decodes
753         // x-www-form-urlencoded MIME type rather than actual
754         // URL encoding (which the file path has). Therefore it would
755         // decode +s to ' 's which is incorrect (spaces are actually
756         // either unencoded or encoded as "%20"). Replace +s first, so
757         // that they are kept sacred during the decoding process.
758         toReturn = toReturn.replaceAll("\\+", "%2B");
759         toReturn = URLDecoder.decode(toReturn, "UTF-8");
760         return toReturn.replaceAll("!.*$", "");
761       }
762     }
763 
764     // now look in any jars we've packaged using JarFinder. Returns null when
765     // no jar is found.
766     return packagedClasses.get(class_file);
767   }
768 
769   /**
770    * Invoke 'getJar' on a JarFinder implementation. Useful for some job
771    * configuration contexts (HBASE-8140) and also for testing on MRv2. First
772    * check if we have HADOOP-9426. Lacking that, fall back to the backport.
773    * @param my_class the class to find.
774    * @return a jar file that contains the class, or null.
775    */
776   private static String getJar(Class<?> my_class) {
777     String ret = null;
778     String hadoopJarFinder = "org.apache.hadoop.util.JarFinder";
779     Class<?> jarFinder = null;
780     try {
781       LOG.debug("Looking for " + hadoopJarFinder + ".");
782       jarFinder = Class.forName(hadoopJarFinder);
783       LOG.debug(hadoopJarFinder + " found.");
784       Method getJar = jarFinder.getMethod("getJar", Class.class);
785       ret = (String) getJar.invoke(null, my_class);
786     } catch (ClassNotFoundException e) {
787       LOG.debug("Using backported JarFinder.");
788       ret = JarFinder.getJar(my_class);
789     } catch (InvocationTargetException e) {
790      // function was properly called, but threw its own exception. Unwrap it
791       // and pass it on.
792       throw new RuntimeException(e.getCause());
793     } catch (Exception e) {
794       // toss all other exceptions, related to reflection failure
795       throw new RuntimeException("getJar invocation failed.", e);
796     }
797 
798     return ret;
799   }
800 }