
1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.mapreduce;
20  
21  import com.google.protobuf.InvalidProtocolBufferException;
22  import com.yammer.metrics.core.MetricsRegistry;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.hbase.classification.InterfaceAudience;
27  import org.apache.hadoop.hbase.classification.InterfaceStability;
28  import org.apache.hadoop.conf.Configuration;
29  import org.apache.hadoop.fs.FileSystem;
30  import org.apache.hadoop.fs.Path;
31  import org.apache.hadoop.hbase.HBaseConfiguration;
32  import org.apache.hadoop.hbase.HConstants;
33  import org.apache.hadoop.hbase.catalog.MetaReader;
34  import org.apache.hadoop.hbase.client.HConnection;
35  import org.apache.hadoop.hbase.client.HConnectionManager;
36  import org.apache.hadoop.hbase.client.Put;
37  import org.apache.hadoop.hbase.client.Scan;
38  import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
39  import org.apache.hadoop.hbase.mapreduce.hadoopbackport.JarFinder;
40  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
41  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
42  import org.apache.hadoop.hbase.security.User;
43  import org.apache.hadoop.hbase.security.UserProvider;
44  import org.apache.hadoop.hbase.security.token.TokenUtil;
45  import org.apache.hadoop.hbase.util.Base64;
46  import org.apache.hadoop.hbase.util.Bytes;
47  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
48  import org.apache.hadoop.io.Writable;
49  import org.apache.hadoop.io.WritableComparable;
50  import org.apache.hadoop.mapreduce.InputFormat;
51  import org.apache.hadoop.mapreduce.Job;
52  import org.apache.hadoop.util.StringUtils;
53  
54  import java.io.File;
55  import java.io.IOException;
56  import java.lang.reflect.InvocationTargetException;
57  import java.lang.reflect.Method;
58  import java.net.URL;
59  import java.net.URLDecoder;
60  import java.util.ArrayList;
61  import java.util.Collection;
62  import java.util.Enumeration;
63  import java.util.HashMap;
64  import java.util.HashSet;
65  import java.util.List;
66  import java.util.Map;
67  import java.util.Set;
68  import java.util.zip.ZipEntry;
69  import java.util.zip.ZipFile;
70  
71  /**
72   * Utility for {@link TableMapper} and {@link TableReducer}
73   */
74  @SuppressWarnings({ "rawtypes", "unchecked" })
75  @InterfaceAudience.Public
76  @InterfaceStability.Stable
77  public class TableMapReduceUtil {
78    static Log LOG = LogFactory.getLog(TableMapReduceUtil.class);
79  
80    /**
81     * Use this before submitting a TableMap job. It will appropriately set up
82     * the job.
83     *
84     * @param table  The table name to read from.
85     * @param scan  The scan instance with the columns, time range etc.
86     * @param mapper  The mapper class to use.
87     * @param outputKeyClass  The class of the output key.
88     * @param outputValueClass  The class of the output value.
89     * @param job  The current job to adjust.  Make sure the passed job is
90     * carrying all necessary HBase configuration.
91     * @throws IOException When setting up the details fails.
92     */
93    public static void initTableMapperJob(String table, Scan scan,
94        Class<? extends TableMapper> mapper,
95        Class<?> outputKeyClass,
96        Class<?> outputValueClass, Job job)
97    throws IOException {
98      initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass,
99          job, true);
100   }
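  /*
   * A minimal usage sketch (not part of this class): a hypothetical driver sets up a
   * map-only scan over a table named "mytable". MyMapper, the table name and the scan
   * tuning values are illustrative assumptions only.
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   Job job = new Job(conf, "scan-mytable");
   *   job.setJarByClass(MyMapper.class);
   *   Scan scan = new Scan();
   *   scan.setCaching(500);        // larger scanner caching helps MR throughput
   *   scan.setCacheBlocks(false);  // avoid polluting the region server block cache
   *   TableMapReduceUtil.initTableMapperJob("mytable", scan, MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job);
   *   job.setNumReduceTasks(0);    // map-only
   *   job.waitForCompletion(true);
   *
   * where MyMapper extends TableMapper<ImmutableBytesWritable, Result>.
   */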
101 
102   /**
103    * Use this before submitting a TableMap job. It will appropriately set up
104    * the job.
105    *
106    * @param table Binary representation of the table name to read from.
107    * @param scan  The scan instance with the columns, time range etc.
108    * @param mapper  The mapper class to use.
109    * @param outputKeyClass  The class of the output key.
110    * @param outputValueClass  The class of the output value.
111    * @param job  The current job to adjust.  Make sure the passed job is
112    * carrying all necessary HBase configuration.
113    * @throws IOException When setting up the details fails.
114    */
115    public static void initTableMapperJob(byte[] table, Scan scan,
116       Class<? extends TableMapper> mapper,
117       Class<?> outputKeyClass,
118       Class<?> outputValueClass, Job job)
119   throws IOException {
120       initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass,
121               job, true);
122   }
123 
124    /**
125     * Use this before submitting a TableMap job. It will appropriately set up
126     * the job.
127     *
128     * @param table  The table name to read from.
129     * @param scan  The scan instance with the columns, time range etc.
130     * @param mapper  The mapper class to use.
131     * @param outputKeyClass  The class of the output key.
132     * @param outputValueClass  The class of the output value.
133     * @param job  The current job to adjust.  Make sure the passed job is
134     * carrying all necessary HBase configuration.
135     * @param addDependencyJars upload HBase jars and jars for any of the configured
136     *           job classes via the distributed cache (tmpjars).
137     * @throws IOException When setting up the details fails.
138     */
139    public static void initTableMapperJob(String table, Scan scan,
140        Class<? extends TableMapper> mapper,
141        Class<?> outputKeyClass,
142        Class<?> outputValueClass, Job job,
143        boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
144    throws IOException {
145      initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass, job,
146          addDependencyJars, true, inputFormatClass);
147    }
148 
149 
150   /**
151    * Use this before submitting a TableMap job. It will appropriately set up
152    * the job.
153    *
154    * @param table  The table name to read from.
155    * @param scan  The scan instance with the columns, time range etc.
156    * @param mapper  The mapper class to use.
157    * @param outputKeyClass  The class of the output key.
158    * @param outputValueClass  The class of the output value.
159    * @param job  The current job to adjust.  Make sure the passed job is
160    * carrying all necessary HBase configuration.
161    * @param addDependencyJars upload HBase jars and jars for any of the configured
162    *           job classes via the distributed cache (tmpjars).
163    * @param initCredentials whether to initialize hbase auth credentials for the job
164    * @param inputFormatClass the input format
165    * @throws IOException When setting up the details fails.
166    */
167   public static void initTableMapperJob(String table, Scan scan,
168       Class<? extends TableMapper> mapper,
169       Class<?> outputKeyClass,
170       Class<?> outputValueClass, Job job,
171       boolean addDependencyJars, boolean initCredentials,
172       Class<? extends InputFormat> inputFormatClass)
173   throws IOException {
174     job.setInputFormatClass(inputFormatClass);
175     if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
176     if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
177     job.setMapperClass(mapper);
178     if (Put.class.equals(outputValueClass)) {
179       job.setCombinerClass(PutCombiner.class);
180     }
181     Configuration conf = job.getConfiguration();
182     HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
183     conf.set(TableInputFormat.INPUT_TABLE, table);
184     conf.set(TableInputFormat.SCAN, convertScanToString(scan));
185     conf.setStrings("io.serializations", conf.get("io.serializations"),
186         MutationSerialization.class.getName(), ResultSerialization.class.getName(),
187         KeyValueSerialization.class.getName());
188     if (addDependencyJars) {
189       addDependencyJars(job);
190     }
191     if (initCredentials) {
192       initCredentials(job);
193     }
194   }
195 
196   /**
197    * Use this before submitting a TableMap job. It will appropriately set up
198    * the job.
199    *
200    * @param table Binary representation of the table name to read from.
201    * @param scan  The scan instance with the columns, time range etc.
202    * @param mapper  The mapper class to use.
203    * @param outputKeyClass  The class of the output key.
204    * @param outputValueClass  The class of the output value.
205    * @param job  The current job to adjust.  Make sure the passed job is
206    * carrying all necessary HBase configuration.
207    * @param addDependencyJars upload HBase jars and jars for any of the configured
208    *           job classes via the distributed cache (tmpjars).
209    * @param inputFormatClass The class of the input format
210    * @throws IOException When setting up the details fails.
211    */
212   public static void initTableMapperJob(byte[] table, Scan scan,
213       Class<? extends TableMapper> mapper,
214       Class<?> outputKeyClass,
215       Class<?> outputValueClass, Job job,
216       boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
217   throws IOException {
218       initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
219               outputValueClass, job, addDependencyJars, inputFormatClass);
220   }
221 
222   /**
223    * Use this before submitting a TableMap job. It will appropriately set up
224    * the job.
225    *
226    * @param table Binary representation of the table name to read from.
227    * @param scan  The scan instance with the columns, time range etc.
228    * @param mapper  The mapper class to use.
229    * @param outputKeyClass  The class of the output key.
230    * @param outputValueClass  The class of the output value.
231    * @param job  The current job to adjust.  Make sure the passed job is
232    * carrying all necessary HBase configuration.
233    * @param addDependencyJars upload HBase jars and jars for any of the configured
234    *           job classes via the distributed cache (tmpjars).
235    * @throws IOException When setting up the details fails.
236    */
237   public static void initTableMapperJob(byte[] table, Scan scan,
238       Class<? extends TableMapper> mapper,
239       Class<?> outputKeyClass,
240       Class<?> outputValueClass, Job job,
241       boolean addDependencyJars)
242   throws IOException {
243       initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
244               outputValueClass, job, addDependencyJars, TableInputFormat.class);
245   }
246 
247   /**
248    * Use this before submitting a TableMap job. It will appropriately set up
249    * the job.
250    *
251    * @param table The table name to read from.
252    * @param scan  The scan instance with the columns, time range etc.
253    * @param mapper  The mapper class to use.
254    * @param outputKeyClass  The class of the output key.
255    * @param outputValueClass  The class of the output value.
256    * @param job  The current job to adjust.  Make sure the passed job is
257    * carrying all necessary HBase configuration.
258    * @param addDependencyJars upload HBase jars and jars for any of the configured
259    *           job classes via the distributed cache (tmpjars).
260    * @throws IOException When setting up the details fails.
261    */
262   public static void initTableMapperJob(String table, Scan scan,
263       Class<? extends TableMapper> mapper,
264       Class<?> outputKeyClass,
265       Class<?> outputValueClass, Job job,
266       boolean addDependencyJars)
267   throws IOException {
268       initTableMapperJob(table, scan, mapper, outputKeyClass,
269               outputValueClass, job, addDependencyJars, TableInputFormat.class);
270   }
271 
272   /**
273    * Enable a basic on-heap cache for these jobs. Any BlockCache implementation based on
274    * direct memory will likely cause the map tasks to OOM when opening the region. This
275    * is done here instead of in TableSnapshotRegionRecordReader in case an advanced user
276    * wants to override this behavior in their job.
277    */
278   public static void resetCacheConfig(Configuration conf) {
279     conf.setFloat(
280       HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, HConstants.HFILE_BLOCK_CACHE_SIZE_DEFAULT);
281     conf.setFloat("hbase.offheapcache.percentage", 0f);
282     conf.setFloat("hbase.bucketcache.size", 0f);
283     conf.unset("hbase.bucketcache.ioengine");
284   }
285 
286   /**
287    * Sets up the job for reading from one or more table snapshots, with one or more scans
288    * per snapshot.
289    * It bypasses HBase servers and reads directly from snapshot files.
290    *
291    * @param snapshotScans     map of snapshot name to scans on that snapshot.
292    * @param mapper            The mapper class to use.
293    * @param outputKeyClass    The class of the output key.
294    * @param outputValueClass  The class of the output value.
295    * @param job               The current job to adjust.  Make sure the passed job is
296    *                          carrying all necessary HBase configuration.
297    * @param addDependencyJars upload HBase jars and jars for any of the configured
298    *                          job classes via the distributed cache (tmpjars).
299    */
300   public static void initMultiTableSnapshotMapperJob(Map<String, Collection<Scan>> snapshotScans,
301       Class<? extends TableMapper> mapper, Class<?> outputKeyClass, Class<?> outputValueClass,
302       Job job, boolean addDependencyJars, Path tmpRestoreDir) throws IOException {
303     MultiTableSnapshotInputFormat.setInput(job.getConfiguration(), snapshotScans, tmpRestoreDir);
304 
305     job.setInputFormatClass(MultiTableSnapshotInputFormat.class);
306     if (outputValueClass != null) {
307       job.setMapOutputValueClass(outputValueClass);
308     }
309     if (outputKeyClass != null) {
310       job.setMapOutputKeyClass(outputKeyClass);
311     }
312     job.setMapperClass(mapper);
313     Configuration conf = job.getConfiguration();
314     HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
315 
316     if (addDependencyJars) {
317       addDependencyJars(job);
318       addDependencyJars(job.getConfiguration(), MetricsRegistry.class);
319     }
320 
321     resetCacheConfig(job.getConfiguration());
322   }
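  /*
   * A hedged usage sketch (snapshot names, restore path and MyMapper are assumptions):
   * read two snapshots in one job, each with its own Scan, directly from snapshot files.
   *
   *   Map<String, Collection<Scan>> snapshotScans = new HashMap<String, Collection<Scan>>();
   *   snapshotScans.put("snapshotA", Collections.singletonList(new Scan()));
   *   snapshotScans.put("snapshotB",
   *       Collections.singletonList(new Scan().addFamily(Bytes.toBytes("cf"))));
   *   TableMapReduceUtil.initMultiTableSnapshotMapperJob(snapshotScans, MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job, true,
   *       new Path("/tmp/snapshot-restore"));  // writable dir outside hbase.rootdir
   */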
323 
324   /**
325    * Sets up the job for reading from a table snapshot. It bypasses HBase servers
326    * and reads directly from snapshot files.
327    *
328    * @param snapshotName The name of the snapshot (of a table) to read from.
329    * @param scan  The scan instance with the columns, time range etc.
330    * @param mapper  The mapper class to use.
331    * @param outputKeyClass  The class of the output key.
332    * @param outputValueClass  The class of the output value.
333    * @param job  The current job to adjust.  Make sure the passed job is
334    * carrying all necessary HBase configuration.
335    * @param addDependencyJars upload HBase jars and jars for any of the configured
336    *           job classes via the distributed cache (tmpjars).
337    *
338    * @param tmpRestoreDir a temporary directory to copy the snapshot files into. The current user
339    * should have write permissions to this directory, and it should not be a subdirectory of rootdir.
340    * After the job is finished, the restore directory can be deleted.
341    * @throws IOException When setting up the details fails.
342    * @see TableSnapshotInputFormat
343    */
344   public static void initTableSnapshotMapperJob(String snapshotName, Scan scan,
345       Class<? extends TableMapper> mapper,
346       Class<?> outputKeyClass,
347       Class<?> outputValueClass, Job job,
348       boolean addDependencyJars, Path tmpRestoreDir)
349   throws IOException {
350     TableSnapshotInputFormat.setInput(job, snapshotName, tmpRestoreDir);
351     initTableMapperJob(snapshotName, scan, mapper, outputKeyClass,
352         outputValueClass, job, addDependencyJars, false, TableSnapshotInputFormat.class);
353     resetCacheConfig(job.getConfiguration());
354   }
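  /*
   * A minimal sketch (snapshot name and restore directory are illustrative): scan a single
   * snapshot without going through the region servers; credentials are intentionally not
   * initialized because the scan itself issues no RPCs to the cluster.
   *
   *   TableMapReduceUtil.initTableSnapshotMapperJob("my-snapshot", new Scan(), MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job, true,
   *       new Path("/tmp/snapshot-restore"));
   */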
355 
356   /**
357    * Use this before submitting a Multi TableMap job. It will appropriately set
358    * up the job.
359    *
360    * @param scans The list of {@link Scan} objects to read from.
361    * @param mapper The mapper class to use.
362    * @param outputKeyClass The class of the output key.
363    * @param outputValueClass The class of the output value.
364    * @param job The current job to adjust. Make sure the passed job is carrying
365    *          all necessary HBase configuration.
366    * @throws IOException When setting up the details fails.
367    */
368   public static void initTableMapperJob(List<Scan> scans,
369       Class<? extends TableMapper> mapper,
370       Class<? extends WritableComparable> outputKeyClass,
371       Class<? extends Writable> outputValueClass, Job job) throws IOException {
372     initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
373         true);
374   }
375 
376   /**
377    * Use this before submitting a Multi TableMap job. It will appropriately set
378    * up the job.
379    *
380    * @param scans The list of {@link Scan} objects to read from.
381    * @param mapper The mapper class to use.
382    * @param outputKeyClass The class of the output key.
383    * @param outputValueClass The class of the output value.
384    * @param job The current job to adjust. Make sure the passed job is carrying
385    *          all necessary HBase configuration.
386    * @param addDependencyJars upload HBase jars and jars for any of the
387    *          configured job classes via the distributed cache (tmpjars).
388    * @throws IOException When setting up the details fails.
389    */
390   public static void initTableMapperJob(List<Scan> scans,
391       Class<? extends TableMapper> mapper,
392       Class<? extends WritableComparable> outputKeyClass,
393       Class<? extends Writable> outputValueClass, Job job,
394       boolean addDependencyJars) throws IOException {
395     initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
396       addDependencyJars, true);
397   }
398 
399   /**
400    * Use this before submitting a Multi TableMap job. It will appropriately set
401    * up the job.
402    *
403    * @param scans The list of {@link Scan} objects to read from.
404    * @param mapper The mapper class to use.
405    * @param outputKeyClass The class of the output key.
406    * @param outputValueClass The class of the output value.
407    * @param job The current job to adjust. Make sure the passed job is carrying
408    *          all necessary HBase configuration.
409    * @param addDependencyJars upload HBase jars and jars for any of the
410    *          configured job classes via the distributed cache (tmpjars).
411    * @param initCredentials whether to initialize hbase auth credentials for the job
412    * @throws IOException When setting up the details fails.
413    */
414   public static void initTableMapperJob(List<Scan> scans,
415       Class<? extends TableMapper> mapper,
416       Class<? extends WritableComparable> outputKeyClass,
417       Class<? extends Writable> outputValueClass, Job job,
418       boolean addDependencyJars,
419       boolean initCredentials) throws IOException {
420     job.setInputFormatClass(MultiTableInputFormat.class);
421     if (outputValueClass != null) {
422       job.setMapOutputValueClass(outputValueClass);
423     }
424     if (outputKeyClass != null) {
425       job.setMapOutputKeyClass(outputKeyClass);
426     }
427     job.setMapperClass(mapper);
428     Configuration conf = job.getConfiguration();
429     HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
430     List<String> scanStrings = new ArrayList<String>();
431 
432     for (Scan scan : scans) {
433       scanStrings.add(convertScanToString(scan));
434     }
435     job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
436       scanStrings.toArray(new String[scanStrings.size()]));
437 
438     if (addDependencyJars) {
439       addDependencyJars(job);
440     }
441 
442     if (initCredentials) {
443       initCredentials(job);
444     }
445   }
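  /*
   * A hedged sketch for the multi-table case (table names are illustrative): each Scan is
   * expected to carry the name of the table it targets, which MultiTableInputFormat reads
   * from the Scan.SCAN_ATTRIBUTES_TABLE_NAME attribute.
   *
   *   List<Scan> scans = new ArrayList<Scan>();
   *   Scan scan1 = new Scan();
   *   scan1.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes("table1"));
   *   scans.add(scan1);
   *   Scan scan2 = new Scan();
   *   scan2.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes("table2"));
   *   scans.add(scan2);
   *   TableMapReduceUtil.initTableMapperJob(scans, MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job);
   */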
446 
447   public static void initCredentials(Job job) throws IOException {
448     UserProvider userProvider = UserProvider.instantiate(job.getConfiguration());
449     if (userProvider.isHadoopSecurityEnabled()) {
450       // propagate delegation related props from launcher job to MR job
451       if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
452         job.getConfiguration().set("mapreduce.job.credentials.binary",
453                                    System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
454       }
455     }
456 
457     if (userProvider.isHBaseSecurityEnabled()) {
458       try {
459         // init credentials for remote cluster
460         String quorumAddress = job.getConfiguration().get(TableOutputFormat.QUORUM_ADDRESS);
461         User user = userProvider.getCurrent();
462         if (quorumAddress != null) {
463           Configuration peerConf = HBaseConfiguration.create(job.getConfiguration());
464           ZKUtil.applyClusterKeyToConf(peerConf, quorumAddress);
465           HConnection peerConn = HConnectionManager.createConnection(peerConf);
466           try {
467             TokenUtil.addTokenForJob(peerConn, user, job);
468           } finally {
469             peerConn.close();
470           }
471         }
472 
473         HConnection conn = HConnectionManager.createConnection(job.getConfiguration());
474         try {
475           TokenUtil.addTokenForJob(conn, user, job);
476         } finally {
477           conn.close();
478         }
479       } catch (InterruptedException ie) {
480         LOG.info("Interrupted obtaining user authentication token");
481         Thread.currentThread().interrupt();
482       }
483     }
484   }
485 
486   /**
487    * Obtain an authentication token, for the specified cluster, on behalf of the current user
488    * and add it to the credentials for the given map reduce job.
489    *
490    * The quorumAddress is the key to the ZK ensemble, which contains:
491    * hbase.zookeeper.quorum, hbase.zookeeper.client.port and zookeeper.znode.parent
492    *
493    * @param job The job that requires the permission.
494    * @param quorumAddress string that contains the three required configurations
495    * @throws IOException When the authentication token cannot be obtained.
496    */
497   public static void initCredentialsForCluster(Job job, String quorumAddress)
498       throws IOException {
499     UserProvider userProvider = UserProvider.instantiate(job.getConfiguration());
500     if (userProvider.isHBaseSecurityEnabled()) {
501       try {
502         Configuration peerConf = HBaseConfiguration.create(job.getConfiguration());
503         ZKUtil.applyClusterKeyToConf(peerConf, quorumAddress);
504         HConnection peerConn = HConnectionManager.createConnection(peerConf);
505         try {
506           TokenUtil.addTokenForJob(peerConn, userProvider.getCurrent(), job);
507         } finally {
508           peerConn.close();
509         }
510       } catch (InterruptedException e) {
511         LOG.info("Interrupted obtaining user authentication token");
512         Thread.interrupted();
513       }
514     }
515   }
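  /*
   * A small sketch of the expected cluster key format (host names are examples only):
   * quorum hosts, client port and znode parent separated by colons.
   *
   *   TableMapReduceUtil.initCredentialsForCluster(job,
   *       "zk1.example.com,zk2.example.com,zk3.example.com:2181:/hbase");
   */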
516 
517   /**
518    * Writes the given scan into a Base64 encoded string.
519    *
520    * @param scan  The scan to write out.
521    * @return The scan saved in a Base64 encoded string.
522    * @throws IOException When writing the scan fails.
523    */
524   static String convertScanToString(Scan scan) throws IOException {
525     ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
526     return Base64.encodeBytes(proto.toByteArray());
527   }
528 
529   /**
530    * Converts the given Base64 string back into a Scan instance.
531    *
532    * @param base64  The scan details.
533    * @return The newly created Scan instance.
534    * @throws IOException When reading the scan instance fails.
535    */
536   static Scan convertStringToScan(String base64) throws IOException {
537     byte [] decoded = Base64.decode(base64);
538     ClientProtos.Scan scan;
539     try {
540       scan = ClientProtos.Scan.parseFrom(decoded);
541     } catch (InvalidProtocolBufferException ipbe) {
542       throw new IOException(ipbe);
543     }
544 
545     return ProtobufUtil.toScan(scan);
546   }
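  /*
   * Round-trip sketch (package-private helpers, shown for orientation only): the value that
   * initTableMapperJob stores under TableInputFormat.SCAN is the Base64 string produced by
   * convertScanToString, and it can be recovered later with convertStringToScan.
   *
   *   String serialized = convertScanToString(new Scan());
   *   Scan restored = convertStringToScan(serialized);
   */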
547 
548   /**
549    * Use this before submitting a TableReduce job. It will
550    * appropriately set up the job.
551    *
552    * @param table  The output table.
553    * @param reducer  The reducer class to use.
554    * @param job  The current job to adjust.
555    * @throws IOException When determining the region count fails.
556    */
557   public static void initTableReducerJob(String table,
558     Class<? extends TableReducer> reducer, Job job)
559   throws IOException {
560     initTableReducerJob(table, reducer, job, null);
561   }
562 
563   /**
564    * Use this before submitting a TableReduce job. It will
565    * appropriately set up the job.
566    *
567    * @param table  The output table.
568    * @param reducer  The reducer class to use.
569    * @param job  The current job to adjust.
570    * @param partitioner  Partitioner to use. Pass <code>null</code> to use
571    * default partitioner.
572    * @throws IOException When determining the region count fails.
573    */
574   public static void initTableReducerJob(String table,
575     Class<? extends TableReducer> reducer, Job job,
576     Class partitioner) throws IOException {
577     initTableReducerJob(table, reducer, job, partitioner, null, null, null);
578   }
579 
580   /**
581    * Use this before submitting a TableReduce job. It will
582    * appropriately set up the job.
583    *
584    * @param table  The output table.
585    * @param reducer  The reducer class to use.
586    * @param job  The current job to adjust.  Make sure the passed job is
587    * carrying all necessary HBase configuration.
588    * @param partitioner  Partitioner to use. Pass <code>null</code> to use
589    * default partitioner.
590    * @param quorumAddress Distant cluster to write to; default is null for
591    * output to the cluster that is designated in <code>hbase-site.xml</code>.
592    * Set this String to the zookeeper ensemble of an alternate remote cluster
593    * when you would have the reduce write to a cluster other than the
594    * default; e.g. copying tables between clusters, the source would be
595    * designated by <code>hbase-site.xml</code> and this param would have the
596    * ensemble address of the remote cluster.  The format to pass is particular.
597    * Pass <code> &lt;hbase.zookeeper.quorum>:&lt;hbase.zookeeper.client.port>:&lt;zookeeper.znode.parent>
598    * </code> such as <code>server,server2,server3:2181:/hbase</code>.
599    * @param serverClass redefined hbase.regionserver.class
600    * @param serverImpl redefined hbase.regionserver.impl
601    * @throws IOException When determining the region count fails.
602    */
603   public static void initTableReducerJob(String table,
604     Class<? extends TableReducer> reducer, Job job,
605     Class partitioner, String quorumAddress, String serverClass,
606     String serverImpl) throws IOException {
607     initTableReducerJob(table, reducer, job, partitioner, quorumAddress,
608         serverClass, serverImpl, true);
609   }
610 
611   /**
612    * Use this before submitting a TableReduce job. It will
613    * appropriately set up the job.
614    *
615    * @param table  The output table.
616    * @param reducer  The reducer class to use.
617    * @param job  The current job to adjust.  Make sure the passed job is
618    * carrying all necessary HBase configuration.
619    * @param partitioner  Partitioner to use. Pass <code>null</code> to use
620    * default partitioner.
621    * @param quorumAddress Distant cluster to write to; default is null for
622    * output to the cluster that is designated in <code>hbase-site.xml</code>.
623    * Set this String to the zookeeper ensemble of an alternate remote cluster
624    * when you would have the reduce write to a cluster other than the
625    * default; e.g. copying tables between clusters, the source would be
626    * designated by <code>hbase-site.xml</code> and this param would have the
627    * ensemble address of the remote cluster.  The format to pass is particular.
628    * Pass <code> &lt;hbase.zookeeper.quorum>:&lt;hbase.zookeeper.client.port>:&lt;zookeeper.znode.parent>
629    * </code> such as <code>server,server2,server3:2181:/hbase</code>.
630    * @param serverClass redefined hbase.regionserver.class
631    * @param serverImpl redefined hbase.regionserver.impl
632    * @param addDependencyJars upload HBase jars and jars for any of the configured
633    *           job classes via the distributed cache (tmpjars).
634    * @throws IOException When determining the region count fails.
635    */
636   public static void initTableReducerJob(String table,
637     Class<? extends TableReducer> reducer, Job job,
638     Class partitioner, String quorumAddress, String serverClass,
639     String serverImpl, boolean addDependencyJars) throws IOException {
640 
641     Configuration conf = job.getConfiguration();
642     HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
643     job.setOutputFormatClass(TableOutputFormat.class);
644     if (reducer != null) job.setReducerClass(reducer);
645     conf.set(TableOutputFormat.OUTPUT_TABLE, table);
646     conf.setStrings("io.serializations", conf.get("io.serializations"),
647         MutationSerialization.class.getName(), ResultSerialization.class.getName());
648     // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
649     if (quorumAddress != null) {
650       // Calling this will validate the format
651       ZKUtil.transformClusterKey(quorumAddress);
652       conf.set(TableOutputFormat.QUORUM_ADDRESS,quorumAddress);
653     }
654     if (serverClass != null && serverImpl != null) {
655       conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
656       conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
657     }
658     job.setOutputKeyClass(ImmutableBytesWritable.class);
659     job.setOutputValueClass(Writable.class);
660     if (partitioner == HRegionPartitioner.class) {
661       job.setPartitionerClass(HRegionPartitioner.class);
662       int regions = MetaReader.getRegionCount(conf, table);
663       if (job.getNumReduceTasks() > regions) {
664         job.setNumReduceTasks(regions);
665       }
666     } else if (partitioner != null) {
667       job.setPartitionerClass(partitioner);
668     }
669 
670     if (addDependencyJars) {
671       addDependencyJars(job);
672     }
673 
674     initCredentials(job);
675   }
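  /*
   * A hedged sketch (cluster key, table and reducer names are illustrative): write reduce
   * output to a table on a remote cluster and let HRegionPartitioner align reducers with
   * the target table's regions.
   *
   *   TableMapReduceUtil.initTableReducerJob("target_table", MyReducer.class, job,
   *       HRegionPartitioner.class,
   *       "zkA,zkB,zkC:2181:/hbase",  // remote ensemble; pass null to use hbase-site.xml
   *       null, null);
   *
   * where MyReducer extends TableReducer and emits Put or Delete values.
   */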
676 
677   /**
678    * Ensures that the given number of reduce tasks for the given job
679    * configuration does not exceed the number of regions for the given table.
680    *
681    * @param table  The table to get the region count for.
682    * @param job  The current job to adjust.
683    * @throws IOException When retrieving the table details fails.
684    */
685   public static void limitNumReduceTasks(String table, Job job)
686   throws IOException {
687     int regions = MetaReader.getRegionCount(job.getConfiguration(), table);
688     if (job.getNumReduceTasks() > regions)
689       job.setNumReduceTasks(regions);
690   }
691 
692   /**
693    * Sets the number of reduce tasks for the given job configuration to the
694    * number of regions the given table has.
695    *
696    * @param table  The table to get the region count for.
697    * @param job  The current job to adjust.
698    * @throws IOException When retrieving the table details fails.
699    */
700   public static void setNumReduceTasks(String table, Job job)
701   throws IOException {
702     job.setNumReduceTasks(MetaReader.getRegionCount(job.getConfiguration(), table));
703   }
704 
705   /**
706    * Sets the number of rows to return and cache with each scanner iteration.
707    * Higher caching values will enable faster mapreduce jobs at the expense of
708    * requiring more heap to contain the cached rows.
709    *
710    * @param job The current job to adjust.
711    * @param batchSize The number of rows to return in batch with each scanner
712    * iteration.
713    */
714   public static void setScannerCaching(Job job, int batchSize) {
715     job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
716   }
717 
718   /**
719    * Add HBase and its dependencies (only) to the job configuration.
720    * <p>
721    * This is intended as a low-level API, facilitating code reuse between this
722    * class and its mapred counterpart. It is also of use to external tools that
723    * need to build a MapReduce job that interacts with HBase but want
724    * fine-grained control over the jars shipped to the cluster.
725    * </p>
726    * @param conf The Configuration object to extend with dependencies.
727    * @see org.apache.hadoop.hbase.mapred.TableMapReduceUtil
728    * @see <a href="https://issues.apache.org/jira/browse/PIG-3285">PIG-3285</a>
729    */
730   public static void addHBaseDependencyJars(Configuration conf) throws IOException {
731     addDependencyJars(conf,
732       // explicitly pull a class from each module
733       org.apache.hadoop.hbase.HConstants.class,                      // hbase-common
734       org.apache.hadoop.hbase.protobuf.generated.ClientProtos.class, // hbase-protocol
735       org.apache.hadoop.hbase.client.Put.class,                      // hbase-client
736       org.apache.hadoop.hbase.CompatibilityFactory.class,            // hbase-hadoop-compat
737       org.apache.hadoop.hbase.mapreduce.TableMapper.class,           // hbase-server
738       // pull necessary dependencies
739       org.apache.zookeeper.ZooKeeper.class,
740       org.jboss.netty.channel.ChannelFactory.class,
741       com.google.protobuf.Message.class,
742       com.google.common.collect.Lists.class,
743       org.cloudera.htrace.Trace.class,
744       org.cliffc.high_scale_lib.Counter.class,
745       com.yammer.metrics.core.MetricsRegistry.class); // needed for mapred over snapshots
746   }
747 
748   /**
749    * Returns a classpath string built from the content of the "tmpjars" value in {@code conf}.
750    * Also exposed to shell scripts via `bin/hbase mapredcp`.
751    */
752   public static String buildDependencyClasspath(Configuration conf) {
753     if (conf == null) {
754       throw new IllegalArgumentException("Must provide a configuration object.");
755     }
756     Set<String> paths = new HashSet<String>(conf.getStringCollection("tmpjars"));
757     if (paths.size() == 0) {
758       throw new IllegalArgumentException("Configuration contains no tmpjars.");
759     }
760     StringBuilder sb = new StringBuilder();
761     for (String s : paths) {
762       // entries can take the form 'file:/path/to/file.jar'.
763       int idx = s.indexOf(":");
764       if (idx != -1) s = s.substring(idx + 1);
765       if (sb.length() > 0) sb.append(File.pathSeparator);
766       sb.append(s);
767     }
768     return sb.toString();
769   }
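  /*
   * Usage sketch: once addHBaseDependencyJars(conf) has populated "tmpjars", the same set of
   * jars can be flattened into a local classpath string; this is what `bin/hbase mapredcp`
   * prints for shell scripts.
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   TableMapReduceUtil.addHBaseDependencyJars(conf);
   *   String cp = TableMapReduceUtil.buildDependencyClasspath(conf);
   *   // e.g. export HADOOP_CLASSPATH="$cp" before launching a plain `hadoop jar` job
   */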
770 
771   /**
772    * Add the HBase dependency jars as well as jars for any of the configured
773    * job classes to the job configuration, so that JobClient will ship them
774    * to the cluster and add them to the DistributedCache.
775    */
776   public static void addDependencyJars(Job job) throws IOException {
777     addHBaseDependencyJars(job.getConfiguration());
778     try {
779       addDependencyJars(job.getConfiguration(),
780           // when making changes here, consider also mapred.TableMapReduceUtil
781           // pull job classes
782           job.getMapOutputKeyClass(),
783           job.getMapOutputValueClass(),
784           job.getInputFormatClass(),
785           job.getOutputKeyClass(),
786           job.getOutputValueClass(),
787           job.getOutputFormatClass(),
788           job.getPartitionerClass(),
789           job.getCombinerClass());
790     } catch (ClassNotFoundException e) {
791       throw new IOException(e);
792     }
793   }
794 
795   /**
796    * Add the jars containing the given classes to the job's configuration
797    * such that JobClient will ship them to the cluster and add them to
798    * the DistributedCache.
799    */
800   public static void addDependencyJars(Configuration conf,
801       Class<?>... classes) throws IOException {
802 
803     FileSystem localFs = FileSystem.getLocal(conf);
804     Set<String> jars = new HashSet<String>();
805     // Add jars that are already in the tmpjars variable
806     jars.addAll(conf.getStringCollection("tmpjars"));
807 
808     // add jars as we find them to a map of contents to jar name so that we can avoid
809     // creating new jars for classes that have already been packaged.
810     Map<String, String> packagedClasses = new HashMap<String, String>();
811 
812     // Add jars containing the specified classes
813     for (Class<?> clazz : classes) {
814       if (clazz == null) continue;
815 
816       Path path = findOrCreateJar(clazz, localFs, packagedClasses);
817       if (path == null) {
818         LOG.warn("Could not find jar for class " + clazz +
819                  " in order to ship it to the cluster.");
820         continue;
821       }
822       if (!localFs.exists(path)) {
823         LOG.warn("Could not validate jar file " + path + " for class "
824                  + clazz);
825         continue;
826       }
827       jars.add(path.toString());
828     }
829     if (jars.isEmpty()) return;
830 
831     conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[jars.size()])));
832   }
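  /*
   * A minimal sketch (the extra classes are arbitrary examples): ship additional jars that
   * the job needs but that addDependencyJars(Job) cannot discover from the job's own
   * key/value/format classes, e.g. a third-party library and a hypothetical user class.
   *
   *   TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
   *       com.google.common.base.Function.class,  // any class from the extra jar
   *       MyCustomFilter.class);                  // hypothetical user-defined class
   */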
833 
834   /**
835    * If org.apache.hadoop.util.JarFinder is available (0.23+ hadoop), finds
836    * the Jar for a class or creates it if it doesn't exist. If the class is in
837    * a directory in the classpath, it creates a Jar on the fly with the
838    * contents of the directory and returns the path to that Jar. If a Jar is
839    * created, it is created in the system temporary directory. Otherwise,
840    * returns an existing jar that contains a class of the same name. Maintains
841    * a mapping from jar contents to the tmp jar created.
842    * @param my_class the class to find.
843    * @param fs the FileSystem with which to qualify the returned path.
844    * @param packagedClasses a map of class name to path.
845    * @return a jar file that contains the class.
846    * @throws IOException
847    */
848   private static Path findOrCreateJar(Class<?> my_class, FileSystem fs,
849       Map<String, String> packagedClasses)
850   throws IOException {
851     // attempt to locate an existing jar for the class.
852     String jar = findContainingJar(my_class, packagedClasses);
853     if (null == jar || jar.isEmpty()) {
854       jar = getJar(my_class);
855       updateMap(jar, packagedClasses);
856     }
857 
858     if (null == jar || jar.isEmpty()) {
859       return null;
860     }
861 
862     LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar));
863     return new Path(jar).makeQualified(fs);
864   }
865 
866   /**
867    * Add entries to <code>packagedClasses</code> corresponding to class files
868    * contained in <code>jar</code>.
869    * @param jar The jar whose contents to list.
870    * @param packagedClasses map[class -> jar]
871    */
872   private static void updateMap(String jar, Map<String, String> packagedClasses) throws IOException {
873     if (null == jar || jar.isEmpty()) {
874       return;
875     }
876     ZipFile zip = null;
877     try {
878       zip = new ZipFile(jar);
879       for (Enumeration<? extends ZipEntry> iter = zip.entries(); iter.hasMoreElements();) {
880         ZipEntry entry = iter.nextElement();
881         if (entry.getName().endsWith("class")) {
882           packagedClasses.put(entry.getName(), jar);
883         }
884       }
885     } finally {
886       if (null != zip) zip.close();
887     }
888   }
889 
890   /**
891    * Find a jar that contains a class of the same name, if any. It will return
892    * a jar file, even if that is not the first thing on the class path that
893    * has a class with the same name. Looks first on the classpath and then in
894    * the <code>packagedClasses</code> map.
895    * @param my_class the class to find.
896    * @return a jar file that contains the class, or null.
897    * @throws IOException
898    */
899   private static String findContainingJar(Class<?> my_class, Map<String, String> packagedClasses)
900       throws IOException {
901     ClassLoader loader = my_class.getClassLoader();
902 
903     String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
904 
905     if (loader != null) {
906       // first search the classpath
907       for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
908         URL url = itr.nextElement();
909         if ("jar".equals(url.getProtocol())) {
910           String toReturn = url.getPath();
911           if (toReturn.startsWith("file:")) {
912             toReturn = toReturn.substring("file:".length());
913           }
914           // URLDecoder is a misnamed class, since it actually decodes
915           // x-www-form-urlencoded MIME type rather than actual
916           // URL encoding (which the file path has). Therefore it would
917           // decode +s to ' 's which is incorrect (spaces are actually
918           // either unencoded or encoded as "%20"). Replace +s first, so
919           // that they are kept sacred during the decoding process.
920           toReturn = toReturn.replaceAll("\\+", "%2B");
921           toReturn = URLDecoder.decode(toReturn, "UTF-8");
922           return toReturn.replaceAll("!.*$", "");
923         }
924       }
925     }
926 
927     // now look in any jars we've packaged using JarFinder. Returns null when
928     // no jar is found.
929     return packagedClasses.get(class_file);
930   }
931 
932   /**
933    * Invoke 'getJar' on a JarFinder implementation. Useful for some job
934    * configuration contexts (HBASE-8140) and also for testing on MRv2. First
935    * check if we have HADOOP-9426. Lacking that, fall back to the backport.
936    * @param my_class the class to find.
937    * @return a jar file that contains the class, or null.
938    */
939   private static String getJar(Class<?> my_class) {
940     String ret = null;
941     String hadoopJarFinder = "org.apache.hadoop.util.JarFinder";
942     Class<?> jarFinder = null;
943     try {
944       LOG.debug("Looking for " + hadoopJarFinder + ".");
945       jarFinder = Class.forName(hadoopJarFinder);
946       LOG.debug(hadoopJarFinder + " found.");
947       Method getJar = jarFinder.getMethod("getJar", Class.class);
948       ret = (String) getJar.invoke(null, my_class);
949     } catch (ClassNotFoundException e) {
950       LOG.debug("Using backported JarFinder.");
951       ret = JarFinder.getJar(my_class);
952     } catch (InvocationTargetException e) {
953       // function was properly called, but threw its own exception. Unwrap it
954       // and pass it on.
955       throw new RuntimeException(e.getCause());
956     } catch (Exception e) {
957       // toss all other exceptions related to reflection failure
958       throw new RuntimeException("getJar invocation failed.", e);
959     }
960 
961     return ret;
962   }
963 }